Fix Neural Solution SQL/CMD injection (#1627)

Kaihui-intel · pre-commit-ci[bot] · web-flow · commit 14b7b0ab8ccb · 2024-02-27T14:25:23.000+08:00
Signed-off-by: Kaihui-intel &lt;kaihui.tang@intel.com&gt;
Co-authored-by: pre-commit-ci[bot] &lt;66853113+pre-commit-ci[bot]@users.noreply.github.com&gt;
diff --git a/neural_solution/backend/scheduler.py b/neural_solution/backend/scheduler.py
@@ -154,7 +154,7 @@ def prepare_task(self, task: Task):
         if not task.optimized:
             # Generate quantization code with Neural Coder API
             neural_coder_cmd = ["python -m neural_coder --enable --approach"]
-            # for users to define approach: "static, ""static_ipex", "dynamic", "auto"
+            # for users to define approach: "static", "static_ipex", "dynamic", "auto"
             approach = task.approach
             neural_coder_cmd.append(approach)
             if is_remote_url(task.script_url):
diff --git a/neural_solution/examples/custom_models_optimized/tf_example1/README.md b/neural_solution/examples/custom_models_optimized/tf_example1/README.md
@@ -91,7 +91,7 @@ optional arguments:
     "script_url": "tf_example1",
     "optimized": "True",
     "arguments": [
-        "--dataset_location=dataset --model_path=model"
+        "--dataset_location=dataset", "--model_path=model"
     ],
     "approach": "static",
     "requirements": [
@@ -106,7 +106,7 @@ When using distributed quantization, the `workers` needs to be set to greater th
     "script_url": "tf_example1",
     "optimized": "True",
     "arguments": [
-        "--dataset_location=dataset --model_path=model"
+        "--dataset_location=dataset", "--model_path=model"
     ],
     "approach": "static",
     "requirements": [
diff --git a/neural_solution/examples/custom_models_optimized/tf_example1/task_request.json b/neural_solution/examples/custom_models_optimized/tf_example1/task_request.json
@@ -2,7 +2,7 @@
     "script_url": "custom_models_optimized/tf_example1",
     "optimized": "True",
     "arguments": [
-        "--dataset_location=dataset --model_path=model"
+        "--dataset_location=dataset", "--model_path=model"
     ],
     "approach": "static",
     "requirements": ["tensorflow"
diff --git a/neural_solution/examples/custom_models_optimized/tf_example1/task_request_distributed.json b/neural_solution/examples/custom_models_optimized/tf_example1/task_request_distributed.json
@@ -2,7 +2,7 @@
     "script_url": "custom_models_optimized/tf_example1",
     "optimized": "True",
     "arguments": [
-        "--dataset_location=dataset --model_path=model"
+        "--dataset_location=dataset", "--model_path=model"
     ],
     "approach": "static",
     "requirements": ["tensorflow"
diff --git a/neural_solution/examples/hf_models/README.md b/neural_solution/examples/hf_models/README.md
@@ -72,7 +72,7 @@ optional arguments:
     "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py",
     "optimized": "False",
     "arguments": [
-        "--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result"
+        "--model_name_or_path=bert-base-cased", "--task_name=mrpc", "--do_eval", "--output_dir=result"
     ],
     "approach": "static",
     "requirements": [],
diff --git a/neural_solution/examples/hf_models/task_request.json b/neural_solution/examples/hf_models/task_request.json
@@ -2,7 +2,7 @@
     "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py",
     "optimized": "False",
     "arguments": [
-        "--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result"
+        "--model_name_or_path=bert-base-cased", "--task_name=mrpc", "--do_eval", "--output_dir=result"
     ],
     "approach": "static",
     "requirements": ["datasets", "transformers=4.21.0", "torch"],
diff --git a/neural_solution/examples/hf_models_grpc/README.md b/neural_solution/examples/hf_models_grpc/README.md
@@ -68,7 +68,7 @@ optional arguments:
     "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py",
     "optimized": "False",
     "arguments": [
-        "--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result"
+        "--model_name_or_path=bert-base-cased", "--task_name=mrpc", "--do_eval", "--output_dir=result"
     ],
     "approach": "static",
     "requirements": [],
diff --git a/neural_solution/examples/hf_models_grpc/task_request.json b/neural_solution/examples/hf_models_grpc/task_request.json
@@ -2,7 +2,7 @@
     "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py",
     "optimized": "False",
     "arguments": [
-        "--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result"
+        "--model_name_or_path=bert-base-cased", "--task_name=mrpc", "--do_eval", "--output_dir=result"
     ],
     "approach": "static",
     "requirements": ["datasets", "transformers=4.21.0", "torch"],
diff --git a/neural_solution/frontend/fastapi/main_server.py b/neural_solution/frontend/fastapi/main_server.py
@@ -36,6 +36,7 @@
     get_cluster_info,
     get_cluster_table,
     get_res_during_tuning,
+    is_valid_task,
     list_to_string,
     serialize,
 )
@@ -153,10 +154,14 @@ async def submit_task(task: Task):
     Returns:
         json: status , id of task and messages.
     """
+    if not is_valid_task(task.dict()):
+        raise HTTPException(status_code=422, detail="Invalid task")
+
     msg = "Task submitted successfully"
     status = "successfully"
     # search the current
     db_path = get_db_path(config.workspace)
+
     if os.path.isfile(db_path):
         conn = sqlite3.connect(db_path)
         cursor = conn.cursor()
diff --git a/neural_solution/frontend/utility.py b/neural_solution/frontend/utility.py
@@ -295,3 +295,61 @@ def list_to_string(lst: list):
         str: string
     """
     return " ".join(str(i) for i in lst)
+
+
+def is_invalid_str(to_test_str: str):
+    """Verify whether the to_test_str is valid.
+
+    Args:
+        to_test_str (str): string to be tested.
+
+    Returns:
+        bool: valid or invalid
+    """
+    return any(char in to_test_str for char in [" ", '"', "'", "&", "|", ";", "`", ">"])
+
+
+def is_valid_task(task: dict) -> bool:
+    """Verify whether the task is valid.
+
+    Args:
+        task (dict): task request
+
+    Returns:
+        bool: valid or invalid
+    """
+    required_fields = ["script_url", "optimized", "arguments", "approach", "requirements", "workers"]
+
+    for field in required_fields:
+        if field not in task:
+            return False
+
+    if not isinstance(task["script_url"], str) or is_invalid_str(task["script_url"]):
+        return False
+
+    if (isinstance(task["optimized"], str) and task["optimized"] not in ["True", "False"]) or (
+        not isinstance(task["optimized"], str) and not isinstance(task["optimized"], bool)
+    ):
+        return False
+
+    if not isinstance(task["arguments"], list):
+        return False
+    else:
+        for argument in task["arguments"]:
+            if is_invalid_str(argument):
+                return False
+
+    if not isinstance(task["approach"], str) or task["approach"] not in ["static", "static_ipex", "dynamic", "auto"]:
+        return False
+
+    if not isinstance(task["requirements"], list):
+        return False
+    else:
+        for requirement in task["requirements"]:
+            if is_invalid_str(requirement):
+                return False
+
+    if not isinstance(task["workers"], int) or task["workers"] < 1:
+        return False
+
+    return True
diff --git a/neural_solution/test/frontend/fastapi/test_main_server.py b/neural_solution/test/frontend/fastapi/test_main_server.py
@@ -118,12 +118,26 @@ def test_get_description(self):
     def test_submit_task(self, mock_submit_task):
         task = {
             "script_url": "http://example.com/script.py",
-            "optimized": True,
+            "optimized": "True",
             "arguments": ["arg1", "arg2"],
-            "approach": "approach1",
+            "approach": "static",
+            "requirements": ["req1", "req2"],
+            "workers": 3,
+        }
+
+        # test invalid task
+        task_invalid = {
+            "script_url": "http://example.com/script.py",
+            "optimized": "True",
+            "arguments": "invalid str, should be list",
+            "approach": "static",
             "requirements": ["req1", "req2"],
             "workers": 3,
         }
+        response = client.post("/task/submit/", json=task_invalid)
+        print(response)
+        self.assertEqual(response.status_code, 422)
+        self.assertIn("arguments", response.text)
 
         # test no db case
         delete_db()
@@ -174,7 +188,7 @@ def test_get_task_by_id(self, mock_submit_task):
             "script_url": "http://example.com/script.py",
             "optimized": True,
             "arguments": ["arg1", "arg2"],
-            "approach": "approach1",
+            "approach": "static",
             "requirements": ["req1", "req2"],
             "workers": 3,
         }
@@ -200,7 +214,7 @@ def test_get_task_status_by_id(self, mock_submit_task):
             "script_url": "http://example.com/script.py",
             "optimized": True,
             "arguments": ["arg1", "arg2"],
-            "approach": "approach1",
+            "approach": "static",
             "requirements": ["req1", "req2"],
             "workers": 3,
         }
diff --git a/neural_solution/test/frontend/fastapi/test_utils.py b/neural_solution/test/frontend/fastapi/test_utils.py
@@ -10,6 +10,7 @@
     get_cluster_info,
     get_cluster_table,
     get_res_during_tuning,
+    is_valid_task,
     list_to_string,
     serialize,
 )
@@ -110,6 +111,102 @@ def test_list_to_string(self):
         expected_result = "Hello Neural Solution"
         self.assertEqual(list_to_string(lst), expected_result)
 
+    def test_is_valid_task(self):
+        task_sql_injection = {
+            "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py",
+            "optimized": "False",
+            "arguments": [],
+            "approach": "5', '6', 7, 'pending'), ('1b9ff5c2fd2143d58522bd71d18845a3', '2', 3, '4', '5', '6', 7, 'pending') ON CONFLICT (id) DO UPDATE SET id = '1b9ff5c2fd2143d58522bd71d18845a3', q_model_path = '/home/victim/.ssh' --",
+            "requirements": [],
+            "workers": 1,
+        }
+        self.assertFalse(is_valid_task(task_sql_injection))
+
+        task_cmd_injection = {
+            "script_url": 'https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py & eval "$(echo ZWNobyAiRG9tYWluIGV4cGFuc2lvbiIgPiB+L2F0dGFjay5weSI= | base64 --decode)"',
+            "optimized": "False",
+            "arguments": ["--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result"],
+            "approach": "static",
+            "requirements": [],
+            "workers": 1,
+        }
+        self.assertFalse(is_valid_task(task_cmd_injection))
+
+        task_lack_field = {
+            "optimized": "True",
+        }
+        self.assertFalse(is_valid_task(task_lack_field))
+
+        task_script_url_not_str = {
+            "script_url": ["custom_models_optimized/tf_example1"],
+            "optimized": "True",
+            "arguments": ["--dataset_location=dataset --model_path=model"],
+            "approach": "static",
+            "requirements": ["tensorflow"],
+            "workers": 1,
+        }
+        self.assertFalse(is_valid_task(task_script_url_not_str))
+
+        task_optimized_not_bool_str = {
+            "script_url": ["custom_models_optimized/tf_example1"],
+            "optimized": "True or False",
+            "arguments": ["--dataset_location=dataset", "--model_path=model"],
+            "approach": "static",
+            "requirements": ["tensorflow"],
+            "workers": 1,
+        }
+        self.assertFalse(is_valid_task(task_optimized_not_bool_str))
+
+        task_arguments_not_list = {
+            "script_url": ["custom_models_optimized/tf_example1"],
+            "optimized": "True",
+            "arguments": 123,
+            "approach": "static",
+            "requirements": ["tensorflow"],
+            "workers": 1,
+        }
+        self.assertFalse(is_valid_task(task_arguments_not_list))
+
+        task_arguments_invalid = {
+            "script_url": ["custom_models_optimized/tf_example1"],
+            "optimized": "True",
+            "arguments": ["--dataset_location=dataset --model_path=model"],
+            "approach": "static",
+            "requirements": ["tensorflow"],
+            "workers": 1,
+        }
+        self.assertFalse(is_valid_task(task_arguments_not_list))
+
+        task_approach_is_invalid = {
+            "script_url": ["custom_models_optimized/tf_example1"],
+            "optimized": "True",
+            "arguments": [],
+            "approach": "static or dynamic",
+            "requirements": ["tensorflow"],
+            "workers": 1,
+        }
+        self.assertFalse(is_valid_task(task_approach_is_invalid))
+
+        task_requirements_not_list = {
+            "script_url": ["custom_models_optimized/tf_example1"],
+            "optimized": "True",
+            "arguments": [],
+            "approach": "static",
+            "requirements": "tensorflow",
+            "workers": 1,
+        }
+        self.assertFalse(is_valid_task(task_requirements_not_list))
+
+        task_normal = {
+            "script_url": "custom_models_optimized/tf_example1",
+            "optimized": "True",
+            "arguments": ["--dataset_location=dataset", "--model_path=model"],
+            "approach": "static",
+            "requirements": ["tensorflow"],
+            "workers": 1,
+        }
+        self.assertTrue(is_valid_task(task_normal))
+
 
 if __name__ == "__main__":
     unittest.main()