Merge pull request #1306 from N1ghtmarecus/refactor/regex-malicious-keywords

Improve query safety by using regex for keyword detection, and fix malicious keyword detection so that it excludes benign phrases (e.g. "through osmosis")
sinaptik-ai · Aug 13, 2024 · e15281b · e15281b
2 parents 1db8dd0 + 9737c68
commit e15281b
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 16 deletions.
diff --git a/pandasai/agent/base.py b/pandasai/agent/base.py
@@ -1,5 +1,6 @@
 import json
 import os
+import re
 import uuid
 from typing import List, Optional, Union
 
@@ -234,22 +235,11 @@ def call_llm_with_prompt(self, prompt: BasePrompt):
                 retry_count += 1
 
     def check_malicious_keywords_in_query(self, query):
-        dangerous_modules = [
-            " os",
-            " io",
-            ".os",
-            ".io",
-            "'os'",
-            "'io'",
-            '"os"',
-            '"io"',
-            "chr(",
-            "chr)",
-            "chr ",
-            "(chr",
-            "b64decode",
-        ]
-        return any(module in query for module in dangerous_modules)
+        dangerous_pattern = re.compile(
+            r"\b(os|io|chr|b64decode)\b|"
+            r"(\.os|\.io|'os'|'io'|\"os\"|\"io\"|chr\(|chr\)|chr |\(chr)"
+        )
+        return bool(dangerous_pattern.search(query))
 
     def chat(self, query: str, output_type: Optional[str] = None):
         """

diff --git a/tests/unit_tests/agent/test_agent.py b/tests/unit_tests/agent/test_agent.py
@@ -675,3 +675,38 @@ def test_(self, sample_df, config):
 The query contains references to io or os modules or b64decode method which can be used to execute or access system resources in unsafe ways.
 """
         )
+
+    def test_query_detection(self, sample_df, config):
+        agent = Agent(sample_df, config, memory_size=10)
+
+        # Positive cases: should detect malicious keywords
+        malicious_queries = [
+            "import os",
+            "import io",
+            "chr(97)",
+            "base64.b64decode",
+            "file = open('file.txt', 'os')",
+            "os.system('rm -rf /')",
+            "io.open('file.txt', 'w')",
+        ]
+
+        expected_malicious_response = (
+            """Unfortunately, I was not able to get your answers, because of the following error:\n\n"""
+            """The query contains references to io or os modules or b64decode method which can be used to execute or access system resources in unsafe ways.\n"""
+        )
+
+        for query in malicious_queries:
+            response = agent.chat(query)
+            assert response == expected_malicious_response
+
+        # Negative cases: should not detect any malicious keywords
+        safe_queries = [
+            "print('Hello world')",
+            "through osmosis",
+            "the ionosphere",
+            "the capital of Norway is Oslo",
+        ]
+
+        for query in safe_queries:
+            response = agent.chat(query)
+            assert "Unfortunately, I was not able to get your answers" not in response