From cc73f91a0300f399e7872fb9d0af3f63da75a51d Mon Sep 17 00:00:00 2001 From: ArslanSaleem Date: Thu, 2 Jan 2025 12:36:56 +0100 Subject: [PATCH 1/8] feat(security): add security config to disable it --- pandasai/agent/base.py | 5 ++- pandasai/helpers/optional.py | 45 +++++++++++++------ pandasai/pipelines/chat/code_cleaning.py | 11 ++++- pandasai/pipelines/chat/code_execution.py | 5 ++- pandasai/schemas/df_config.py | 3 +- .../helpers/test_optional_dependency.py | 15 +++++++ .../smart_datalake/test_code_cleaning.py | 30 ++++++++++--- 7 files changed, 91 insertions(+), 23 deletions(-) diff --git a/pandasai/agent/base.py b/pandasai/agent/base.py index c6203111d..0c29ada1c 100644 --- a/pandasai/agent/base.py +++ b/pandasai/agent/base.py @@ -259,7 +259,10 @@ def chat(self, query: str, output_type: Optional[str] = None): self.assign_prompt_id() - if self.check_malicious_keywords_in_query(query): + if self.config.security in [ + "standard", + "advanced", + ] and self.check_malicious_keywords_in_query(query): raise MaliciousQueryError( "The query contains references to io or os modules or b64decode method which can be used to execute or access system resources in unsafe ways." ) diff --git a/pandasai/helpers/optional.py b/pandasai/helpers/optional.py index f1d7a830a..bf43a2115 100644 --- a/pandasai/helpers/optional.py +++ b/pandasai/helpers/optional.py @@ -52,7 +52,7 @@ def get_version(module: types.ModuleType) -> str: return version -def get_environment(additional_deps: List[dict]) -> dict: +def get_environment(additional_deps: List[dict], secure: bool = True) -> dict: """ Returns the environment for the code to be executed. @@ -74,22 +74,41 @@ def get_environment(additional_deps: List[dict]) -> dict: }, } - env["pd"] = RestrictedPandas() - env["plt"] = RestrictedMatplotlib() - env["np"] = RestrictedNumpy() + if secure: + env["pd"] = RestrictedPandas() + env["plt"] = RestrictedMatplotlib() + env["np"] = RestrictedNumpy() - for lib in additional_deps: - if lib["name"] == "seaborn": - env["sns"] = RestrictedSeaborn() + for lib in additional_deps: + if lib["name"] == "seaborn": + env["sns"] = RestrictedSeaborn() - if lib["name"] == "datetime": - env["datetime"] = RestrictedDatetime() + if lib["name"] == "datetime": + env["datetime"] = RestrictedDatetime() - if lib["name"] == "json": - env["json"] = RestrictedJson() + if lib["name"] == "json": + env["json"] = RestrictedJson() - if lib["name"] == "base64": - env["base64"] = RestrictedBase64() + if lib["name"] == "base64": + env["base64"] = RestrictedBase64() + + else: + env["pd"] = import_dependency("pandas") + env["plt"] = import_dependency("matplotlib.pyplot") + env["np"] = import_dependency("numpy") + + for lib in additional_deps: + if lib["name"] == "seaborn": + env["sns"] = import_dependency("seaborn") + + if lib["name"] == "datetime": + env["datetime"] = import_dependency("datetime") + + if lib["name"] == "json": + env["json"] = import_dependency("json") + + if lib["name"] == "base64": + env["base64"] = import_dependency("base64") return env diff --git a/pandasai/pipelines/chat/code_cleaning.py b/pandasai/pipelines/chat/code_cleaning.py index 5e86975df..97acd5a88 100644 --- a/pandasai/pipelines/chat/code_cleaning.py +++ b/pandasai/pipelines/chat/code_cleaning.py @@ -121,10 +121,14 @@ def _replace_plot_png(self, code): return re.sub(r"""(['"])([^'"]*\.png)\1""", r"\1temp_chart.png\1", code) def get_code_to_run(self, code: str, context: CodeExecutionContext) -> Any: - if self._is_malicious_code(code): + if self._config.security in [ + "standard", + "advanced", + ] and self._is_malicious_code(code): raise MaliciousQueryError( "Code shouldn't use 'os', 'io' or 'chr', 'b64decode' functions as this could lead to malicious code execution." ) + code = self._replace_plot_png(code) self._current_code_executed = code @@ -471,7 +475,10 @@ def _extract_fix_dataframe_redeclarations( if target_names and self._check_is_df_declaration(node): # Construct dataframe from node code = "\n".join(code_lines) - env = get_environment(self._additional_dependencies) + env = get_environment( + self._additional_dependencies, + secure=self._config.security in ["standard", "advanced"], + ) env["dfs"] = copy.deepcopy(self._get_originals(self._dfs)) if context.skills_manager.used_skills: for skill_func_name in context.skills_manager.used_skills: diff --git a/pandasai/pipelines/chat/code_execution.py b/pandasai/pipelines/chat/code_execution.py index 65acf4656..6ee25ce22 100644 --- a/pandasai/pipelines/chat/code_execution.py +++ b/pandasai/pipelines/chat/code_execution.py @@ -153,7 +153,10 @@ def execute_code(self, code: str, context: CodeExecutionContext) -> Any: # List the required dfs, so we can avoid to run the connectors # if the code does not need them dfs = self._required_dfs(code) - environment: dict = get_environment(self._additional_dependencies) + environment: dict = get_environment( + self._additional_dependencies, + secure=self._config.security in ["standard", "advanced"], + ) environment["dfs"] = self._get_originals(dfs) if len(environment["dfs"]) == 1: environment["df"] = environment["dfs"][0] diff --git a/pandasai/schemas/df_config.py b/pandasai/schemas/df_config.py index eec8307c4..e92188b4f 100644 --- a/pandasai/schemas/df_config.py +++ b/pandasai/schemas/df_config.py @@ -1,4 +1,4 @@ -from typing import Any, List, Optional, TypedDict +from typing import Any, List, Optional, TypedDict, Literal from pandasai.constants import DEFAULT_CHART_DIRECTORY from pandasai.helpers.dataframe_serializer import DataframeSerializerType @@ -30,6 +30,7 @@ class Config(BaseModel): log_server: LogServerConfig = None direct_sql: bool = False dataframe_serializer: DataframeSerializerType = DataframeSerializerType.CSV + security: Literal["standard", "none", "advanced"] = "standard" class Config: arbitrary_types_allowed = True diff --git a/tests/unit_tests/helpers/test_optional_dependency.py b/tests/unit_tests/helpers/test_optional_dependency.py index db7ed2c2f..e06008ed7 100644 --- a/tests/unit_tests/helpers/test_optional_dependency.py +++ b/tests/unit_tests/helpers/test_optional_dependency.py @@ -9,6 +9,9 @@ import pytest from pandasai.helpers.optional import VERSIONS, get_environment, import_dependency +from pandasai.safe_libs.restricted_matplotlib import RestrictedMatplotlib +from pandasai.safe_libs.restricted_numpy import RestrictedNumpy +from pandasai.safe_libs.restricted_pandas import RestrictedPandas def test_import_optional(): @@ -91,3 +94,15 @@ def test_env_for_necessary_deps(): assert "pd" in env assert "plt" in env assert "np" in env + + +def test_env_for_security(): + env = get_environment([], secure=True) + assert "pd" in env and isinstance(env["pd"], RestrictedPandas) + assert "plt" in env and isinstance(env["plt"], RestrictedMatplotlib) + assert "np" in env and isinstance(env["np"], RestrictedNumpy) + + env = get_environment([], secure=False) + assert "pd" in env and not isinstance(env["pd"], RestrictedPandas) + assert "plt" in env and not isinstance(env["plt"], RestrictedMatplotlib) + assert "np" in env and not isinstance(env["np"], RestrictedNumpy) diff --git a/tests/unit_tests/pipelines/smart_datalake/test_code_cleaning.py b/tests/unit_tests/pipelines/smart_datalake/test_code_cleaning.py index 49169f373..436b5d848 100644 --- a/tests/unit_tests/pipelines/smart_datalake/test_code_cleaning.py +++ b/tests/unit_tests/pipelines/smart_datalake/test_code_cleaning.py @@ -577,13 +577,16 @@ def test_clean_code_using_multi_incorrect_sql_table( assert str(excinfo.value) == ("Query uses unauthorized table: table1.") @patch("pandasai.connectors.pandas.PandasConnector.head") - def test_fix_dataframe_redeclarations(self, mock_head, context: PipelineContext): + def test_fix_dataframe_redeclarations( + self, mock_head, context: PipelineContext, config: dict + ): df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) mock_head.return_value = df pandas_connector = PandasConnector({"original_df": df}) code_cleaning = CodeCleaning() code_cleaning._dfs = [pandas_connector] + code_cleaning._config = Config(**config) context.dfs = [pandas_connector] python_code = """ @@ -601,7 +604,7 @@ def test_fix_dataframe_redeclarations(self, mock_head, context: PipelineContext) @patch("pandasai.connectors.pandas.PandasConnector.head") def test_fix_dataframe_multiline_redeclarations( - self, mock_head, context: PipelineContext + self, mock_head, context: PipelineContext, config: dict ): df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) mock_head.return_value = df @@ -609,6 +612,7 @@ def test_fix_dataframe_multiline_redeclarations( code_cleaning = CodeCleaning() code_cleaning._dfs = [pandas_connector] + code_cleaning._config = Config(**config) context.dfs = [pandas_connector] python_code = """ @@ -660,7 +664,7 @@ def test_fix_dataframe_no_redeclarations(self, mock_head, context: PipelineConte @patch("pandasai.connectors.pandas.PandasConnector.head") def test_fix_dataframe_redeclarations_with_subscript( - self, mock_head, context: PipelineContext + self, mock_head, context: PipelineContext, config: dict ): df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) mock_head.return_value = df @@ -668,6 +672,7 @@ def test_fix_dataframe_redeclarations_with_subscript( code_cleaning = CodeCleaning() code_cleaning._dfs = [pandas_connector] + code_cleaning._config = Config(**config) context.dfs = [pandas_connector] python_code = """ @@ -685,7 +690,7 @@ def test_fix_dataframe_redeclarations_with_subscript( @patch("pandasai.connectors.pandas.PandasConnector.head") def test_fix_dataframe_redeclarations_with_subscript_and_data_variable( - self, mock_head, context: PipelineContext + self, mock_head, context: PipelineContext, config: dict ): data = { "country": ["China", "United States", "Japan", "Germany", "United Kingdom"], @@ -697,6 +702,7 @@ def test_fix_dataframe_redeclarations_with_subscript_and_data_variable( code_cleaning = CodeCleaning() code_cleaning._dfs = [pandas_connector] + code_cleaning._config = Config(**config) context.dfs = [pandas_connector] python_code = """ @@ -719,7 +725,7 @@ def test_fix_dataframe_redeclarations_with_subscript_and_data_variable( @patch("pandasai.connectors.pandas.PandasConnector.head") def test_fix_dataframe_redeclarations_and_data_variable( - self, mock_head, context: PipelineContext + self, mock_head, context: PipelineContext, config: Config ): data = { "country": ["China", "United States", "Japan", "Germany", "United Kingdom"], @@ -731,6 +737,7 @@ def test_fix_dataframe_redeclarations_and_data_variable( code_cleaning = CodeCleaning() code_cleaning._dfs = [pandas_connector] + code_cleaning._config = Config(**config) context.dfs = [pandas_connector] python_code = """ @@ -928,3 +935,16 @@ def test_clean_code_raise_import_with_restricted_using_import_statement( """ with pytest.raises(MaliciousQueryError): code_cleaning.execute(malicious_code, context=context, logger=logger) + + def test_clean_code_raise_not_whitelisted_lib_with_none_security( + self, + code_cleaning: CodeCleaning, + context: PipelineContext, + logger: Logger, + ): + builtins_code = """import scipy +result = {'type': 'number', 'value': set([1, 2, 3])}""" + + context.config.security = "none" + with pytest.raises(BadImportError): + code_cleaning.execute(builtins_code, context=context, logger=logger) From c58726f4c89100e7f81936b4872b7a4bab523979 Mon Sep 17 00:00:00 2001 From: ArslanSaleem Date: Thu, 2 Jan 2025 13:37:41 +0100 Subject: [PATCH 2/8] fix: linting errors --- pandasai/ee/vectorstores/chroma.py | 1 - pandasai/schemas/df_config.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandasai/ee/vectorstores/chroma.py b/pandasai/ee/vectorstores/chroma.py index de008a176..8606a9d1b 100644 --- a/pandasai/ee/vectorstores/chroma.py +++ b/pandasai/ee/vectorstores/chroma.py @@ -5,7 +5,6 @@ import chromadb from chromadb import config from chromadb.utils import embedding_functions - from pandasai.helpers.logger import Logger from pandasai.helpers.path import find_project_root from pandasai.vectorstores.vectorstore import VectorStore diff --git a/pandasai/schemas/df_config.py b/pandasai/schemas/df_config.py index e92188b4f..398c466d7 100644 --- a/pandasai/schemas/df_config.py +++ b/pandasai/schemas/df_config.py @@ -1,4 +1,4 @@ -from typing import Any, List, Optional, TypedDict, Literal +from typing import Any, List, Literal, Optional, TypedDict from pandasai.constants import DEFAULT_CHART_DIRECTORY from pandasai.helpers.dataframe_serializer import DataframeSerializerType From fdb798581b4b60acf7673bda730339fc11f0cbc8 Mon Sep 17 00:00:00 2001 From: ArslanSaleem Date: Thu, 2 Jan 2025 13:46:26 +0100 Subject: [PATCH 3/8] fix(safety): push exact match for get attributes --- pandasai/safe_libs/base_restricted_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandasai/safe_libs/base_restricted_module.py b/pandasai/safe_libs/base_restricted_module.py index 3067a3aab..65e1864bd 100644 --- a/pandasai/safe_libs/base_restricted_module.py +++ b/pandasai/safe_libs/base_restricted_module.py @@ -4,7 +4,7 @@ def wrapper(*args, **kwargs): # Check for any suspicious arguments that might be used for importing for arg in args + tuple(kwargs.values()): if isinstance(arg, str) and any( - module in arg.lower() + module == arg.lower() for module in ["io", "os", "subprocess", "sys", "importlib"] ): raise SecurityError( From 8606ea396350f256e5efd1f20175ae3fc9d9b4af Mon Sep 17 00:00:00 2001 From: ArslanSaleem Date: Thu, 2 Jan 2025 13:59:49 +0100 Subject: [PATCH 4/8] add additional test case --- tests/unit_tests/agent/test_agent.py | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/unit_tests/agent/test_agent.py b/tests/unit_tests/agent/test_agent.py index 59b9e6958..c970174f5 100644 --- a/tests/unit_tests/agent/test_agent.py +++ b/tests/unit_tests/agent/test_agent.py @@ -710,3 +710,39 @@ def test_query_detection(self, sample_df, config): for query in safe_queries: response = agent.chat(query) assert "Unfortunately, I was not able to get your answers" not in response + + def test_query_detection_disable_security(self, sample_df, config): + config["security"] = "none" + agent = Agent(sample_df, config, memory_size=10) + + # Positive cases: should detect malicious keywords + malicious_queries = [ + "import os", + "import io", + "chr(97)", + "base64.b64decode", + "file = open('file.txt', 'os')", + "os.system('rm -rf /')", + "io.open('file.txt', 'w')", + ] + + expected_malicious_response = ( + """Unfortunately, I was not able to get your answers, because of the following error:\n\n""" + """The query contains references to io or os modules or b64decode method which can be used to execute or access system resources in unsafe ways.\n""" + ) + + for query in malicious_queries: + response = agent.chat(query) + assert response != expected_malicious_response + + # Negative cases: should not detect any malicious keywords + safe_queries = [ + "print('Hello world')", + "through osmosis", + "the ionosphere", + "the capital of Norway is Oslo", + ] + + for query in safe_queries: + response = agent.chat(query) + assert "Unfortunately, I was not able to get your answers" not in response From d5eaa65211436ae13fe3468713fffe8447fc69bc Mon Sep 17 00:00:00 2001 From: ArslanSaleem Date: Thu, 2 Jan 2025 14:05:18 +0100 Subject: [PATCH 5/8] fix: test case --- tests/unit_tests/agent/test_agent.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit_tests/agent/test_agent.py b/tests/unit_tests/agent/test_agent.py index c970174f5..c1e9986fd 100644 --- a/tests/unit_tests/agent/test_agent.py +++ b/tests/unit_tests/agent/test_agent.py @@ -715,7 +715,6 @@ def test_query_detection_disable_security(self, sample_df, config): config["security"] = "none" agent = Agent(sample_df, config, memory_size=10) - # Positive cases: should detect malicious keywords malicious_queries = [ "import os", "import io", @@ -735,7 +734,6 @@ def test_query_detection_disable_security(self, sample_df, config): response = agent.chat(query) assert response != expected_malicious_response - # Negative cases: should not detect any malicious keywords safe_queries = [ "print('Hello world')", "through osmosis", From 00f35ab3fa203ec948503b520715cbec17593481 Mon Sep 17 00:00:00 2001 From: ArslanSaleem Date: Thu, 2 Jan 2025 14:10:46 +0100 Subject: [PATCH 6/8] fix: linting errors --- pandasai/ee/vectorstores/chroma.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandasai/ee/vectorstores/chroma.py b/pandasai/ee/vectorstores/chroma.py index 8606a9d1b..08cccee00 100644 --- a/pandasai/ee/vectorstores/chroma.py +++ b/pandasai/ee/vectorstores/chroma.py @@ -1,6 +1,12 @@ import os import uuid -from typing import Callable, Iterable, List, Optional, Union +from typing import ( + Callable, + Iterable, + List, + Optional, + Union, +) import chromadb from chromadb import config From 21b9ba3b8bd49e1b34c6bddcfc0eac3b37ada2b1 Mon Sep 17 00:00:00 2001 From: ArslanSaleem Date: Thu, 2 Jan 2025 14:17:43 +0100 Subject: [PATCH 7/8] fix: linting errors --- pandasai/ee/vectorstores/chroma.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandasai/ee/vectorstores/chroma.py b/pandasai/ee/vectorstores/chroma.py index 08cccee00..de008a176 100644 --- a/pandasai/ee/vectorstores/chroma.py +++ b/pandasai/ee/vectorstores/chroma.py @@ -1,16 +1,11 @@ import os import uuid -from typing import ( - Callable, - Iterable, - List, - Optional, - Union, -) +from typing import Callable, Iterable, List, Optional, Union import chromadb from chromadb import config from chromadb.utils import embedding_functions + from pandasai.helpers.logger import Logger from pandasai.helpers.path import find_project_root from pandasai.vectorstores.vectorstore import VectorStore From 23630871cdafeeb684fc2a13eaae510998b564e5 Mon Sep 17 00:00:00 2001 From: ArslanSaleem Date: Thu, 2 Jan 2025 18:47:22 +0100 Subject: [PATCH 8/8] docs(config): update config doc to add new config attribute --- docs/library.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/library.mdx b/docs/library.mdx index a1e01097d..228c6d95f 100644 --- a/docs/library.mdx +++ b/docs/library.mdx @@ -222,6 +222,7 @@ Settings: - `use_error_correction_framework`: whether to use the error correction framework. Defaults to `True`. If set to `True`, PandasAI will try to correct the errors in the code generated by the LLM with further calls to the LLM. If set to `False`, PandasAI will not try to correct the errors in the code generated by the LLM. - `max_retries`: the maximum number of retries to use when using the error correction framework. Defaults to `3`. You can use this setting to override the default number of retries. - `custom_whitelisted_dependencies`: the custom whitelisted dependencies to use. Defaults to `{}`. You can use this setting to override the default custom whitelisted dependencies. You can find more information about custom whitelisted dependencies [here](/custom-whitelisted-dependencies). +- `security`: The “security” parameter allows for three levels depending on specific use cases: “none,” “standard,” and “advanced.” "standard" and "advanced" are especially useful for detecting malicious intent from user queries and avoiding the execution of potentially harmful code. By default, the “security” is set to "standard." The security check might introduce stricter rules that could flag benign queries as harmful. You can deactivate it in the configuration by setting “security” to “none.” ## Demo in Google Colab