Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 53 additions & 44 deletions llama-index-core/llama_index/core/readers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import json
import re
import warnings
from typing import Any, Dict, Generator, List, Optional

from llama_index.core.readers.base import BaseReader
def load_data(
    self, input_file: str, extra_info: Optional[Dict] = None
) -> List[Document]:
    """Load data from the input file.

    Args:
        input_file: Path to the JSON (or JSON Lines) file to read.
        extra_info: Metadata attached to every produced Document.
            Defaults to an empty dict.

    Returns:
        A list of Documents, or an empty list (with a ``UserWarning``)
        if a ``RecursionError`` occurs anywhere while parsing or
        traversing deeply nested JSON — a defensive guard against
        maliciously nested input.
    """
    # Use a None sentinel instead of a mutable `{}` default: a shared
    # default dict would be the same object across calls AND the same
    # object attached as metadata to every Document.
    extra_info = {} if extra_info is None else extra_info
    try:
        with open(input_file, encoding="utf-8") as f:
            load_data = []
            if self.is_jsonl:
                # JSON Lines: one JSON value per line.
                for line in f:
                    load_data.append(json.loads(line.strip()))
            else:
                load_data = [json.load(f)]

        documents = []
        for data in load_data:
            if self.levels_back is None and self.clean_json is True:
                # If levels_back isn't set and clean json is set,
                # remove lines containing only formatting, so each
                # remaining line is a meaningful key/value to embed.
                json_output = json.dumps(
                    data, indent=0, ensure_ascii=self.ensure_ascii
                )
                lines = json_output.split("\n")
                useful_lines = [
                    line
                    for line in lines
                    if not re.match(r"^[{}\[\],]*$", line)
                ]
                documents.append(
                    Document(text="\n".join(useful_lines), metadata=extra_info)
                )

            elif self.levels_back is None and self.clean_json is False:
                # No cleaning: one compact JSON string per document.
                json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
                documents.append(
                    Document(text=json_output, metadata=extra_info)
                )

            elif self.levels_back is not None:
                # If levels_back is set, make the embeddings contain the
                # labels from further up the JSON tree.
                lines = [
                    *_depth_first_yield(
                        data,
                        self.levels_back,
                        self.collapse_length,
                        [],
                        self.ensure_ascii,
                    )
                ]
                documents.append(
                    Document(text="\n".join(lines), metadata=extra_info)
                )
        return documents
    except RecursionError:
        # Deeply nested JSON can exhaust the interpreter recursion limit
        # during json parsing or _depth_first_yield; per the PR author,
        # too many call sites can recurse, so one outer guard is the
        # simplest robust defense. Treat it as hostile input.
        warnings.warn("Recursion error occurred while processing JSON data.")
        return []
32 changes: 32 additions & 0 deletions llama-index-core/tests/readers/test_json.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""Test file reader."""

import json
import sys
from tempfile import TemporaryDirectory

import pytest
from llama_index.core.readers.json import JSONReader


Expand Down Expand Up @@ -93,3 +96,32 @@ def test_clean_json() -> None:
reader1 = JSONReader(clean_json=True)
data1 = reader1.load_data(file_name)
assert data1[0].get_content() == '"a": {\n"b": "c"'


def test_max_recursion_attack(tmp_path):
    """Deeply nested JSON yields [] plus a UserWarning instead of crashing."""
    saved_limit = sys.getrecursionlimit()
    try:
        # Raise the limit so building/serializing the fixture succeeds.
        sys.setrecursionlimit(5000)

        # Build 2000 levels of nesting iteratively (no recursion needed).
        nested_dict = {}
        cursor = nested_dict
        for depth in range(1, 2001):
            key = f"level{depth}"
            if depth == 2000:
                cursor[key] = "final_value"
            else:
                cursor[key] = {}
                cursor = cursor[key]

        file_name = tmp_path / "test_nested.json"
        with open(file_name, "w") as out:
            out.write(json.dumps(nested_dict))

        # Shrink the limit so traversal must hit RecursionError.
        sys.setrecursionlimit(500)
        reader = JSONReader(levels_back=1)
        with pytest.warns(UserWarning):
            data = reader.load_data(file_name)
        assert data == []

    finally:
        sys.setrecursionlimit(saved_limit)