Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 53 additions & 44 deletions llama-index-core/llama_index/core/readers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import json
import re
import warnings
from typing import Any, Dict, Generator, List, Optional

from llama_index.core.readers.base import BaseReader
def load_data(
    self, input_file: str, extra_info: Optional[Dict] = None
) -> List[Document]:
    """Load data from the input file.

    Args:
        input_file: Path to the JSON (or JSON Lines) file to read.
        extra_info: Metadata attached to every produced Document.
            Defaults to an empty dict.

    Returns:
        A list of Documents, or an empty list (with a ``UserWarning``)
        if a ``RecursionError`` occurs anywhere while parsing or
        traversing deeply nested JSON — a defensive guard against
        maliciously nested input.
    """
    # Use a None sentinel instead of a mutable `{}` default: a shared
    # default dict would be the same object across calls AND the same
    # object attached as metadata to every Document.
    extra_info = {} if extra_info is None else extra_info
    try:
        with open(input_file, encoding="utf-8") as f:
            load_data = []
            if self.is_jsonl:
                # JSON Lines: one JSON value per line.
                for line in f:
                    load_data.append(json.loads(line.strip()))
            else:
                load_data = [json.load(f)]

        documents = []
        for data in load_data:
            if self.levels_back is None and self.clean_json is True:
                # If levels_back isn't set and clean json is set,
                # remove lines containing only formatting, so each
                # remaining line is a meaningful key/value to embed.
                json_output = json.dumps(
                    data, indent=0, ensure_ascii=self.ensure_ascii
                )
                lines = json_output.split("\n")
                useful_lines = [
                    line
                    for line in lines
                    if not re.match(r"^[{}\[\],]*$", line)
                ]
                documents.append(
                    Document(text="\n".join(useful_lines), metadata=extra_info)
                )

            elif self.levels_back is None and self.clean_json is False:
                # No cleaning: one compact JSON string per document.
                json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
                documents.append(
                    Document(text=json_output, metadata=extra_info)
                )

            elif self.levels_back is not None:
                # If levels_back is set, make the embeddings contain the
                # labels from further up the JSON tree.
                lines = [
                    *_depth_first_yield(
                        data,
                        self.levels_back,
                        self.collapse_length,
                        [],
                        self.ensure_ascii,
                    )
                ]
                documents.append(
                    Document(text="\n".join(lines), metadata=extra_info)
                )
        return documents
    except RecursionError:
        # Deeply nested JSON can exhaust the interpreter recursion limit
        # during json parsing or _depth_first_yield; per the PR author,
        # too many call sites can recurse, so one outer guard is the
        # simplest robust defense. Treat it as hostile input.
        warnings.warn("Recursion error occurred while processing JSON data.")
        return []
32 changes: 32 additions & 0 deletions llama-index-core/tests/readers/test_json.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""Test file reader."""

import json
import sys
from tempfile import TemporaryDirectory

import pytest
from llama_index.core.readers.json import JSONReader


Expand Down Expand Up @@ -93,3 +96,32 @@ def test_clean_json() -> None:
reader1 = JSONReader(clean_json=True)
data1 = reader1.load_data(file_name)
assert data1[0].get_content() == '"a": {\n"b": "c"'


def test_max_recursion_attack(tmp_path):
    """Deeply nested JSON yields [] plus a UserWarning instead of crashing."""
    saved_limit = sys.getrecursionlimit()
    try:
        # Raise the limit so building/serializing the fixture succeeds.
        sys.setrecursionlimit(5000)

        # Build 2000 levels of nesting iteratively (no recursion needed).
        nested_dict = {}
        cursor = nested_dict
        for depth in range(1, 2001):
            key = f"level{depth}"
            if depth == 2000:
                cursor[key] = "final_value"
            else:
                cursor[key] = {}
                cursor = cursor[key]

        file_name = tmp_path / "test_nested.json"
        with open(file_name, "w") as out:
            out.write(json.dumps(nested_dict))

        # Shrink the limit so traversal must hit RecursionError.
        sys.setrecursionlimit(500)
        reader = JSONReader(levels_back=1)
        with pytest.warns(UserWarning):
            data = reader.load_data(file_name)
        assert data == []

    finally:
        sys.setrecursionlimit(saved_limit)