Skip to content

Commit

Permalink
community: fixes json loader not getting texts with json standard (#2…
Browse files Browse the repository at this point in the history
…7327)

This PR fixes JSONLoader._get_text not converting objects to JSON strings
correctly.
If an object is serializable and is not a dict, JSONLoader uses Python's
built-in str() function to convert it to a string. The resulting string
may not follow the JSON standard. For example, a list is converted to a
string with single quotes (str(["a"]) yields ['a']), and if json.loads
tries to load that string, it raises an error.

---------

Co-authored-by: Erick Friis <[email protected]>
  • Loading branch information
TonyBotongChu and efriis authored Dec 12, 2024
1 parent 4149c0d commit 13c3c4a
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def _get_text(self, sample: Any) -> str:
# In case the text is None, set it to an empty string
elif isinstance(content, str):
return content
elif isinstance(content, dict):
elif isinstance(content, (dict, list)):
return json.dumps(content) if content else ""
else:
return str(content) if content is not None else ""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import io
from pathlib import Path
from typing import Any, Dict

import pytest
Expand All @@ -12,7 +13,7 @@


def test_load_valid_string_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
Expand All @@ -37,7 +38,7 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:


def test_load_valid_dict_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content='{"text": "value1"}',
Expand All @@ -64,7 +65,7 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:


def test_load_valid_bool_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="False",
Expand Down Expand Up @@ -93,7 +94,7 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:


def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="99",
Expand Down Expand Up @@ -122,7 +123,7 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:


def test_load_invalid_test_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())

mocker.patch("builtins.open", mocker.mock_open())
mocker.patch(
Expand All @@ -139,7 +140,7 @@ def test_load_invalid_test_content(mocker: MockerFixture) -> None:


def test_load_jsonlines(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
Expand Down Expand Up @@ -177,7 +178,7 @@ def test_load_jsonlines(mocker: MockerFixture) -> None:
),
)
def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
Expand Down Expand Up @@ -250,7 +251,7 @@ def test_json_meta_01(
mocker.patch("builtins.open", mocker.mock_open())
mocker.patch(patch_func, return_value=patch_func_value)

file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
Expand Down Expand Up @@ -300,7 +301,7 @@ def test_json_meta_02(
mocker.patch("builtins.open", mocker.mock_open())
mocker.patch(patch_func, return_value=patch_func_value)

file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
Expand Down Expand Up @@ -336,7 +337,7 @@ def metadata_func(record: Dict, metadata: Dict) -> Dict:
def test_load_json_with_jq_parsable_content_key(
params: Dict, mocker: MockerFixture
) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
Expand Down Expand Up @@ -364,7 +365,7 @@ def test_load_json_with_jq_parsable_content_key(


def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="message1",
Expand Down Expand Up @@ -401,7 +402,7 @@ def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) ->
def test_load_json_with_nested_jq_parsable_content_key_with_metadata(
mocker: MockerFixture,
) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="message1",
Expand Down

0 comments on commit 13c3c4a

Please sign in to comment.