Skip to content

Commit

Permalink
fix: non utf8 code decode close #10691 (#10698)
Browse files Browse the repository at this point in the history
Signed-off-by: yihong0618 <[email protected]>
  • Loading branch information
yihong0618 authored Nov 14, 2024
1 parent fbb9c1c commit 7229646
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 4 deletions.
8 changes: 4 additions & 4 deletions api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,14 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)

def _extract_text_from_plain_text(file_content: bytes) -> str:
try:
return file_content.decode("utf-8")
return file_content.decode("utf-8", "ignore")
except UnicodeDecodeError as e:
raise TextExtractionError("Failed to decode plain text file") from e


def _extract_text_from_json(file_content: bytes) -> str:
try:
json_data = json.loads(file_content.decode("utf-8"))
json_data = json.loads(file_content.decode("utf-8", "ignore"))
return json.dumps(json_data, indent=2, ensure_ascii=False)
except (UnicodeDecodeError, json.JSONDecodeError) as e:
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
Expand All @@ -159,7 +159,7 @@ def _extract_text_from_json(file_content: bytes) -> str:
def _extract_text_from_yaml(file_content: bytes) -> str:
"""Extract the content from yaml file"""
try:
yaml_data = yaml.safe_load_all(file_content.decode("utf-8"))
yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore"))
return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
except (UnicodeDecodeError, yaml.YAMLError) as e:
raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
Expand Down Expand Up @@ -217,7 +217,7 @@ def _extract_text_from_file(file: File):

def _extract_text_from_csv(file_content: bytes) -> str:
try:
csv_file = io.StringIO(file_content.decode("utf-8"))
csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
csv_reader = csv.reader(csv_file)
rows = list(csv_reader)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,17 @@ def test_extract_text_from_plain_text():
assert text == "Hello, world!"


def tet_extract_text_from_plain_text_non_utf8():
import tempfile

non_utf8_content = b"Hello world\xa9." # \xA9 represents © in Latin-1
with tempfile.NamedTemporaryFile(delete=True) as temp_file:
temp_file.write(non_utf8_content)
temp_file.seek(0)
text = _extract_text_from_plain_text(temp_file.read())
assert text == "Hello, world."


@patch("pypdfium2.PdfDocument")
def test_extract_text_from_pdf(mock_pdf_document):
mock_page = Mock()
Expand Down

0 comments on commit 7229646

Please sign in to comment.