From f440eb476cf75d6109e8a3719cadf893529dcef8 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 28 Aug 2024 10:19:58 -0400 Subject: [PATCH] feat: Support encoding parameter in partition_csv (#3564) See added test file. Added support for the encoding parameter, which can be passed directly to `pd.read_csv`. --- CHANGELOG.md | 4 +++- example-docs/stanley-cups-utf-16.csv | Bin 0 -> 174 bytes test_unstructured/partition/test_csv.py | 8 ++++++++ typings/pandas/io/parsers/readers.pyi | 1 + unstructured/__version__.py | 2 +- unstructured/partition/auto.py | 1 + unstructured/partition/csv.py | 14 ++++++++++++-- 7 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 example-docs/stanley-cups-utf-16.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index d238b721c9..d06493d428 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ -## 0.15.9-dev0 +## 0.15.9-dev1 ### Enhancements ### Features +* **Add support for encoding parameter in partition_csv** + ### Fixes * **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with TemporaryFileDirectory to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile diff --git a/example-docs/stanley-cups-utf-16.csv b/example-docs/stanley-cups-utf-16.csv new file mode 100644 index 0000000000000000000000000000000000000000..b152e27aac8e0aa3e0b619b0879de279c3af47d5 GIT binary patch literal 174 zcmZ{eOA5k35Jca4PLTs-W4wU@QAkisE)YaPFdy<+VlrTdgi6-bpAC12WgAVjZ(=d+&gJU{Z)UA$(0jllgy-)Jj6dF QUT<$c<4w(lTWs$MKQss(*Z=?k literal 0 HcmV?d00001 diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py index 4360b4771c..e1d3dc0bd9 100644 --- a/test_unstructured/partition/test_csv.py +++ b/test_unstructured/partition/test_csv.py @@ -74,6 +74,12 @@ def test_partition_csv_from_filename_with_metadata_filename(): assert elements[0].metadata.filename == "test" +def test_partition_csv_with_encoding(): + elements = partition_csv(example_doc_path("stanley-cups-utf-16.csv"), encoding="utf-16") + + assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT + + @pytest.mark.parametrize( ("filename", "expected_text", "expected_table"), [ @@ -279,6 +285,7 @@ def it_provides_a_validating_alternate_constructor(self): ctx = _CsvPartitioningContext.load( file_path=example_doc_path("stanley-cups.csv"), file=None, + encoding=None, metadata_file_path=None, metadata_last_modified=None, include_header=True, @@ -292,6 +299,7 @@ def and_the_validating_constructor_raises_on_an_invalid_context(self): _CsvPartitioningContext.load( file_path=None, file=None, + encoding=None, metadata_file_path=None, metadata_last_modified=None, include_header=True, diff --git a/typings/pandas/io/parsers/readers.pyi b/typings/pandas/io/parsers/readers.pyi index eb79991c3b..a22076b564 100644 --- a/typings/pandas/io/parsers/readers.pyi +++ b/typings/pandas/io/parsers/readers.pyi @@ -7,6 +7,7 @@ from pandas.core.frame import DataFrame def read_csv( filepath_or_buffer: str | IO[bytes], *, + encoding: str | None = ..., sep: str | None = ..., header: int | None | Literal["infer"] = ..., ) -> DataFrame: ... diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ec2035ad54..5b85cacf11 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.9-dev0" # pragma: no cover +__version__ = "0.15.9-dev1" # pragma: no cover diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 21c15d2f44..35cbc37ab4 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -207,6 +207,7 @@ def partition( elements = partition_csv( filename=filename, file=file, + encoding=encoding, infer_table_structure=infer_table_structure, languages=languages, detect_language_per_element=detect_language_per_element, diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index e3c177bb10..9d250505a7 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -29,6 +29,7 @@ def partition_csv( filename: str | None = None, file: IO[bytes] | None = None, + encoding: str | None = None, metadata_filename: str | None = None, metadata_last_modified: str | None = None, include_header: bool = False, @@ -47,6 +48,8 @@ def partition_csv( A string defining the target filename path. file A file-like object using "rb" mode --> open(filename, "rb"). + encoding + The encoding method used to decode the text input. If None, utf-8 will be used. metadata_filename The filename to use for the metadata. metadata_last_modified @@ -73,6 +76,7 @@ def partition_csv( ctx = _CsvPartitioningContext( file_path=filename, file=file, + encoding=encoding, metadata_file_path=metadata_filename, metadata_last_modified=metadata_last_modified, include_header=include_header, @@ -81,7 +85,7 @@ def partition_csv( ) with ctx.open() as file: - dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter) + dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding) html_text = dataframe.to_html(index=False, header=include_header, na_rep="") text = soupparser_fromstring(html_text).text_content() @@ -110,6 +114,7 @@ def __init__( self, file_path: str | None = None, file: IO[bytes] | None = None, + encoding: str | None = None, metadata_file_path: str | None = None, metadata_last_modified: str | None = None, include_header: bool = False, @@ -118,6 +123,7 @@ def __init__( ): self._file_path = file_path self._file = file + self._encoding = encoding self._metadata_file_path = metadata_file_path self._metadata_last_modified = metadata_last_modified self._include_header = include_header @@ -129,6 +135,7 @@ def load( cls, file_path: str | None, file: IO[bytes] | None, + encoding: str | None, metadata_file_path: str | None, metadata_last_modified: str | None, include_header: bool, @@ -138,6 +145,7 @@ def load( return cls( file_path=file_path, file=file, + encoding=encoding, metadata_file_path=metadata_file_path, metadata_last_modified=metadata_last_modified, include_header=include_header, @@ -156,7 +164,9 @@ def delimiter(self) -> str | None: with self.open() as file: # -- read whole lines, sniffer can be confused by a trailing partial line -- - data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes)) + data = "\n".join( + ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes) + ) try: return sniffer.sniff(data, delimiters=",;").delimiter