diff --git a/CHANGELOG.md b/CHANGELOG.md index d0d6aaec76..60e1a9b0d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.14.6-dev7 +## 0.14.6 ### Enhancements +* **Bump unstructured-inference==0.7.35** Fix syntax for generated HTML tables. + ### Features * **tqdm ingest support** add optional flag to ingest flow to print out progress bar of each step in the process. diff --git a/requirements/dev.txt b/requirements/dev.txt index 84a5416227..da3ab91074 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -82,7 +82,7 @@ executing==2.0.1 # via stack-data fastjsonschema==2.19.1 # via nbformat -filelock==3.14.0 +filelock==3.15.1 # via virtualenv fqdn==1.5.1 # via jsonschema diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 5b0e68cdbf..d8c470beaa 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -121,7 +121,7 @@ opencv-python==4.8.0.76 # -c ././deps/constraints.txt # imgaug # unstructured-paddleocr -openpyxl==3.1.3 +openpyxl==3.1.4 # via unstructured-paddleocr packaging==23.2 # via diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index a4c7910a5b..19414ad3c9 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -12,7 +12,7 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.7.33 +unstructured-inference==0.7.35 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats # from one tesseract call unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index ef1d4852e5..1eae297670 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -32,7 +32,7 @@ deprecated==1.2.14 # via pikepdf effdet==0.4.1 # via -r ./extra-pdf-image.in -filelock==3.14.0 +filelock==3.15.1 # via # huggingface-hub # torch @@ -287,7 +287,7 @@ typing-extensions==4.12.2 # torch tzdata==2024.1 # via pandas -unstructured-inference==0.7.33 +unstructured-inference==0.7.35 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.12 # via diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 44fc938c07..85e83c6f22 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -13,7 +13,7 @@ numpy==1.26.4 # -c ././deps/constraints.txt # -c ./base.txt # pandas -openpyxl==3.1.3 +openpyxl==3.1.4 # via -r ./extra-xlsx.in pandas==2.2.2 # via -r ./extra-xlsx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 123dd5d15c..f860995ccd 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -17,7 +17,7 @@ click==8.1.7 # via # -c ./base.txt # sacremoses -filelock==3.14.0 +filelock==3.15.1 # via # huggingface-hub # torch diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index cd6219723b..d53d88336f 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -52,7 +52,7 @@ exceptiongroup==1.2.1 # via anyio fastapi==0.110.3 # via chromadb -filelock==3.14.0 +filelock==3.15.1 # via huggingface-hub flatbuffers==24.3.25 # via onnxruntime diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt index fa3b83b069..a0cacae20c 100644 --- a/requirements/ingest/clarifai.txt +++ b/requirements/ingest/clarifai.txt @@ -15,7 +15,7 @@ charset-normalizer==3.3.2 # requests clarifai==10.5.0 # via -r ./ingest/clarifai.in -clarifai-grpc==10.5.1 +clarifai-grpc==10.5.2 # via clarifai contextlib2==21.6.0 # via schema diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 44c654cf4f..1141796378 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -31,7 +31,7 @@ dataclasses-json==0.6.7 # via # -c ./ingest/../base.txt # langchain-community -filelock==3.14.0 +filelock==3.15.1 # via # huggingface-hub # torch diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index 220b9411a7..589adc381d 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -38,7 +38,7 @@ idna==3.7 # anyio # httpx # requests -openai==1.33.0 +openai==1.34.0 # via -r ./ingest/embed-octoai.in pydantic==2.7.4 # via openai diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index d0127e634e..81330081c9 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -98,7 +98,7 @@ numpy==1.26.4 # -c ./ingest/../deps/constraints.txt # langchain # langchain-community -openai==1.33.0 +openai==1.34.0 # via -r ./ingest/embed-openai.in orjson==3.10.4 # via langsmith diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py index 7e5b59ab05..55018cb3a8 100644 --- a/test_unstructured/partition/pdf_image/test_image.py +++ b/test_unstructured/partition/pdf_image/test_image.py @@ -155,7 +155,8 @@ def test_partition_image_with_table_extraction( ) table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html] assert len(table) == 1 - assert "
" in table[0] + assert "" in table[0] + assert "" in table[0] def test_partition_image_with_multipage_tiff( @@ -180,7 +181,8 @@ def test_partition_image_with_bmp( ) table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html] assert len(table) == 1 - assert "
" in table[0] + assert "" in table[0] + assert "" in table[0] def test_partition_image_with_language_passed(filename="example-docs/example.jpg"): @@ -657,7 +659,8 @@ def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode): ) table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html] assert len(table) == 1 - assert "
" in table[0] + assert "" in table[0] + assert "" in table[0] assert "Layouts of history Japanese documents" in table[0] assert "Layouts of scanned modern magazines and scientific reports" in table[0] diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 8ada03d078..c6750ff9e7 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -494,7 +494,8 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode): table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html] assert elements[0].metadata.languages == ["kor"] assert len(table) == 2 - assert "
" in table[0] + assert "" in table[0] + assert "" in table[0] # FIXME(yuming): didn't test full sentence here since unit test and docker test have # some differences on spaces between characters assert "업" in table[0] @@ -535,7 +536,8 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode): ) table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html] assert len(table) == 2 - assert "
" in table[0] + assert "" in table[0] + assert "" in table[0] assert "Layouts of history Japanese documents" in table[0] assert "Layouts of scanned modern magazines and scientific report" in table[0] assert "Layouts of scanned US newspapers from the 20th century" in table[0] diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index afcbd14f7e..ce3d442ef2 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1272,7 +1272,8 @@ def test_partition_image_with_bmp_with_auto( ) table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html] assert len(table) == 1 - assert "
" in table[0] + assert "" in table[0] + assert "" in table[0] def test_auto_partition_eml_add_signature_to_metadata(): diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index f00183ac2b..b94674c1f7 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -48,7 +48,7 @@ "element_id": "dddac446da6c93dc1449ecb5d997c423", "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century \u2018TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", "metadata": { - "text_as_html": "
Dataset| Base Model!|Large Model| Notes
PubLayNet [33]P/MMLayouts of modern scientific documents
PRImA [3]MLayouts of scanned modern magazines and scientific reports
Newspaper [17]PLayouts of scanned US newspapers from the 20th century
TableBank [18]PTable region on modern scientific and business document
HIDataset [31]P/MLayouts of history Japanese documents
", + "text_as_html": "
Dataset| Base Model!|Large Model| Notes
PubLayNet [33]P/MMLayouts of modern scientific documents
PRImA [3]MLayouts of scanned modern magazines and scientific reports
Newspaper [17]PLayouts of scanned US newspapers from the 20th century
TableBank [18]PTable region on modern scientific and business document
HIDataset [31]P/MLayouts of history Japanese documents
", "filetype": "image/jpeg", "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 03a9917a81..94be393544 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -840,7 +840,7 @@ "element_id": "2a62c55be8401908c18140e858ec3345", "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scienti\ufb01c documents Layouts of scanned modern magazines and scienti\ufb01c reports Layouts of scanned US newspapers from the 20th century Table region on modern scienti\ufb01c and business document Layouts of history Japanese documents", "metadata": { - "text_as_html": "
Dataset| Base Model'|| Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMLayouts of scanned modern magazines and scientific report
NewspaperFLayouts of scanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
", + "text_as_html": "
Dataset| Base Model'|| Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMLayouts of scanned modern magazines and scientific report
NewspaperFLayouts of scanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
", "filetype": "application/pdf", "languages": [ "eng" @@ -1391,7 +1391,7 @@ "element_id": "64bc79d1132a89c71837f420d6e4e2dc", "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2\u2019s absolute coordinates block.crop image(image) Obtain the image segments in the block region", "metadata": { - "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
", + "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
", "filetype": "application/pdf", "languages": [ "eng" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d1884901fb..283ba1c53b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.6-dev7" # pragma: no cover +__version__ = "0.14.6" # pragma: no cover