diff --git a/CHANGELOG.md b/CHANGELOG.md index 57b26336ec..72fc6cd3ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.5-dev1 +## 0.15.5 ### Enhancements @@ -6,6 +6,7 @@ ### Fixes +* **Revert to using `unstructured.pytesseract` fork**. Due to the unavailability of some recent release versions of `pytesseract` on PyPI, the project now uses the `unstructured.pytesseract` fork to ensure stability and continued support. * **Bump `libreoffice` verson in image.** Bumps the `libreoffice` version to `25.2.5.2` to address CVEs. * **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility. diff --git a/Makefile b/Makefile index 1708fe57ea..f856ee5526 100644 --- a/Makefile +++ b/Makefile @@ -45,7 +45,7 @@ install-test: python3 -m pip install -r requirements/test.txt # NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require # pytesseract installation into the virtual env for testing - python3 -m pip install pytesseract -c requirements/deps/constraints.txt + python3 -m pip install unstructured_pytesseract # python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt # NOTE(robinson) - Installing weaviate-client separately here because the requests # version conflicts with label_studio_sdk diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 6fd3ba8703..5e1e6b2b18 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -12,6 +12,4 @@ effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. unstructured-inference==0.7.36 -# NOTE(christine): Pinned to a specific version of pytesseract from the GitHub repository. -# Remove this pin and switch to the latest version from PyPI once version 0.3.13 or newer is officially released. -pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13 +unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 14fa114004..264e3976a9 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -135,8 +135,8 @@ packaging==23.2 # matplotlib # onnxruntime # pikepdf - # pytesseract # transformers + # unstructured-pytesseract pandas==2.2.2 # via layoutparser pdf2image==1.17.0 @@ -159,8 +159,8 @@ pillow==10.4.0 # pdfplumber # pikepdf # pillow-heif - # pytesseract # torchvision + # unstructured-pytesseract pillow-heif==0.18.0 # via -r ./extra-pdf-image.in portalocker==2.10.1 @@ -201,8 +201,6 @@ pypdf==4.3.1 # -r ./extra-pdf-image.in pypdfium2==4.30.0 # via pdfplumber -pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13 - # via -r ./extra-pdf-image.in python-dateutil==2.9.0.post0 # via # -c ./base.txt @@ -289,6 +287,8 @@ tzdata==2024.1 # via pandas unstructured-inference==0.7.36 # via -r ./extra-pdf-image.in +unstructured-pytesseract==0.3.13 + # via -r ./extra-pdf-image.in urllib3==1.26.19 # via # -c ././deps/constraints.txt diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py index 270b38c5ea..6c47c613b7 100644 --- a/test_unstructured/partition/pdf_image/test_image.py +++ b/test_unstructured/partition/pdf_image/test_image.py @@ -7,8 +7,8 @@ import pytest from PIL import Image -from pytesseract import TesseractError from unstructured_inference.inference import layout +from unstructured_pytesseract import TesseractError from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index c650db112f..e07fb23d3e 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd -import pytesseract import pytest +import unstructured_pytesseract from pdf2image.exceptions import PDFPageCountError from PIL import Image, UnidentifiedImageError from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion @@ -70,7 +70,7 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch): def test_get_ocr_layout_from_image_tesseract(monkeypatch): monkeypatch.setattr( - pytesseract, + unstructured_pytesseract, "image_to_data", lambda *args, **kwargs: pd.DataFrame( { @@ -156,7 +156,7 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch): def test_get_ocr_text_from_image_tesseract(monkeypatch): monkeypatch.setattr( - pytesseract, + unstructured_pytesseract, "image_to_string", lambda *args, **kwargs: "Hello World", ) @@ -443,7 +443,7 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch): monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000") monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000") monkeypatch.setattr( - pytesseract, + unstructured_pytesseract, "image_to_data", lambda *args, **kwargs: pd.DataFrame( { diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 0ed804e9b0..d730e5e64a 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -384,7 +384,7 @@ def test_partition_pdf_falls_back_to_fast( filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): - return dep not in ["unstructured_inference", "pytesseract"] + return dep not in ["unstructured_inference", "unstructured_pytesseract"] monkeypatch.setattr(strategies, "dependency_exists", mock_exists) @@ -406,7 +406,7 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only( filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): - return dep not in ["pytesseract"] + return dep not in ["unstructured_pytesseract"] monkeypatch.setattr(strategies, "dependency_exists", mock_exists) @@ -432,7 +432,7 @@ def test_partition_pdf_falls_back_to_hi_res_from_ocr_only( filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): - return dep not in ["pytesseract"] + return dep not in ["unstructured_pytesseract"] monkeypatch.setattr(strategies, "dependency_exists", mock_exists) monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) @@ -584,7 +584,7 @@ def test_partition_pdf_fails_if_pdf_not_processable( filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): - return dep not in ["unstructured_inference", "pytesseract"] + return dep not in ["unstructured_inference", "unstructured_pytesseract"] monkeypatch.setattr(strategies, "dependency_exists", mock_exists) monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) @@ -978,15 +978,15 @@ def test_partition_hi_res_model_name_default_to_None(): [ ( PartitionStrategy.HI_RES, - "pytesseract.image_to_data", + "unstructured_pytesseract.image_to_data", ), ( PartitionStrategy.OCR_ONLY, - "pytesseract.image_to_data", + "unstructured_pytesseract.image_to_data", ), ( PartitionStrategy.OCR_ONLY, - "pytesseract.image_to_string", + "unstructured_pytesseract.image_to_string", ), ], ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index dfec7dd472..61f494787f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.5-dev1" # pragma: no cover +__version__ = "0.15.5" # pragma: no cover diff --git a/unstructured/partition/strategies.py b/unstructured/partition/strategies.py index 2a3bc226c6..67e15cc739 100644 --- a/unstructured/partition/strategies.py +++ b/unstructured/partition/strategies.py @@ -31,7 +31,7 @@ def determine_pdf_or_image_strategy( ): """Determines what strategy to use for processing PDFs or images, accounting for fallback logic if some dependencies are not available.""" - pytesseract_installed = dependency_exists("pytesseract") + pytesseract_installed = dependency_exists("unstructured_pytesseract") unstructured_inference_installed = dependency_exists("unstructured_inference") if strategy == PartitionStrategy.AUTO: diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index 03e26eb0c0..4b4dadeaa1 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -43,7 +43,7 @@ class PartitionStrategy: UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False) -# this field is defined by pytesseract +# this field is defined by unstructured_pytesseract TESSERACT_TEXT_HEIGHT = "height" TESSERACT_LANGUAGES_SPLITTER = "+" diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index bba58c02e9..46eb8a0cbd 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -6,9 +6,9 @@ import cv2 import numpy as np import pandas as pd -import pytesseract +import unstructured_pytesseract from PIL import Image as PILImage -from pytesseract import Output +from unstructured_pytesseract import Output from unstructured.logger import trace_logger from unstructured.partition.utils.config import env_config @@ -40,14 +40,14 @@ def is_text_sorted(self): return True def get_text_from_image(self, image: PILImage.Image) -> str: - return pytesseract.image_to_string(np.array(image), lang=self.language) + return unstructured_pytesseract.image_to_string(np.array(image), lang=self.language) def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: """Get the OCR regions from image as a list of text regions with tesseract.""" trace_logger.detail("Processing entire page OCR with tesseract...") zoom = 1 - ocr_df: pd.DataFrame = pytesseract.image_to_data( + ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data( np.array(image), lang=self.language, output_type=Output.DATAFRAME, @@ -76,7 +76,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1), max_zoom, ) - ocr_df = pytesseract.image_to_data( + ocr_df = unstructured_pytesseract.image_to_data( np.array(zoom_image(image, zoom)), lang=self.language, output_type=Output.DATAFRAME, @@ -96,9 +96,9 @@ def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutE ocr_regions = self.get_layout_from_image(image) # NOTE(christine): For tesseract, the ocr_text returned by - # `pytesseract.image_to_string()` doesn't contain bounding box data but is + # `unstructured_pytesseract.image_to_string()` doesn't contain bounding box data but is # well grouped. Conversely, the ocr_layout returned by parsing - # `pytesseract.image_to_data()` contains bounding box data but is not well + # `unstructured_pytesseract.image_to_data()` contains bounding box data but is not well # grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge # the text regions in each group to create a list of layout elements.