Skip to content

Commit

Permalink
feat: replace pytesseract with unstructured.pytesseract fork (#3528)
Browse files Browse the repository at this point in the history
This PR reverts the `pytesseract` dependency to the `unstructured.pytesseract`
fork because some recent release versions of `pytesseract` are unavailable
on PyPI.

This PR also addresses an issue encountered during the publication of
`unstructured==0.15.4` to PyPI. The error was due to the fact that PyPI
does not allow direct dependencies from Version Control System URLs like
GitHub in the `install_requires` or `extras_require` sections of the
`setup.py` file.
  • Loading branch information
christinestraub authored Aug 16, 2024
1 parent e64e095 commit fc26426
Show file tree
Hide file tree
Showing 11 changed files with 30 additions and 31 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
## 0.15.5-dev1
## 0.15.5

### Enhancements

### Features

### Fixes

* **Revert to using `unstructured.pytesseract` fork**. Due to the unavailability of some recent release versions of `pytesseract` on PyPI, the project now uses the `unstructured.pytesseract` fork to ensure stability and continued support.
* **Bump `libreoffice` version in image.** Bumps the `libreoffice` version to `25.2.5.2` to address CVEs.
* **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility.

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ install-test:
python3 -m pip install -r requirements/test.txt
# NOTE(yao) - CI seems to always install tesseract to test so it would make sense to also require
# pytesseract installation into the virtual env for testing
python3 -m pip install pytesseract -c requirements/deps/constraints.txt
python3 -m pip install unstructured_pytesseract
# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
# NOTE(robinson) - Installing weaviate-client separately here because the requests
# version conflicts with label_studio_sdk
Expand Down
4 changes: 1 addition & 3 deletions requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,4 @@ effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.36
# NOTE(christine): Pinned to a specific version of pytesseract from the GitHub repository.
# Remove this pin and switch to the latest version from PyPI once version 0.3.13 or newer is officially released.
pytesseract @ git+https://github.com/madmaze/[email protected]
unstructured.pytesseract>=0.3.12
8 changes: 4 additions & 4 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ packaging==23.2
# matplotlib
# onnxruntime
# pikepdf
# pytesseract
# transformers
# unstructured-pytesseract
pandas==2.2.2
# via layoutparser
pdf2image==1.17.0
Expand All @@ -159,8 +159,8 @@ pillow==10.4.0
# pdfplumber
# pikepdf
# pillow-heif
# pytesseract
# torchvision
# unstructured-pytesseract
pillow-heif==0.18.0
# via -r ./extra-pdf-image.in
portalocker==2.10.1
Expand Down Expand Up @@ -201,8 +201,6 @@ pypdf==4.3.1
# -r ./extra-pdf-image.in
pypdfium2==4.30.0
# via pdfplumber
pytesseract @ git+https://github.com/madmaze/[email protected]
# via -r ./extra-pdf-image.in
python-dateutil==2.9.0.post0
# via
# -c ./base.txt
Expand Down Expand Up @@ -289,6 +287,8 @@ tzdata==2024.1
# via pandas
unstructured-inference==0.7.36
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.13
# via -r ./extra-pdf-image.in
urllib3==1.26.19
# via
# -c ././deps/constraints.txt
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/partition/pdf_image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

import pytest
from PIL import Image
from pytesseract import TesseractError
from unstructured_inference.inference import layout
from unstructured_pytesseract import TesseractError

from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
Expand Down
8 changes: 4 additions & 4 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

import numpy as np
import pandas as pd
import pytesseract
import pytest
import unstructured_pytesseract
from pdf2image.exceptions import PDFPageCountError
from PIL import Image, UnidentifiedImageError
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
Expand Down Expand Up @@ -70,7 +70,7 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):

def test_get_ocr_layout_from_image_tesseract(monkeypatch):
monkeypatch.setattr(
pytesseract,
unstructured_pytesseract,
"image_to_data",
lambda *args, **kwargs: pd.DataFrame(
{
Expand Down Expand Up @@ -156,7 +156,7 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch):

def test_get_ocr_text_from_image_tesseract(monkeypatch):
monkeypatch.setattr(
pytesseract,
unstructured_pytesseract,
"image_to_string",
lambda *args, **kwargs: "Hello World",
)
Expand Down Expand Up @@ -443,7 +443,7 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
monkeypatch.setattr(
pytesseract,
unstructured_pytesseract,
"image_to_data",
lambda *args, **kwargs: pd.DataFrame(
{
Expand Down
14 changes: 7 additions & 7 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def test_partition_pdf_falls_back_to_fast(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["unstructured_inference", "pytesseract"]
return dep not in ["unstructured_inference", "unstructured_pytesseract"]

monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

Expand All @@ -406,7 +406,7 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["pytesseract"]
return dep not in ["unstructured_pytesseract"]

monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

Expand All @@ -432,7 +432,7 @@ def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["pytesseract"]
return dep not in ["unstructured_pytesseract"]

monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
Expand Down Expand Up @@ -584,7 +584,7 @@ def test_partition_pdf_fails_if_pdf_not_processable(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["unstructured_inference", "pytesseract"]
return dep not in ["unstructured_inference", "unstructured_pytesseract"]

monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
Expand Down Expand Up @@ -978,15 +978,15 @@ def test_partition_hi_res_model_name_default_to_None():
[
(
PartitionStrategy.HI_RES,
"pytesseract.image_to_data",
"unstructured_pytesseract.image_to_data",
),
(
PartitionStrategy.OCR_ONLY,
"pytesseract.image_to_data",
"unstructured_pytesseract.image_to_data",
),
(
PartitionStrategy.OCR_ONLY,
"pytesseract.image_to_string",
"unstructured_pytesseract.image_to_string",
),
],
)
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.5-dev1" # pragma: no cover
__version__ = "0.15.5" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/partition/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def determine_pdf_or_image_strategy(
):
"""Determines what strategy to use for processing PDFs or images, accounting for fallback
logic if some dependencies are not available."""
pytesseract_installed = dependency_exists("pytesseract")
pytesseract_installed = dependency_exists("unstructured_pytesseract")
unstructured_inference_installed = dependency_exists("unstructured_inference")

if strategy == PartitionStrategy.AUTO:
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class PartitionStrategy:

UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)

# this field is defined by pytesseract
# this field is defined by unstructured_pytesseract
TESSERACT_TEXT_HEIGHT = "height"

TESSERACT_LANGUAGES_SPLITTER = "+"
Expand Down
14 changes: 7 additions & 7 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
import cv2
import numpy as np
import pandas as pd
import pytesseract
import unstructured_pytesseract
from PIL import Image as PILImage
from pytesseract import Output
from unstructured_pytesseract import Output

from unstructured.logger import trace_logger
from unstructured.partition.utils.config import env_config
Expand Down Expand Up @@ -40,14 +40,14 @@ def is_text_sorted(self):
return True

def get_text_from_image(self, image: PILImage.Image) -> str:
return pytesseract.image_to_string(np.array(image), lang=self.language)
return unstructured_pytesseract.image_to_string(np.array(image), lang=self.language)

def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
"""Get the OCR regions from image as a list of text regions with tesseract."""

trace_logger.detail("Processing entire page OCR with tesseract...")
zoom = 1
ocr_df: pd.DataFrame = pytesseract.image_to_data(
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
np.array(image),
lang=self.language,
output_type=Output.DATAFRAME,
Expand Down Expand Up @@ -76,7 +76,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
max_zoom,
)
ocr_df = pytesseract.image_to_data(
ocr_df = unstructured_pytesseract.image_to_data(
np.array(zoom_image(image, zoom)),
lang=self.language,
output_type=Output.DATAFRAME,
Expand All @@ -96,9 +96,9 @@ def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutE
ocr_regions = self.get_layout_from_image(image)

# NOTE(christine): For tesseract, the ocr_text returned by
# `pytesseract.image_to_string()` doesn't contain bounding box data but is
# `unstructured_pytesseract.image_to_string()` doesn't contain bounding box data but is
# well grouped. Conversely, the ocr_layout returned by parsing
# `pytesseract.image_to_data()` contains bounding box data but is not well
# `unstructured_pytesseract.image_to_data()` contains bounding box data but is not well
# grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge
# the text regions in each group to create a list of layout elements.

Expand Down

0 comments on commit fc26426

Please sign in to comment.