Skip to content

Commit

Permalink
tests: fix ocr test assertions
Browse files Browse the repository at this point in the history
  • Loading branch information
nmammeri committed Nov 11, 2024
1 parent 8e5c919 commit eca83f5
Showing 1 changed file with 11 additions and 9 deletions.
20 changes: 11 additions & 9 deletions bindings/extractous-python/tests/test_ocr.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig
from utils import cosine_similarity


def test_ara_ocr_png():
    """Check Arabic OCR output for a PNG image against the stored reference text.

    An ``Extractor`` is configured with a Tesseract OCR config whose language
    is set to ``"ara"``. The text extracted from ``ara-ocr.png`` is then
    compared with the expected-result file using ``cosine_similarity`` instead
    of strict string equality.

    Both paths are relative, so they are resolved against the current working
    directory when the test runs.
    """
    ocr_config = TesseractOcrConfig().set_language("ara")
    extractor = Extractor().set_ocr_config(ocr_config)
    result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png")

    with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file:
        expected = file.read()

    # A bare `assert cosine_similarity(...)` passes for any non-zero score, so
    # the score is compared against an explicit threshold instead.
    assert cosine_similarity(result, expected) > 0.9


def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
test_file = "../../test_files/documents/eng-ocr.pdf"
def test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf():
test_file = "../../test_files/documents/deu-ocr.pdf"
expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt"

pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.OCR_ONLY)
Expand All @@ -26,21 +27,22 @@ def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():

result = extractor.extract_file_to_string(test_file)

with open(expected_result_file, "r", encoding="utf8") as file:
with open(expected_result_file, "r", encoding="utf8") as file:
expected = file.read()

assert cosine_similarity(result, expected)
assert cosine_similarity(result, expected) > 0.9


def test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf():
    """Check that extracting ``deu-ocr.pdf`` with the NO_OCR strategy yields no text.

    The extractor is given a PDF parser config set to ``PdfOcrStrategy.NO_OCR``
    together with a Tesseract config for language ``"deu"``. The extracted
    string must be empty once surrounding whitespace is stripped.

    NOTE(review): presumably the PDF contains only scanned images, which is
    why nothing is extracted without OCR - confirm against the fixture.
    NOTE(review): the doubled ``test_test_`` prefix in the function name looks
    accidental; the name is kept exactly as committed.
    """
    test_file = "../../test_files/documents/deu-ocr.pdf"

    pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.NO_OCR)
    ocr_config = TesseractOcrConfig().set_language("deu")

    # Build the extractor once, reusing the pdf_config prepared above rather
    # than constructing a second identical PdfParserConfig inline.
    extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config)

    result = extractor.extract_file_to_string(test_file)

    assert result.strip() == ""

0 comments on commit eca83f5

Please sign in to comment.