From 3d26c248905f18d3ec708f12ec76d81c534c25f3 Mon Sep 17 00:00:00 2001
From: Martin Thoma
Date: Sun, 24 Dec 2023 11:18:18 +0100
Subject: [PATCH] TST: Add xfail test for #2336

---
Review notes (text in this section is not part of the commit message):
  * This patch was recovered from a copy whose line breaks and leading
    whitespace had been lost. Blank context lines were re-derived from the
    hunk line counts; the indentation of the context lines in the last hunk
    (8 spaces) and of the YAML "url:" keys (2 spaces) is assumed - TODO
    confirm against the target tree, or apply with --ignore-whitespace.
  * The author e-mail address was absent from the recovered copy, so
    "git am" may reject the mail header; "git apply" is unaffected.
  * The new test downloads its fixture, hence the enable_socket marker, and
    is marked xfail because the asserted text is not extracted yet (#2336).

 tests/example_files.yaml      |  2 ++
 tests/test_text_extraction.py | 13 +++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/tests/example_files.yaml b/tests/example_files.yaml
index f12a78444..2eae58e17 100644
--- a/tests/example_files.yaml
+++ b/tests/example_files.yaml
@@ -110,3 +110,5 @@
   url: https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42
 - local_filename: paid.pdf
   url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf
+- local_filename: Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf
+  url: https://www.joinville.sc.gov.br/wp-content/uploads/2023/11/Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 790ce6cf6..b3e922bfb 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -3,6 +3,7 @@
 
 The tested code might be in _page.py.
 """
+from io import BytesIO
 from pathlib import Path
 
 import pytest
@@ -10,6 +11,8 @@
 from pypdf import PdfReader, mult
 from pypdf._text_extraction import set_custom_rtl
 
+from . import get_data_from_url
+
 TESTS_ROOT = Path(__file__).parent.resolve()
 PROJECT_ROOT = TESTS_ROOT.parent
 RESOURCE_ROOT = PROJECT_ROOT / "resources"
@@ -99,3 +102,13 @@ def visitor_text(text, cm, tm, font_dict, font_size) -> None:
         x = matches[0]["x"]
         y = matches[0]["y"]
         assert constraint(x, y), f'Line "{text}" is wrong at x:{x}, y:{y}'
+
+
+@pytest.mark.xfail(reason="known whitespace issue #2336")
+@pytest.mark.enable_socket()
+def test_issue_2336():
+    name = "Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(name=name)))
+    page = reader.pages[0]
+    actual_text = page.extract_text()
+    assert "Beira Rio" in actual_text