Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Missing spaces in extract_text() method (#1328) #2868

Merged
merged 10 commits into from
Sep 24, 2024
4 changes: 3 additions & 1 deletion pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1985,11 +1985,13 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
process_operation(b"TL", [-operands[1]])
process_operation(b"Td", operands)
elif operator == b"TJ":
# A gap that represents a space may be slightly narrower than the font's space width, so use 95% of that width as the threshold.
_confirm_space_width = _space_width * 0.95
for op in operands[0]:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
if isinstance(op, (int, float, NumberObject, FloatObject)) and (
(abs(float(op)) >= _space_width)
(abs(float(op)) >= _confirm_space_width)
and (len(text) > 0)
and (text[-1] != " ")
):
Expand Down
11 changes: 11 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,14 @@ def test_layout_mode_warnings(mock_logger_warning):
mock_logger_warning.assert_called_with(
"Argument visitor_text is ignored in layout mode", "pypdf._page"
)


@pytest.mark.enable_socket()
def test_space_with_one_unit_smaller_than_font_width():
    """Regression test for #1328 (missing spaces in extract_text()).

    Downloads the sample PDF attached to the issue, extracts the text of
    its first page, and checks that the expected sentence appears with
    its spaces intact and followed by a newline.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/9498481/0004.pdf",
        name="iss1328.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    page_text = first_page.extract_text()
    assert "Reporting crude oil leak.\n" in page_text
Loading