Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Missing spaces in extract_text() method (#1328) #2868

Merged
merged 10 commits into from
Sep 24, 2024
3 changes: 2 additions & 1 deletion pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1988,8 +1988,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
for op in operands[0]:
if isinstance(op, (str, bytes)):
process_operation(b"Tj", [op])
# The adjustment may be slightly smaller than the nominal space width, so treat it as a space once it reaches 95% of that width.
if isinstance(op, (int, float, NumberObject, FloatObject)) and (
(abs(float(op)) >= _space_width)
(abs(float(op) / 0.95) >= _space_width)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
and (len(text) > 0)
and (text[-1] != " ")
):
Expand Down
17 changes: 17 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,20 @@ def test_layout_mode_warnings(mock_logger_warning):
mock_logger_warning.assert_called_with(
"Argument visitor_text is ignored in layout mode", "pypdf._page"
)


@pytest.mark.enable_socket()
def test_space_with_one_unit_smaller_than_font_width():
    """Regression test for #1328: spaces must be kept in the extracted text."""
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/PyPDF2/files/9498481/0004.pdf",
        name="iss1328.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    full_text = first_page.extract_text()

    # Keep only the text between the "Description:" label and the "8/11/22" date.
    after_label = full_text.split("Description:")[1]
    description = after_label.split("8/11/22")[0].strip()

    expected = """Reporting crude oil leak.
Leak was isolated to well
pad. Segment of line was
immediately isolated, now
estimated at 5 barrels of oil
spilt. Root cause still
unknown at this time."""
    assert description == expected
Loading