py-pdf · pubpub-zz · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1985,11 +1985,13 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 process_operation(b"TL", [-operands[1]])
                 process_operation(b"Td", operands)
             elif operator == b"TJ":
+                # A TJ adjustment can be slightly smaller than the space width, so use 95% of it as the threshold.
+                _confirm_space_width = _space_width * 0.95
                 for op in operands[0]:
                     if isinstance(op, (str, bytes)):
                         process_operation(b"Tj", [op])
                     if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (abs(float(op)) >= _space_width)
+                        (abs(float(op)) >= _confirm_space_width)
                         and (len(text) > 0)
                         and (text[-1] != " ")
                     ):

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -189,3 +189,14 @@ def test_layout_mode_warnings(mock_logger_warning):
     mock_logger_warning.assert_called_with(
         "Argument visitor_text is ignored in layout mode", "pypdf._page"
     )
+
+
+@pytest.mark.enable_socket()
+def test_space_with_one_unit_smaller_than_font_width():
+    """Regression test for #1328: the first page's text keeps the spaces in the expected sentence."""
+    url = "https://github.com/py-pdf/pypdf/files/9498481/0004.pdf"
+    name = "iss1328.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    extracted = page.extract_text()
+    assert "Reporting crude oil leak.\n" in extracted