BUG: Handle Sequence as an IndirectObject when extracting text with l…

…ayout mode (#2788) * Handle Sequence as an IndirectObject The spec allows an int or float to be an IndirectObject as well, but this commit does not address that theoretical possibility. * Update pypdf/_text_extraction/_layout_mode/_font.py Co-authored-by: Stefan <[email protected]> * Address PR comments -Rename w_1 to w_next_entry -Utilize ParseError instead of PdfReadError -Write a test (both positive and negative) * Handle unlikely case of IndirectObjects for float/int width elements Also adds a comment to clarify that we don't explicitly handle the IndexError exception. Rather, we let it be raised as an IndexError. * Yoda condition I removed * Last commit was a bad patch, confused by non-committed changes * Use test files from URL rather than resources * Update tests/test_text_extraction.py Co-authored-by: pubpub-zz <[email protected]> * Fix code style warnings in range() call --------- Co-authored-by: Stefan <[email protected]> Co-authored-by: pubpub-zz <[email protected]>
py-pdf · Aug 5, 2024 · b2d7204 · b2d7204
1 parent 09f9b7e
commit b2d7204
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 9 deletions.
diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -1,8 +1,9 @@
 """Font constants and classes for "layout" mode text operations"""
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, Sequence, Union
+from typing import Any, Dict, Sequence, Union, cast
 
+from ...errors import ParseError
 from ...generic import IndirectObject
 from ._font_widths import STANDARD_WIDTHS
 
@@ -58,6 +59,7 @@ def __post_init__(self) -> None:
                 skip_count = 0
                 _w = d_font.get("/W", [])
                 for idx, w_entry in enumerate(_w):
+                    w_entry = w_entry.get_object()
                     if skip_count:
                         skip_count -= 1
                         continue
@@ -66,32 +68,38 @@ def __post_init__(self) -> None:
                         # warning and or use reader's "strict" to force an ex???
                         continue
                     # check for format (1): `int [int int int int ...]`
-                    if isinstance(_w[idx + 1], Sequence):
-                        start_idx, width_list = _w[idx : idx + 2]
+                    w_next_entry = _w[idx + 1].get_object()
+                    if isinstance(w_next_entry, Sequence):
+                        start_idx, width_list = w_entry, w_next_entry
                         self.width_map.update(
                             {
                                 ord_map[_cidx]: _width
                                 for _cidx, _width in zip(
-                                    range(start_idx, start_idx + len(width_list), 1),
+                                    range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1),
                                     width_list,
                                 )
                                 if _cidx in ord_map
                             }
                         )
                         skip_count = 1
                     # check for format (2): `int int int`
-                    if not isinstance(_w[idx + 1], Sequence) and not isinstance(
-                        _w[idx + 2], Sequence
-                    ):
-                        start_idx, stop_idx, const_width = _w[idx : idx + 3]
+                    elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)):
+                        start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object()
                         self.width_map.update(
                             {
                                 ord_map[_cidx]: const_width
-                                for _cidx in range(start_idx, stop_idx + 1, 1)
+                                for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1)
                                 if _cidx in ord_map
                             }
                         )
                         skip_count = 2
+                    else:
+                        # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
+                        # while expecting more elements). This raises an IndexError which is sufficient.
+                        raise ParseError(
+                            f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
+                        )  # pragma: no cover
+
         if not self.width_map and "/BaseFont" in self.font_dictionary:
             for key in STANDARD_WIDTHS:
                 if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -10,6 +10,7 @@
 
 from pypdf import PdfReader, mult
 from pypdf._text_extraction import set_custom_rtl
+from pypdf.errors import ParseError
 
 from . import get_data_from_url
 
@@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths():
         encoding="utf-8"
     )
     assert expected == reader.pages[0].extract_text(extraction_mode="layout")
+
+
+@pytest.mark.enable_socket()
+def test_layout_mode_indirect_sequence_font_widths():
+    # Cover the situation where the sequence for font widths is an IndirectObject
+    # ref https://github.com/py-pdf/pypdf/pull/2788
+    url = "https://github.com/user-attachments/files/16491621/2788_example.pdf"
+    name ="2788_example.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert reader.pages[0].extract_text(extraction_mode="layout") == ""
+    url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf"
+    name = "2788_example_malformed.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    with pytest.raises(ParseError) as exc:
+        reader.pages[0].extract_text(extraction_mode="layout")
+        assert str(exc.value).startswith("Invalid font width definition")