Skip to content

Commit

Permalink
BUG: Handle Sequence as an IndirectObject when extracting text with layout mode (#2788)
Browse files Browse the repository at this point in the history

* Handle Sequence as an IndirectObject

The spec allows an int or float to be an IndirectObject as well, but this commit does not address that theoretical possibility.

* Update pypdf/_text_extraction/_layout_mode/_font.py

Co-authored-by: Stefan <[email protected]>

* Address PR comments

- Rename w_1 to w_next_entry
- Utilize ParseError instead of PdfReadError
- Write a test (both positive and negative)

* Handle unlikely case of IndirectObjects for float/int width elements

Also adds a comment to clarify that we don't explicitly handle the IndexError exception. Rather, we let it be raised as an IndexError.

* Yoda condition I removed

* Last commit was a bad patch, confused by non-committed changes

* Use test files from URL rather than resources

* Update tests/test_text_extraction.py

Co-authored-by: pubpub-zz <[email protected]>

* Fix code style warnings in range() call

---------

Co-authored-by: Stefan <[email protected]>
Co-authored-by: pubpub-zz <[email protected]>
  • Loading branch information
3 people authored Aug 5, 2024
1 parent 09f9b7e commit b2d7204
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 9 deletions.
26 changes: 17 additions & 9 deletions pypdf/_text_extraction/_layout_mode/_font.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Font constants and classes for "layout" mode text operations"""

from dataclasses import dataclass, field
from typing import Any, Dict, Sequence, Union
from typing import Any, Dict, Sequence, Union, cast

from ...errors import ParseError
from ...generic import IndirectObject
from ._font_widths import STANDARD_WIDTHS

Expand Down Expand Up @@ -58,6 +59,7 @@ def __post_init__(self) -> None:
skip_count = 0
_w = d_font.get("/W", [])
for idx, w_entry in enumerate(_w):
w_entry = w_entry.get_object()
if skip_count:
skip_count -= 1
continue
Expand All @@ -66,32 +68,38 @@ def __post_init__(self) -> None:
# warning and or use reader's "strict" to force an ex???
continue
# check for format (1): `int [int int int int ...]`
if isinstance(_w[idx + 1], Sequence):
start_idx, width_list = _w[idx : idx + 2]
w_next_entry = _w[idx + 1].get_object()
if isinstance(w_next_entry, Sequence):
start_idx, width_list = w_entry, w_next_entry
self.width_map.update(
{
ord_map[_cidx]: _width
for _cidx, _width in zip(
range(start_idx, start_idx + len(width_list), 1),
range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1),
width_list,
)
if _cidx in ord_map
}
)
skip_count = 1
# check for format (2): `int int int`
if not isinstance(_w[idx + 1], Sequence) and not isinstance(
_w[idx + 2], Sequence
):
start_idx, stop_idx, const_width = _w[idx : idx + 3]
elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)):
start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object()
self.width_map.update(
{
ord_map[_cidx]: const_width
for _cidx in range(start_idx, stop_idx + 1, 1)
for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1)
if _cidx in ord_map
}
)
skip_count = 2
else:
# Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
# while expecting more elements). This raises an IndexError which is sufficient.
raise ParseError(
f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
) # pragma: no cover

if not self.width_map and "/BaseFont" in self.font_dictionary:
for key in STANDARD_WIDTHS:
if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
Expand Down
17 changes: 17 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from pypdf import PdfReader, mult
from pypdf._text_extraction import set_custom_rtl
from pypdf.errors import ParseError

from . import get_data_from_url

Expand Down Expand Up @@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths():
encoding="utf-8"
)
assert expected == reader.pages[0].extract_text(extraction_mode="layout")


@pytest.mark.enable_socket()
def test_layout_mode_indirect_sequence_font_widths():
    """Layout-mode extraction when the font width sequence is an IndirectObject.

    Two sample documents are downloaded: for the first one, layout-mode
    extraction of page 0 is expected to yield an empty string; for the
    malformed one, it is expected to raise ``ParseError`` with a message that
    starts with "Invalid font width definition".
    """
    # ref https://github.com/py-pdf/pypdf/pull/2788
    valid_pdf = get_data_from_url(
        "https://github.com/user-attachments/files/16491621/2788_example.pdf",
        name="2788_example.pdf",
    )
    valid_reader = PdfReader(BytesIO(valid_pdf))
    assert valid_reader.pages[0].extract_text(extraction_mode="layout") == ""

    malformed_pdf = get_data_from_url(
        "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf",
        name="2788_example_malformed.pdf",
    )
    malformed_reader = PdfReader(BytesIO(malformed_pdf))
    with pytest.raises(ParseError) as exc:
        malformed_reader.pages[0].extract_text(extraction_mode="layout")
    # Checked after the context manager exits, once the exception is captured.
    assert str(exc.value).startswith("Invalid font width definition")

0 comments on commit b2d7204

Please sign in to comment.