From abb5130723fa82aa3ee5f55e44f475c6dbe75629 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Jun 2025 15:24:04 +0200 Subject: [PATCH 1/5] MAINT: Refactor _page.py This is an experiment to see how well Github Copilot works --- pypdf/__init__.py | 3 +- pypdf/_page.py | 383 +-------------------------- pypdf/_text_extractor.py | 557 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 573 insertions(+), 370 deletions(-) create mode 100644 pypdf/_text_extractor.py diff --git a/pypdf/__init__.py b/pypdf/__init__.py index 87accc401..d2812817e 100644 --- a/pypdf/__init__.py +++ b/pypdf/__init__.py @@ -11,8 +11,9 @@ from ._doc_common import DocumentInformation from ._encryption import PasswordType from ._merger import PdfMerger -from ._page import PageObject, Transformation, mult +from ._page import PageObject, Transformation from ._reader import PdfReader +from ._text_extraction import mult from ._version import __version__ from ._writer import ObjectDeletionFlag, PdfWriter from .constants import ImageType diff --git a/pypdf/_page.py b/pypdf/_page.py index 59f7bceca..9a429a035 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -54,17 +54,12 @@ build_font_width_map, compute_font_width, get_actual_str_key, - unknown_char_map, ) from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( - OrientationNotFoundError, _layout_mode, - crlf_space_check, - get_display_str, - get_text_operands, - mult, ) +from ._text_extractor import TextExtraction from ._utils import ( CompressedTransformationMatrix, TransformationMatrixType, @@ -92,7 +87,6 @@ PdfObject, RectangleObject, StreamObject, - TextStringObject, is_null_or_none, ) @@ -1698,42 +1692,6 @@ def _get_actual_font_widths( font_widths += compute_font_width(font_width_map, char) return (font_widths * font_size, space_width * font_size, font_size) - def _handle_tj( - self, - text: str, - operands: List[Union[str, TextStringObject]], - cm_matrix: List[float], - tm_matrix: List[float], - cmap: Tuple[ - Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] - ], - orientations: Tuple[int, ...], - font_size: float, - rtl_dir: bool, - visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], - space_width: float, - actual_str_size: Dict[str, float] - ) -> Tuple[str, bool, Dict[str, float]]: - text_operands, is_str_operands = get_text_operands( - operands, cm_matrix, tm_matrix, cmap, orientations) - if is_str_operands: - text += text_operands - else: - text, rtl_dir = get_display_str( - text, - cm_matrix, - tm_matrix, # text matrix - cmap, - text_operands, - font_size, - rtl_dir, - visitor_text) - font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = ( - self._get_actual_font_widths(cmap, text_operands, font_size, space_width)) - actual_str_size["str_widths"] += font_widths - - return text, rtl_dir, actual_str_size - def _extract_text( self, obj: Any, @@ -1754,332 +1712,19 @@ def _extract_text( default = "/Content" """ - text: str = "" - output: str = "" - rtl_dir: bool = False # right-to-left - cmaps: Dict[ - str, - Tuple[ - str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject - ], - ] = {} - - try: - objr = obj - while NameObject(PG.RESOURCES) not in objr: - # /Resources can be inherited so we look to parents - objr = objr["/Parent"].get_object() - # If no parents then no /Resources will be available, - # so an exception will be raised - resources_dict = cast(DictionaryObject, objr[PG.RESOURCES]) - except Exception: - # No resources means no text is possible (no font); we consider the - # file as not damaged, no need to check for TJ or Tj - return "" - - if "/Font" in resources_dict and (font := resources_dict["/Font"]): - for f in cast(DictionaryObject, font): - cmaps[f] = build_char_map(f, space_width, obj) - cmap: Tuple[ - Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] - ] = ( - "charmap", - {}, - "NotInitialized", - None, - ) # (encoding, CMAP, font resource name, font) - - try: - content = ( - obj[content_key].get_object() if isinstance(content_key, str) else obj - ) - if not isinstance(content, ContentStream): - content = ContentStream(content, pdf, "bytes") - except (AttributeError, KeyError): # no content can be extracted (certainly empty page) - return "" - # We check all strings are TextStringObjects. ByteStringObjects - # are strings where the byte->string encoding was unknown, so adding - # them to the text here would be gibberish. - - cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - cm_stack = [] - - # Store the last modified matrices; can be an intermediate position - cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - - # Store the position at the beginning of building the text - memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - - char_scale = 1.0 - space_scale = 1.0 - _space_width: float = 500.0 # will be set correctly at first Tf - _actual_str_size: Dict[str, float] = { - "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0 - } # will be set to string length calculation result - TL = 0.0 - font_size = 12.0 # init just in case of - - def compute_str_widths(str_widths: float) -> float: - return str_widths / 1000 - - def process_operation(operator: bytes, operands: List[Any]) -> None: - nonlocal cm_matrix, tm_matrix, cm_stack, cm_prev, tm_prev, memo_cm, memo_tm - nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap - nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size - - str_widths: float = 0.0 - - # Table 5.4 page 405 - if operator == b"BT": # Begin Text - tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - # Flush text: - output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - return - if operator == b"ET": # End Text - # Flush text: - output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - - # Table 4.7 "Graphics state operators", page 219 - # cm_matrix calculation is reserved for later - elif operator == b"q": # Save graphics state - cm_stack.append( - ( - cm_matrix, - cmap, - font_size, - char_scale, - space_scale, - _space_width, - TL, - ) - ) - elif operator == b"Q": # Restore graphics state - try: - ( - cm_matrix, - cmap, - font_size, - char_scale, - space_scale, - _space_width, - TL, - ) = cm_stack.pop() - except Exception: - cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - elif operator == b"cm": # Modify current matrix - output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - text = "" - try: - cm_matrix = mult( - [float(operand) for operand in operands[:6]], - cm_matrix - ) - except Exception: - cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - - # Table 5.2 page 398 - elif operator == b"Tz": # Set horizontal text scaling - char_scale = float(operands[0]) / 100 if operands else 1.0 - elif operator == b"Tw": # Set word spacing - space_scale = 1.0 + float(operands[0] if operands else 0.0) - elif operator == b"TL": # Set Text Leading - scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2) - TL = float(operands[0] if operands else 0.0) * font_size * scale_x - elif operator == b"Tf": # Set font size - if text != "": - output += text # .translate(cmap) - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - try: - # char_map_tuple: font_type, - # float(sp_width / 2), - # encoding, - # map_dict, - # font_dict (describes the font) - char_map_tuple = cmaps[operands[0]] - # current cmap: encoding, - # map_dict, - # font resource name (internal name, not the real font name), - # font_dict - cmap = ( - char_map_tuple[2], - char_map_tuple[3], - operands[0], - char_map_tuple[4], - ) - _space_width = char_map_tuple[1] - except KeyError: # font not found - cmap = ( - unknown_char_map[2], - unknown_char_map[3], - f"???{operands[0]}", - None, - ) - _space_width = unknown_char_map[1] - try: - font_size = float(operands[1]) - except Exception: - pass # keep previous size - # Table 5.5 page 406 - elif operator == b"Td": # Move text position - # A special case is a translating only tm: - # tm = [1, 0, 0, 1, e, f] - # i.e. tm[4] += tx, tm[5] += ty. - tx, ty = float(operands[0]), float(operands[1]) - tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] - tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] - str_widths = compute_str_widths(_actual_str_size["str_widths"]) - _actual_str_size["str_widths"] = 0.0 - elif operator == b"Tm": # Set text matrix - tm_matrix = [float(operand) for operand in operands[:6]] - str_widths = compute_str_widths(_actual_str_size["str_widths"]) - _actual_str_size["str_widths"] = 0.0 - elif operator == b"T*": # Move to next line - tm_matrix[4] -= TL * tm_matrix[2] - tm_matrix[5] -= TL * tm_matrix[3] - str_widths = compute_str_widths(_actual_str_size["str_widths"]) - _actual_str_size["str_widths"] = 0.0 - elif operator == b"Tj": # Show text - text, rtl_dir, _actual_str_size = self._handle_tj( - text, - operands, - cm_matrix, - tm_matrix, - cmap, - orientations, - font_size, - rtl_dir, - visitor_text, - _space_width, - _actual_str_size, - ) - else: - return - - if operator in {b"Td", b"Tm", b"T*", b"Tj"}: - try: - text, output, cm_prev, tm_prev = crlf_space_check( - text, - (cm_prev, tm_prev), - (cm_matrix, tm_matrix), - (memo_cm, memo_tm), - cmap, - orientations, - output, - font_size, - visitor_text, - str_widths, - compute_str_widths(_actual_str_size["space_width"]), - _actual_str_size["str_height"] - ) - if text == "": - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - except OrientationNotFoundError: - return - - for operands, operator in content.operations: - if visitor_operand_before is not None: - visitor_operand_before(operator, operands, cm_matrix, tm_matrix) - # Multiple operators are handled here - if operator == b"'": - process_operation(b"T*", []) - process_operation(b"Tj", operands) - elif operator == b'"': - process_operation(b"Tw", [operands[0]]) - process_operation(b"Tc", [operands[1]]) - process_operation(b"T*", []) - process_operation(b"Tj", operands[2:]) - elif operator == b"TJ": - # The space width may be smaller than the font width, so the width should be 95%. - _confirm_space_width = _space_width * 0.95 - if operands: - for op in operands[0]: - if isinstance(op, (str, bytes)): - process_operation(b"Tj", [op]) - if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - abs(float(op)) >= _confirm_space_width - and text - and text[-1] != " " - ): - process_operation(b"Tj", [" "]) - elif operator == b"TD": - process_operation(b"TL", [-operands[1]]) - process_operation(b"Td", operands) - elif operator == b"Do": - output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - try: - if output[-1] != "\n": - output += "\n" - if visitor_text is not None: - visitor_text( - "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - except IndexError: - pass - try: - xobj = resources_dict["/XObject"] - if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore - text = self.extract_xform_text( - xobj[operands[0]], # type: ignore - orientations, - space_width, - visitor_operand_before, - visitor_operand_after, - visitor_text, - ) - output += text - if visitor_text is not None: - visitor_text( - text, - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - except Exception as exception: - logger_warning( - f"Impossible to decode XFormObject {operands[0]}: {exception}", - __name__, - ) - finally: - text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - else: - process_operation(operator, operands) - if visitor_operand_after is not None: - visitor_operand_after(operator, operands, cm_matrix, tm_matrix) - output += text # just in case - if text != "" and visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - return output + # Use the new TextExtraction class + extractor = TextExtraction( + self, # Pass the page object for font width maps + obj, + pdf, + orientations, + space_width, + content_key, + visitor_operand_before, + visitor_operand_after, + visitor_text, + ) + return extractor.extract_text() def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]: """ diff --git a/pypdf/_text_extractor.py b/pypdf/_text_extractor.py new file mode 100644 index 000000000..2f7ccfa56 --- /dev/null +++ b/pypdf/_text_extractor.py @@ -0,0 +1,557 @@ +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import math +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast + +from ._cmap import ( + build_char_map, + build_font_width_map, + compute_font_width, + get_actual_str_key, + unknown_char_map, +) +from ._text_extraction import ( + OrientationNotFoundError, + crlf_space_check, + get_display_str, + get_text_operands, + mult, +) +from ._utils import logger_warning +from .constants import PageAttributes as PG +from .generic import ( + ContentStream, + DictionaryObject, + FloatObject, + NameObject, + NumberObject, + TextStringObject, +) + + +class TextExtraction: + """ + A class to handle PDF text extraction operations. + + This class encapsulates all the state and operations needed for extracting + text from PDF content streams, replacing the nested functions and nonlocal + variables in the original implementation. + """ + + def __init__( + self, + page_obj: Any, # PageObject reference + obj: Any, + pdf: Any, + orientations: Tuple[int, ...] = (0, 90, 180, 270), + space_width: float = 200.0, + content_key: Optional[str] = PG.CONTENTS, + visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + ) -> None: + """Initialize the text extraction with parameters and state.""" + self.page_obj = page_obj # Reference to the PageObject for font width maps + self.obj = obj + self.pdf = pdf + self.orientations = orientations + self.space_width = space_width + self.content_key = content_key + self.visitor_operand_before = visitor_operand_before + self.visitor_operand_after = visitor_operand_after + self.visitor_text = visitor_text + + # Text state + self.text: str = "" + self.output: str = "" + self.rtl_dir: bool = False # right-to-left + + # Matrix state + self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + self.cm_stack: List[Tuple[Any, ...]] = [] + + # Previous matrices for tracking changes + self.cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + self.tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + + # Memo matrices for visitor callbacks + self.memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + self.memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + + # Font and text scaling state + self.char_scale: float = 1.0 + self.space_scale: float = 1.0 + self._space_width: float = 500.0 # will be set correctly at first Tf + self.TL: float = 0.0 + self.font_size: float = 12.0 # init just in case + + # Character map state + self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = ( + "charmap", + {}, + "NotInitialized", + None, + ) # (encoding, CMAP, font resource name, font) + + # Actual string size tracking + self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} + + # Character maps for fonts + self.cmaps: Dict[ + str, + Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject], + ] = {} + + # Resources dictionary + self.resources_dict: Optional[DictionaryObject] = None + + # Operation handler mapping + self.operation_handlers = { + b"BT": self._handle_operation_begin_text, + b"ET": self._handle_operation_end_text, + b"q": self._handle_operation_save_graphics_state, + b"Q": self._handle_operation_restore_graphics_state, + b"cm": self._handle_operation_modify_current_matrix, + b"Tz": self._handle_operation_horizontal_text_scaling, + b"Tw": self._handle_operation_word_spacing, + b"TL": self._handle_operation_text_leading, + b"Tf": self._handle_operation_set_font, + b"Td": self._handle_operation_move_text_position, + b"Tm": self._handle_operation_set_text_matrix, + b"T*": self._handle_operation_move_to_next_line, + b"Tj": self._handle_operation_show_text, + } + + def extract_text(self) -> str: + """Extract text from the PDF object.""" + # Initialize resources and content + if not self._initialize_resources(): + return "" + + content = self._get_content() + if content is None: + return "" + + # Process all operations in the content stream + for operands, operator in content.operations: + self._process_operation(operator, operands) + + # Add any remaining text to output + self.output += self.text + if self.text != "" and self.visitor_text is not None: + self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) + + return self.output + + def _initialize_resources(self) -> bool: + """Initialize resources dictionary and character maps.""" + try: + objr = self.obj + while NameObject(PG.RESOURCES) not in objr: + # /Resources can be inherited so we look to parents + objr = objr["/Parent"].get_object() + # If no parents then no /Resources will be available, + # so an exception will be raised + self.resources_dict = cast("DictionaryObject", objr[PG.RESOURCES]) + except Exception: + # No resources means no text is possible (no font) + return False + + if "/Font" in self.resources_dict and (font := self.resources_dict["/Font"]): + for f in cast("DictionaryObject", font): + self.cmaps[f] = build_char_map(f, self.space_width, self.obj) + + return True + + def _get_content(self) -> Optional[ContentStream]: + """Get the content stream from the object.""" + try: + content = self.obj[self.content_key].get_object() if isinstance(self.content_key, str) else self.obj + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf, "bytes") + return content + except (AttributeError, KeyError): + return None + + def _process_operation(self, operator: bytes, operands: List[Any]) -> None: + """Process a single PDF operation.""" + if self.visitor_operand_before is not None: + self.visitor_operand_before(operator, operands, self.cm_matrix, self.tm_matrix) + + # Handle compound operators + if operator == b"'": + self._handle_operation_move_to_next_line([]) + self._handle_operation_show_text(operands) + elif operator == b'"': + self._handle_operation_word_spacing([operands[0]]) + self._handle_operation_character_spacing([operands[1]]) + self._handle_operation_move_to_next_line([]) + self._handle_operation_show_text(operands[2:]) + elif operator == b"TJ": + self._handle_operation_show_text_with_positioning(operands) + elif operator == b"TD": + self._handle_operation_text_leading([-operands[1]]) + self._handle_operation_move_text_position(operands) + elif operator == b"Do": + self._handle_operation_do(operands) + else: + # Use the operation handler mapping + handler = self.operation_handlers.get(operator) + if handler: + handler(operands) + + if self.visitor_operand_after is not None: + self.visitor_operand_after(operator, operands, self.cm_matrix, self.tm_matrix) + + def _compute_str_widths(self, str_widths: float) -> float: + """Compute string widths.""" + return str_widths / 1000 + + def _flush_text(self) -> None: + """Flush current text to output.""" + self.output += self.text + if self.visitor_text is not None: + self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) + self.text = "" + self.memo_cm = self.cm_matrix.copy() + self.memo_tm = self.tm_matrix.copy() + + # Operation handlers + def _handle_operation_begin_text(self, operands: List[Any]) -> None: + """Handle BT (Begin Text) operation.""" + self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + self._flush_text() + + def _handle_operation_end_text(self, operands: List[Any]) -> None: + """Handle ET (End Text) operation.""" + self._flush_text() + + def _handle_operation_save_graphics_state(self, operands: List[Any]) -> None: + """Handle q (Save graphics state) operation.""" + self.cm_stack.append( + ( + self.cm_matrix, + self.cmap, + self.font_size, + self.char_scale, + self.space_scale, + self._space_width, + self.TL, + ), + ) + + def _handle_operation_restore_graphics_state(self, operands: List[Any]) -> None: + """Handle Q (Restore graphics state) operation.""" + try: + ( + self.cm_matrix, + self.cmap, + self.font_size, + self.char_scale, + self.space_scale, + self._space_width, + self.TL, + ) = self.cm_stack.pop() + except Exception: + self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + + def _handle_operation_modify_current_matrix(self, operands: List[Any]) -> None: + """Handle cm (Modify current matrix) operation.""" + self._flush_text() + try: + self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix) + except Exception: + self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + self.memo_cm = self.cm_matrix.copy() + self.memo_tm = self.tm_matrix.copy() + + def _handle_operation_horizontal_text_scaling(self, operands: List[Any]) -> None: + """Handle Tz (Set horizontal text scaling) operation.""" + self.char_scale = float(operands[0]) / 100 if operands else 1.0 + + def _handle_operation_word_spacing(self, operands: List[Any]) -> None: + """Handle Tw (Set word spacing) operation.""" + self.space_scale = 1.0 + float(operands[0] if operands else 0.0) + + def _handle_operation_character_spacing(self, operands: List[Any]) -> None: + """Handle Tc (Set character spacing) operation.""" + # This is a placeholder for character spacing handling + + def _handle_operation_text_leading(self, operands: List[Any]) -> None: + """Handle TL (Set Text Leading) operation.""" + scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2) + self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x + + def _handle_operation_set_font(self, operands: List[Any]) -> None: + """Handle Tf (Set font size) operation.""" + if self.text != "": + self._flush_text() + + try: + # char_map_tuple: font_type, + # float(sp_width / 2), + # encoding, + # map_dict, + # font_dict (describes the font) + char_map_tuple = self.cmaps[operands[0]] + # current cmap: encoding, + # map_dict, + # font resource name (internal name, not the real font name), + # font_dict + self.cmap = ( + char_map_tuple[2], + char_map_tuple[3], + operands[0], + char_map_tuple[4], + ) + self._space_width = char_map_tuple[1] + except KeyError: # font not found + self.cmap = ( + unknown_char_map[2], + unknown_char_map[3], + f"???{operands[0]}", + None, + ) + self._space_width = unknown_char_map[1] + + try: + self.font_size = float(operands[1]) + except Exception: + pass # keep previous size + + def _handle_operation_move_text_position(self, operands: List[Any]) -> None: + """Handle Td (Move text position) operation.""" + # A special case is a translating only tm: + # tm = [1, 0, 0, 1, e, f] + # i.e. tm[4] += tx, tm[5] += ty. + tx, ty = float(operands[0]), float(operands[1]) + self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2] + self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3] + str_widths = self._compute_str_widths(self._actual_str_size["str_widths"]) + self._actual_str_size["str_widths"] = 0.0 + self._handle_position_change(str_widths) + + def _handle_operation_set_text_matrix(self, operands: List[Any]) -> None: + """Handle Tm (Set text matrix) operation.""" + self.tm_matrix = [float(operand) for operand in operands[:6]] + str_widths = self._compute_str_widths(self._actual_str_size["str_widths"]) + self._actual_str_size["str_widths"] = 0.0 + self._handle_position_change(str_widths) + + def _handle_operation_move_to_next_line(self, operands: List[Any]) -> None: + """Handle T* (Move to next line) operation.""" + self.tm_matrix[4] -= self.TL * self.tm_matrix[2] + self.tm_matrix[5] -= self.TL * self.tm_matrix[3] + str_widths = self._compute_str_widths(self._actual_str_size["str_widths"]) + self._actual_str_size["str_widths"] = 0.0 + self._handle_position_change(str_widths) + + def _handle_operation_show_text(self, operands: List[Any]) -> None: + """Handle Tj (Show text) operation.""" + self.text, self.rtl_dir, self._actual_str_size = self._handle_tj( + self.text, + operands, + self.cm_matrix, + self.tm_matrix, + self.cmap, + self.orientations, + self.font_size, + self.rtl_dir, + self.visitor_text, + self._space_width, + self._actual_str_size, + ) + str_widths = self._compute_str_widths(self._actual_str_size["str_widths"]) + self._handle_position_change(str_widths) + + def _handle_operation_show_text_with_positioning(self, operands: List[Any]) -> None: + """Handle TJ (Show text with positioning) operation.""" + # The space width may be smaller than the font width, so the width should be 95%. + _confirm_space_width = self._space_width * 0.95 + if operands: + for op in operands[0]: + if isinstance(op, (str, bytes)): + self._handle_operation_show_text([op]) + if isinstance(op, (int, float, NumberObject, FloatObject)) and ( + abs(float(op)) >= _confirm_space_width and self.text and self.text[-1] != " " + ): + self._handle_operation_show_text([" "]) + + def _handle_operation_do(self, operands: List[Any]) -> None: + """Handle Do (Execute XObject) operation.""" + self._flush_text() + try: + if self.output and self.output[-1] != "\n": + self.output += "\n" + if self.visitor_text is not None: + self.visitor_text( + "\n", + self.memo_cm, + self.memo_tm, + self.cmap[3], + self.font_size, + ) + except IndexError: + pass + + try: + xobj = self.resources_dict["/XObject"] # type: ignore + if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore + # Extract text from XForm object + xform_extractor = TextExtraction( + self.page_obj, + xobj[operands[0]], # type: ignore + self.pdf, + self.orientations, + self.space_width, + None, # content_key = None for XForm objects + self.visitor_operand_before, + self.visitor_operand_after, + self.visitor_text, + ) + text = xform_extractor.extract_text() + self.output += text + if self.visitor_text is not None: + self.visitor_text( + text, + self.memo_cm, + self.memo_tm, + self.cmap[3], + self.font_size, + ) + except Exception as exception: + logger_warning( + f"Impossible to decode XFormObject {operands[0]}: {exception}", + __name__, + ) + finally: + self.text = "" + self.memo_cm = self.cm_matrix.copy() + self.memo_tm = self.tm_matrix.copy() + + def _handle_position_change(self, str_widths: float) -> None: + """Handle position changes for text positioning operations.""" + try: + self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check( + self.text, + (self.cm_prev, self.tm_prev), + (self.cm_matrix, self.tm_matrix), + (self.memo_cm, self.memo_tm), + self.cmap, + self.orientations, + self.output, + self.font_size, + self.visitor_text, + str_widths, + self._compute_str_widths(self._actual_str_size["space_width"]), + self._actual_str_size["str_height"], + ) + if self.text == "": + self.memo_cm = self.cm_matrix.copy() + self.memo_tm = self.tm_matrix.copy() + except OrientationNotFoundError: + return + + def _handle_tj( + self, + text: str, + operands: List[Union[str, TextStringObject]], + cm_matrix: List[float], + tm_matrix: List[float], + cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], + orientations: Tuple[int, ...], + font_size: float, + rtl_dir: bool, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], + space_width: float, + actual_str_size: Dict[str, float], + ) -> Tuple[str, bool, Dict[str, float]]: + """Handle text showing operations.""" + text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations) + if is_str_operands: + text += text_operands + else: + text, rtl_dir = get_display_str( + text, + cm_matrix, + tm_matrix, # text matrix + cmap, + text_operands, + font_size, + rtl_dir, + visitor_text, + ) + + font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths( + cmap, text_operands, font_size, space_width, + ) + actual_str_size["str_widths"] += font_widths + + return text, rtl_dir, actual_str_size + + def _get_actual_font_widths( + self, + cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], + text_operands: str, + font_size: float, + space_width: float, + ) -> Tuple[float, float, float]: + """Get actual font widths for text operands.""" + font_widths: float = 0 + font_name: str = cmap[2] + + # Use the page object's font width maps + if font_name not in self.page_obj._font_width_maps: + if cmap[3] is None: + font_width_map: Dict[Any, float] = {} + space_char = " " + actual_space_width: float = space_width + font_width_map["default"] = actual_space_width * 2 + else: + space_char = get_actual_str_key(" ", cmap[0], cmap[1]) + font_width_map = build_font_width_map(cmap[3], space_width * 2) + actual_space_width = compute_font_width(font_width_map, space_char) + if actual_space_width == 0: + actual_space_width = space_width + self.page_obj._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width) + + font_width_map = self.page_obj._font_width_maps[font_name][0] + space_char = self.page_obj._font_width_maps[font_name][1] + actual_space_width = self.page_obj._font_width_maps[font_name][2] + + if text_operands: + for char in text_operands: + if char == space_char: + font_widths += actual_space_width + continue + font_widths += compute_font_width(font_width_map, char) + + return (font_widths * font_size, space_width * font_size, font_size) From f4e9285a521ce32457bfca69e30df70496930c16 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Jun 2025 21:39:08 +0200 Subject: [PATCH 2/5] Remove code duplication --- pypdf/_page.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 9a429a035..e079dd3c3 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -51,9 +51,6 @@ from ._cmap import ( build_char_map, - build_font_width_map, - compute_font_width, - get_actual_str_key, ) from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( @@ -1656,42 +1653,6 @@ def _debug_for_extract(self) -> str: # pragma: no cover out += "No Font\n" return out - def _get_actual_font_widths( - self, - cmap: Tuple[ - Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] - ], - text_operands: str, - font_size: float, - space_width: float - ) -> Tuple[float, float, float]: - font_widths: float = 0 - font_name: str = cmap[2] - if font_name not in self._font_width_maps: - if cmap[3] is None: - font_width_map: Dict[Any, float] = {} - space_char = " " - actual_space_width: float = space_width - font_width_map["default"] = actual_space_width * 2 - else: - space_char = get_actual_str_key(" ", cmap[0], cmap[1]) - font_width_map = build_font_width_map(cmap[3], space_width * 2) - actual_space_width = compute_font_width(font_width_map, space_char) - if actual_space_width == 0: - actual_space_width = space_width - self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width) - font_width_map = self._font_width_maps[font_name][0] - space_char = self._font_width_maps[font_name][1] - actual_space_width = self._font_width_maps[font_name][2] - - if text_operands: - for char in text_operands: - if char == space_char: - font_widths += actual_space_width - continue - font_widths += compute_font_width(font_width_map, char) - return (font_widths * font_size, space_width * font_size, font_size) - def _extract_text( self, obj: Any, From 1ed2e380da8b8f90a05bc159bca1fc4f61da0ad3 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 29 Jun 2025 21:43:45 +0200 Subject: [PATCH 3/5] Move _text_extractor into _text_extraction --- pypdf/_page.py | 2 +- .../{ => _text_extraction}/_text_extractor.py | 27 ++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) rename pypdf/{ => _text_extraction}/_text_extractor.py (98%) diff --git a/pypdf/_page.py b/pypdf/_page.py index e079dd3c3..89a159f1c 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -56,7 +56,7 @@ from ._text_extraction import ( _layout_mode, ) -from ._text_extractor import TextExtraction +from ._text_extraction._text_extractor import TextExtraction from ._utils import ( CompressedTransformationMatrix, TransformationMatrixType, diff --git a/pypdf/_text_extractor.py b/pypdf/_text_extraction/_text_extractor.py similarity index 98% rename from pypdf/_text_extractor.py rename to pypdf/_text_extraction/_text_extractor.py index 2f7ccfa56..cac68e1b5 100644 --- a/pypdf/_text_extractor.py +++ b/pypdf/_text_extraction/_text_extractor.py @@ -30,23 +30,16 @@ import math from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast -from ._cmap import ( +from .._cmap import ( build_char_map, build_font_width_map, compute_font_width, get_actual_str_key, unknown_char_map, ) -from ._text_extraction import ( - OrientationNotFoundError, - crlf_space_check, - get_display_str, - get_text_operands, - mult, -) -from ._utils import logger_warning -from .constants import PageAttributes as PG -from .generic import ( +from .._utils import logger_warning +from ..constants import PageAttributes as PG +from ..generic import ( ContentStream, DictionaryObject, FloatObject, @@ -54,6 +47,13 @@ NumberObject, TextStringObject, ) +from . import ( + OrientationNotFoundError, + crlf_space_check, + get_display_str, + get_text_operands, + mult, +) class TextExtraction: @@ -511,7 +511,10 @@ def _handle_tj( ) font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths( - cmap, text_operands, font_size, space_width, + cmap, + text_operands, + font_size, + space_width, ) actual_str_size["str_widths"] += font_widths From 2c643d8a27501ddfbba253aef73630b615c8cd25 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Fri, 4 Jul 2025 22:01:52 +0200 Subject: [PATCH 4/5] Reduce diff --- pypdf/_text_extraction/_text_extractor.py | 81 ++++++++++++----------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/pypdf/_text_extraction/_text_extractor.py b/pypdf/_text_extraction/_text_extractor.py index eb65fca60..8f6b66a1e 100644 --- a/pypdf/_text_extraction/_text_extractor.py +++ b/pypdf/_text_extraction/_text_extractor.py @@ -480,46 +480,6 @@ def _handle_position_change(self, str_widths: float) -> None: except OrientationNotFoundError: return - def _handle_tj( - self, - text: str, - operands: List[Union[str, TextStringObject]], - cm_matrix: List[float], - tm_matrix: List[float], - cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], - orientations: Tuple[int, ...], - font_size: float, - rtl_dir: bool, - visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], - space_width: float, - actual_str_size: Dict[str, float], - ) -> Tuple[str, bool, Dict[str, float]]: - """Handle text showing operations.""" - text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations) - if is_str_operands: - text += text_operands - else: - text, rtl_dir = get_display_str( - text, - cm_matrix, - tm_matrix, # text matrix - cmap, - text_operands, - font_size, - rtl_dir, - visitor_text, - ) - - font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths( - cmap, - text_operands, - font_size, - space_width, - ) - actual_str_size["str_widths"] += font_widths - - return text, rtl_dir, actual_str_size - def _get_actual_font_widths( self, cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], @@ -559,3 +519,44 @@ def _get_actual_font_widths( return (font_widths * font_size, space_width * font_size, font_size) + + + def _handle_tj( + self, + text: str, + operands: List[Union[str, TextStringObject]], + cm_matrix: List[float], + tm_matrix: List[float], + cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], + orientations: Tuple[int, ...], + font_size: float, + rtl_dir: bool, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], + space_width: float, + actual_str_size: Dict[str, float], + ) -> Tuple[str, bool, Dict[str, float]]: + """Handle text showing operations.""" + text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations) + if is_str_operands: + text += text_operands + else: + text, rtl_dir = get_display_str( + text, + cm_matrix, + tm_matrix, # text matrix + cmap, + text_operands, + font_size, + rtl_dir, + visitor_text, + ) + + font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths( + cmap, + text_operands, + font_size, + space_width, + ) + actual_str_size["str_widths"] += font_widths + + return text, rtl_dir, actual_str_size From c93aadd31c39b78159954bb983be38c405331d99 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Fri, 4 Jul 2025 22:08:15 +0200 Subject: [PATCH 5/5] Reduce diff --- pypdf/_text_extraction/_text_extractor.py | 90 +++++++++++------------ 1 file changed, 43 insertions(+), 47 deletions(-) diff --git a/pypdf/_text_extraction/_text_extractor.py b/pypdf/_text_extraction/_text_extractor.py index 8f6b66a1e..891ae058f 100644 --- a/pypdf/_text_extraction/_text_extractor.py +++ b/pypdf/_text_extraction/_text_extractor.py @@ -81,55 +81,60 @@ def __init__( self.page_obj = page_obj # Reference to the PageObject for font width maps self.obj = obj self.pdf = pdf - self.orientations = orientations + self.space_width = space_width self.content_key = content_key self.visitor_operand_before = visitor_operand_before self.visitor_operand_after = visitor_operand_after - self.visitor_text = visitor_text - - # Text state - self.text: str = "" - self.output: str = "" - self.rtl_dir: bool = False # right-to-left # Matrix state self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - self.cm_stack: List[Tuple[Any, ...]] = [] - - # Previous matrices for tracking changes + self.cm_stack: List[ + Tuple[ + List[float], + Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], + float, + float, + float, + float, + float, + ] + ] = [] + + # Store the last modified matrices; can be an intermediate position self.cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - # Memo matrices for visitor callbacks + # Store the position at the beginning of building the text self.memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] # Font and text scaling state - self.char_scale: float = 1.0 - self.space_scale: float = 1.0 + self.char_scale = 1.0 + self.space_scale = 1.0 self._space_width: float = 500.0 # will be set correctly at first Tf - self.TL: float = 0.0 - self.font_size: float = 12.0 # init just in case + self.TL = 0.0 + self.font_size = 12.0 # init just in case + + # Text state + self.text: str = "" + self.output: str = "" + self.rtl_dir: bool = False # right-to-left - # Character map state self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = ( "charmap", {}, "NotInitialized", None, ) # (encoding, CMAP, font resource name, font) + self.orientations: Tuple[int, ...] = orientations + self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None + self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {} # Actual string size tracking self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} - # Character maps for fonts - self.cmaps: Dict[ - str, - Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject], - ] = {} - # Resources dictionary self.resources_dict: Optional[DictionaryObject] = None @@ -231,8 +236,7 @@ def _process_operation(self, operator: bytes, operands: List[Any]) -> None: if self.visitor_operand_after is not None: self.visitor_operand_after(operator, operands, self.cm_matrix, self.tm_matrix) - def _compute_str_widths(self, str_widths: float) -> float: - """Compute string widths.""" + def compute_str_widths(self, str_widths: float) -> float: return str_widths / 1000 def _flush_text(self) -> None: @@ -355,14 +359,14 @@ def _handle_operation_move_text_position(self, operands: List[Any]) -> None: tx, ty = float(operands[0]), float(operands[1]) self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2] self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3] - str_widths = self._compute_str_widths(self._actual_str_size["str_widths"]) + str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) self._actual_str_size["str_widths"] = 0.0 self._handle_position_change(str_widths) def _handle_operation_set_text_matrix(self, operands: List[Any]) -> None: """Handle Tm (Set text matrix) operation.""" self.tm_matrix = [float(operand) for operand in operands[:6]] - str_widths = self._compute_str_widths(self._actual_str_size["str_widths"]) + str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) self._actual_str_size["str_widths"] = 0.0 self._handle_position_change(str_widths) @@ -370,7 +374,7 @@ def _handle_operation_move_to_next_line(self, operands: List[Any]) -> None: """Handle T* (Move to next line) operation.""" self.tm_matrix[4] -= self.TL * self.tm_matrix[2] self.tm_matrix[5] -= self.TL * self.tm_matrix[3] - str_widths = self._compute_str_widths(self._actual_str_size["str_widths"]) + str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) self._actual_str_size["str_widths"] = 0.0 self._handle_position_change(str_widths) @@ -389,7 +393,7 @@ def _handle_operation_show_text(self, operands: List[Any]) -> None: self._space_width, self._actual_str_size, ) - str_widths = self._compute_str_widths(self._actual_str_size["str_widths"]) + str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) self._handle_position_change(str_widths) def _handle_operation_show_text_with_positioning(self, operands: List[Any]) -> None: @@ -471,7 +475,7 @@ def _handle_position_change(self, str_widths: float) -> None: self.font_size, self.visitor_text, str_widths, - self._compute_str_widths(self._actual_str_size["space_width"]), + self.compute_str_widths(self._actual_str_size["space_width"]), self._actual_str_size["str_height"], ) if self.text == "": @@ -482,16 +486,15 @@ def _handle_position_change(self, str_widths: float) -> None: def _get_actual_font_widths( self, - cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], text_operands: str, font_size: float, space_width: float, ) -> Tuple[float, float, float]: - """Get actual font widths for text operands.""" font_widths: float = 0 font_name: str = cmap[2] - - # Use the page object's font width maps if font_name not in self.page_obj._font_width_maps: if cmap[3] is None: font_width_map: Dict[Any, float] = {} @@ -505,7 +508,6 @@ def _get_actual_font_widths( if actual_space_width == 0: actual_space_width = space_width self.page_obj._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width) - font_width_map = self.page_obj._font_width_maps[font_name][0] space_char = self.page_obj._font_width_maps[font_name][1] actual_space_width = self.page_obj._font_width_maps[font_name][2] @@ -516,18 +518,17 @@ def _get_actual_font_widths( font_widths += actual_space_width continue font_widths += compute_font_width(font_width_map, char) - return (font_widths * font_size, space_width * font_size, font_size) - - def _handle_tj( self, text: str, operands: List[Union[str, TextStringObject]], cm_matrix: List[float], tm_matrix: List[float], - cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], orientations: Tuple[int, ...], font_size: float, rtl_dir: bool, @@ -535,8 +536,8 @@ def _handle_tj( space_width: float, actual_str_size: Dict[str, float], ) -> Tuple[str, bool, Dict[str, float]]: - """Handle text showing operations.""" - text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations) + text_operands, is_str_operands = get_text_operands( + operands, cm_matrix, tm_matrix, cmap, orientations) if is_str_operands: text += text_operands else: @@ -550,13 +551,8 @@ def _handle_tj( rtl_dir, visitor_text, ) - - font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths( - cmap, - text_operands, - font_size, - space_width, - ) + font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = ( + self._get_actual_font_widths(cmap, text_operands, font_size, space_width)) actual_str_size["str_widths"] += font_widths return text, rtl_dir, actual_str_size