From abb5130723fa82aa3ee5f55e44f475c6dbe75629 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Jun 2025 15:24:04 +0200
Subject: [PATCH 1/5] MAINT: Refactor _page.py

This is an experiment to see how well GitHub Copilot works.
---
 pypdf/__init__.py        |   3 +-
 pypdf/_page.py           | 383 +--------------------------
 pypdf/_text_extractor.py | 557 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 573 insertions(+), 370 deletions(-)
 create mode 100644 pypdf/_text_extractor.py

diff --git a/pypdf/__init__.py b/pypdf/__init__.py
index 87accc401..d2812817e 100644
--- a/pypdf/__init__.py
+++ b/pypdf/__init__.py
@@ -11,8 +11,9 @@
 from ._doc_common import DocumentInformation
 from ._encryption import PasswordType
 from ._merger import PdfMerger
-from ._page import PageObject, Transformation, mult
+from ._page import PageObject, Transformation
 from ._reader import PdfReader
+from ._text_extraction import mult
 from ._version import __version__
 from ._writer import ObjectDeletionFlag, PdfWriter
 from .constants import ImageType
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 59f7bceca..9a429a035 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -54,17 +54,12 @@
     build_font_width_map,
     compute_font_width,
     get_actual_str_key,
-    unknown_char_map,
 )
 from ._protocols import PdfCommonDocProtocol
 from ._text_extraction import (
-    OrientationNotFoundError,
     _layout_mode,
-    crlf_space_check,
-    get_display_str,
-    get_text_operands,
-    mult,
 )
+from ._text_extractor import TextExtraction
 from ._utils import (
     CompressedTransformationMatrix,
     TransformationMatrixType,
@@ -92,7 +87,6 @@
     PdfObject,
     RectangleObject,
     StreamObject,
-    TextStringObject,
     is_null_or_none,
 )
 
@@ -1698,42 +1692,6 @@ def _get_actual_font_widths(
                 font_widths += compute_font_width(font_width_map, char)
         return (font_widths * font_size, space_width * font_size, font_size)
 
-    def _handle_tj(
-        self,
-        text: str,
-        operands: List[Union[str, TextStringObject]],
-        cm_matrix: List[float],
-        tm_matrix: List[float],
-        cmap: Tuple[
-            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
-        ],
-        orientations: Tuple[int, ...],
-        font_size: float,
-        rtl_dir: bool,
-        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
-        space_width: float,
-        actual_str_size: Dict[str, float]
-    ) -> Tuple[str, bool, Dict[str, float]]:
-        text_operands, is_str_operands = get_text_operands(
-            operands, cm_matrix, tm_matrix, cmap, orientations)
-        if is_str_operands:
-            text += text_operands
-        else:
-            text, rtl_dir = get_display_str(
-                text,
-                cm_matrix,
-                tm_matrix,  # text matrix
-                cmap,
-                text_operands,
-                font_size,
-                rtl_dir,
-                visitor_text)
-        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
-            self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
-        actual_str_size["str_widths"] += font_widths
-
-        return text, rtl_dir, actual_str_size
-
     def _extract_text(
         self,
         obj: Any,
@@ -1754,332 +1712,19 @@ def _extract_text(
                 default = "/Content"
 
         """
-        text: str = ""
-        output: str = ""
-        rtl_dir: bool = False  # right-to-left
-        cmaps: Dict[
-            str,
-            Tuple[
-                str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
-            ],
-        ] = {}
-
-        try:
-            objr = obj
-            while NameObject(PG.RESOURCES) not in objr:
-                # /Resources can be inherited so we look to parents
-                objr = objr["/Parent"].get_object()
-                # If no parents then no /Resources will be available,
-                # so an exception will be raised
-            resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
-        except Exception:
-            # No resources means no text is possible (no font); we consider the
-            # file as not damaged, no need to check for TJ or Tj
-            return ""
-
-        if "/Font" in resources_dict and (font := resources_dict["/Font"]):
-            for f in cast(DictionaryObject, font):
-                cmaps[f] = build_char_map(f, space_width, obj)
-        cmap: Tuple[
-            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
-        ] = (
-            "charmap",
-            {},
-            "NotInitialized",
-            None,
-        )  # (encoding, CMAP, font resource name, font)
-
-        try:
-            content = (
-                obj[content_key].get_object() if isinstance(content_key, str) else obj
-            )
-            if not isinstance(content, ContentStream):
-                content = ContentStream(content, pdf, "bytes")
-        except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
-            return ""
-        # We check all strings are TextStringObjects. ByteStringObjects
-        # are strings where the byte->string encoding was unknown, so adding
-        # them to the text here would be gibberish.
-
-        cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-        tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-        cm_stack = []
-
-        # Store the last modified matrices; can be an intermediate position
-        cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-        tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-
-        # Store the position at the beginning of building the text
-        memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-        memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-
-        char_scale = 1.0
-        space_scale = 1.0
-        _space_width: float = 500.0  # will be set correctly at first Tf
-        _actual_str_size: Dict[str, float] = {
-            "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0
-        }  # will be set to string length calculation result
-        TL = 0.0
-        font_size = 12.0  # init just in case of
-
-        def compute_str_widths(str_widths: float) -> float:
-            return str_widths / 1000
-
-        def process_operation(operator: bytes, operands: List[Any]) -> None:
-            nonlocal cm_matrix, tm_matrix, cm_stack, cm_prev, tm_prev, memo_cm, memo_tm
-            nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
-            nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size
-
-            str_widths: float = 0.0
-
-            # Table 5.4 page 405
-            if operator == b"BT":  # Begin Text
-                tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-                # Flush text:
-                output += text
-                if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
-                text = ""
-                memo_cm = cm_matrix.copy()
-                memo_tm = tm_matrix.copy()
-                return
-            if operator == b"ET":  # End Text
-                # Flush text:
-                output += text
-                if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
-                text = ""
-                memo_cm = cm_matrix.copy()
-                memo_tm = tm_matrix.copy()
-
-            # Table 4.7 "Graphics state operators", page 219
-            # cm_matrix calculation is reserved for later
-            elif operator == b"q":  # Save graphics state
-                cm_stack.append(
-                    (
-                        cm_matrix,
-                        cmap,
-                        font_size,
-                        char_scale,
-                        space_scale,
-                        _space_width,
-                        TL,
-                    )
-                )
-            elif operator == b"Q":  # Restore graphics state
-                try:
-                    (
-                        cm_matrix,
-                        cmap,
-                        font_size,
-                        char_scale,
-                        space_scale,
-                        _space_width,
-                        TL,
-                    ) = cm_stack.pop()
-                except Exception:
-                    cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-            elif operator == b"cm":  # Modify current matrix
-                output += text
-                if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
-                text = ""
-                try:
-                    cm_matrix = mult(
-                        [float(operand) for operand in operands[:6]],
-                        cm_matrix
-                    )
-                except Exception:
-                    cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-                memo_cm = cm_matrix.copy()
-                memo_tm = tm_matrix.copy()
-
-            # Table 5.2 page 398
-            elif operator == b"Tz":  # Set horizontal text scaling
-                char_scale = float(operands[0]) / 100 if operands else 1.0
-            elif operator == b"Tw":  # Set word spacing
-                space_scale = 1.0 + float(operands[0] if operands else 0.0)
-            elif operator == b"TL":  # Set Text Leading
-                scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2)
-                TL = float(operands[0] if operands else 0.0) * font_size * scale_x
-            elif operator == b"Tf":  # Set font size
-                if text != "":
-                    output += text  # .translate(cmap)
-                    if visitor_text is not None:
-                        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
-                text = ""
-                memo_cm = cm_matrix.copy()
-                memo_tm = tm_matrix.copy()
-                try:
-                    # char_map_tuple: font_type,
-                    #                 float(sp_width / 2),
-                    #                 encoding,
-                    #                 map_dict,
-                    #                 font_dict (describes the font)
-                    char_map_tuple = cmaps[operands[0]]
-                    # current cmap: encoding,
-                    #               map_dict,
-                    #               font resource name (internal name, not the real font name),
-                    #               font_dict
-                    cmap = (
-                        char_map_tuple[2],
-                        char_map_tuple[3],
-                        operands[0],
-                        char_map_tuple[4],
-                    )
-                    _space_width = char_map_tuple[1]
-                except KeyError:  # font not found
-                    cmap = (
-                        unknown_char_map[2],
-                        unknown_char_map[3],
-                        f"???{operands[0]}",
-                        None,
-                    )
-                    _space_width = unknown_char_map[1]
-                try:
-                    font_size = float(operands[1])
-                except Exception:
-                    pass  # keep previous size
-            # Table 5.5 page 406
-            elif operator == b"Td":  # Move text position
-                # A special case is a translating only tm:
-                # tm = [1, 0, 0, 1, e, f]
-                # i.e. tm[4] += tx, tm[5] += ty.
-                tx, ty = float(operands[0]), float(operands[1])
-                tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
-                tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
-                str_widths = compute_str_widths(_actual_str_size["str_widths"])
-                _actual_str_size["str_widths"] = 0.0
-            elif operator == b"Tm":  # Set text matrix
-                tm_matrix = [float(operand) for operand in operands[:6]]
-                str_widths = compute_str_widths(_actual_str_size["str_widths"])
-                _actual_str_size["str_widths"] = 0.0
-            elif operator == b"T*":  # Move to next line
-                tm_matrix[4] -= TL * tm_matrix[2]
-                tm_matrix[5] -= TL * tm_matrix[3]
-                str_widths = compute_str_widths(_actual_str_size["str_widths"])
-                _actual_str_size["str_widths"] = 0.0
-            elif operator == b"Tj":  # Show text
-                text, rtl_dir, _actual_str_size = self._handle_tj(
-                    text,
-                    operands,
-                    cm_matrix,
-                    tm_matrix,
-                    cmap,
-                    orientations,
-                    font_size,
-                    rtl_dir,
-                    visitor_text,
-                    _space_width,
-                    _actual_str_size,
-                )
-            else:
-                return
-
-            if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
-                try:
-                    text, output, cm_prev, tm_prev = crlf_space_check(
-                        text,
-                        (cm_prev, tm_prev),
-                        (cm_matrix, tm_matrix),
-                        (memo_cm, memo_tm),
-                        cmap,
-                        orientations,
-                        output,
-                        font_size,
-                        visitor_text,
-                        str_widths,
-                        compute_str_widths(_actual_str_size["space_width"]),
-                        _actual_str_size["str_height"]
-                    )
-                    if text == "":
-                        memo_cm = cm_matrix.copy()
-                        memo_tm = tm_matrix.copy()
-                except OrientationNotFoundError:
-                    return
-
-        for operands, operator in content.operations:
-            if visitor_operand_before is not None:
-                visitor_operand_before(operator, operands, cm_matrix, tm_matrix)
-            # Multiple operators are handled here
-            if operator == b"'":
-                process_operation(b"T*", [])
-                process_operation(b"Tj", operands)
-            elif operator == b'"':
-                process_operation(b"Tw", [operands[0]])
-                process_operation(b"Tc", [operands[1]])
-                process_operation(b"T*", [])
-                process_operation(b"Tj", operands[2:])
-            elif operator == b"TJ":
-                # The space width may be smaller than the font width, so the width should be 95%.
-                _confirm_space_width = _space_width * 0.95
-                if operands:
-                    for op in operands[0]:
-                        if isinstance(op, (str, bytes)):
-                            process_operation(b"Tj", [op])
-                        if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                            abs(float(op)) >= _confirm_space_width
-                            and text
-                            and text[-1] != " "
-                        ):
-                            process_operation(b"Tj", [" "])
-            elif operator == b"TD":
-                process_operation(b"TL", [-operands[1]])
-                process_operation(b"Td", operands)
-            elif operator == b"Do":
-                output += text
-                if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
-                try:
-                    if output[-1] != "\n":
-                        output += "\n"
-                        if visitor_text is not None:
-                            visitor_text(
-                                "\n",
-                                memo_cm,
-                                memo_tm,
-                                cmap[3],
-                                font_size,
-                            )
-                except IndexError:
-                    pass
-                try:
-                    xobj = resources_dict["/XObject"]
-                    if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
-                        text = self.extract_xform_text(
-                            xobj[operands[0]],  # type: ignore
-                            orientations,
-                            space_width,
-                            visitor_operand_before,
-                            visitor_operand_after,
-                            visitor_text,
-                        )
-                        output += text
-                        if visitor_text is not None:
-                            visitor_text(
-                                text,
-                                memo_cm,
-                                memo_tm,
-                                cmap[3],
-                                font_size,
-                            )
-                except Exception as exception:
-                    logger_warning(
-                        f"Impossible to decode XFormObject {operands[0]}: {exception}",
-                        __name__,
-                    )
-                finally:
-                    text = ""
-                    memo_cm = cm_matrix.copy()
-                    memo_tm = tm_matrix.copy()
-            else:
-                process_operation(operator, operands)
-            if visitor_operand_after is not None:
-                visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
-        output += text  # just in case
-        if text != "" and visitor_text is not None:
-            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
-        return output
+        # Use the new TextExtraction class
+        extractor = TextExtraction(
+            self,  # Pass the page object for font width maps
+            obj,
+            pdf,
+            orientations,
+            space_width,
+            content_key,
+            visitor_operand_before,
+            visitor_operand_after,
+            visitor_text,
+        )
+        return extractor.extract_text()
 
     def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
         """
diff --git a/pypdf/_text_extractor.py b/pypdf/_text_extractor.py
new file mode 100644
index 000000000..2f7ccfa56
--- /dev/null
+++ b/pypdf/_text_extractor.py
@@ -0,0 +1,557 @@
+# Copyright (c) 2006, Mathieu Fenniak
+# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+
+from ._cmap import (
+    build_char_map,
+    build_font_width_map,
+    compute_font_width,
+    get_actual_str_key,
+    unknown_char_map,
+)
+from ._text_extraction import (
+    OrientationNotFoundError,
+    crlf_space_check,
+    get_display_str,
+    get_text_operands,
+    mult,
+)
+from ._utils import logger_warning
+from .constants import PageAttributes as PG
+from .generic import (
+    ContentStream,
+    DictionaryObject,
+    FloatObject,
+    NameObject,
+    NumberObject,
+    TextStringObject,
+)
+
+
+class TextExtraction:
+    """
+    A class to handle PDF text extraction operations.
+
+    This class encapsulates all the state and operations needed for extracting
+    text from PDF content streams, replacing the nested functions and nonlocal
+    variables in the original implementation.
+    """
+
+    def __init__(
+        self,
+        page_obj: Any,  # PageObject reference
+        obj: Any,
+        pdf: Any,
+        orientations: Tuple[int, ...] = (0, 90, 180, 270),
+        space_width: float = 200.0,
+        content_key: Optional[str] = PG.CONTENTS,
+        visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
+        visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
+        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+    ) -> None:
+        """Initialize the text extraction with parameters and state."""
+        self.page_obj = page_obj  # Reference to the PageObject for font width maps
+        self.obj = obj
+        self.pdf = pdf
+        self.orientations = orientations
+        self.space_width = space_width
+        self.content_key = content_key
+        self.visitor_operand_before = visitor_operand_before
+        self.visitor_operand_after = visitor_operand_after
+        self.visitor_text = visitor_text
+
+        # Text state
+        self.text: str = ""
+        self.output: str = ""
+        self.rtl_dir: bool = False  # right-to-left
+
+        # Matrix state
+        self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        self.cm_stack: List[Tuple[Any, ...]] = []
+
+        # Previous matrices for tracking changes
+        self.cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        self.tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+
+        # Memo matrices for visitor callbacks
+        self.memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        self.memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+
+        # Font and text scaling state
+        self.char_scale: float = 1.0
+        self.space_scale: float = 1.0
+        self._space_width: float = 500.0  # will be set correctly at first Tf
+        self.TL: float = 0.0
+        self.font_size: float = 12.0  # init just in case
+
+        # Character map state
+        self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = (
+            "charmap",
+            {},
+            "NotInitialized",
+            None,
+        )  # (encoding, CMAP, font resource name, font)
+
+        # Actual string size tracking
+        self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}
+
+        # Character maps for fonts
+        self.cmaps: Dict[
+            str,
+            Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject],
+        ] = {}
+
+        # Resources dictionary
+        self.resources_dict: Optional[DictionaryObject] = None
+
+        # Operation handler mapping
+        self.operation_handlers = {
+            b"BT": self._handle_operation_begin_text,
+            b"ET": self._handle_operation_end_text,
+            b"q": self._handle_operation_save_graphics_state,
+            b"Q": self._handle_operation_restore_graphics_state,
+            b"cm": self._handle_operation_modify_current_matrix,
+            b"Tz": self._handle_operation_horizontal_text_scaling,
+            b"Tw": self._handle_operation_word_spacing,
+            b"TL": self._handle_operation_text_leading,
+            b"Tf": self._handle_operation_set_font,
+            b"Td": self._handle_operation_move_text_position,
+            b"Tm": self._handle_operation_set_text_matrix,
+            b"T*": self._handle_operation_move_to_next_line,
+            b"Tj": self._handle_operation_show_text,
+        }
+
+    def extract_text(self) -> str:
+        """Extract text from the PDF object."""
+        # Initialize resources and content
+        if not self._initialize_resources():
+            return ""
+
+        content = self._get_content()
+        if content is None:
+            return ""
+
+        # Process all operations in the content stream
+        for operands, operator in content.operations:
+            self._process_operation(operator, operands)
+
+        # Add any remaining text to output
+        self.output += self.text
+        if self.text != "" and self.visitor_text is not None:
+            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
+
+        return self.output
+
+    def _initialize_resources(self) -> bool:
+        """Initialize resources dictionary and character maps."""
+        try:
+            objr = self.obj
+            while NameObject(PG.RESOURCES) not in objr:
+                # /Resources can be inherited so we look to parents
+                objr = objr["/Parent"].get_object()
+                # If no parents then no /Resources will be available,
+                # so an exception will be raised
+            self.resources_dict = cast("DictionaryObject", objr[PG.RESOURCES])
+        except Exception:
+            # No resources means no text is possible (no font)
+            return False
+
+        if "/Font" in self.resources_dict and (font := self.resources_dict["/Font"]):
+            for f in cast("DictionaryObject", font):
+                self.cmaps[f] = build_char_map(f, self.space_width, self.obj)
+
+        return True
+
+    def _get_content(self) -> Optional[ContentStream]:
+        """Get the content stream from the object."""
+        try:
+            content = self.obj[self.content_key].get_object() if isinstance(self.content_key, str) else self.obj
+            if not isinstance(content, ContentStream):
+                content = ContentStream(content, self.pdf, "bytes")
+            return content
+        except (AttributeError, KeyError):
+            return None
+
+    def _process_operation(self, operator: bytes, operands: List[Any]) -> None:
+        """Process a single PDF operation."""
+        if self.visitor_operand_before is not None:
+            self.visitor_operand_before(operator, operands, self.cm_matrix, self.tm_matrix)
+
+        # Handle compound operators
+        if operator == b"'":
+            self._handle_operation_move_to_next_line([])
+            self._handle_operation_show_text(operands)
+        elif operator == b'"':
+            self._handle_operation_word_spacing([operands[0]])
+            self._handle_operation_character_spacing([operands[1]])
+            self._handle_operation_move_to_next_line([])
+            self._handle_operation_show_text(operands[2:])
+        elif operator == b"TJ":
+            self._handle_operation_show_text_with_positioning(operands)
+        elif operator == b"TD":
+            self._handle_operation_text_leading([-operands[1]])
+            self._handle_operation_move_text_position(operands)
+        elif operator == b"Do":
+            self._handle_operation_do(operands)
+        else:
+            # Use the operation handler mapping
+            handler = self.operation_handlers.get(operator)
+            if handler:
+                handler(operands)
+
+        if self.visitor_operand_after is not None:
+            self.visitor_operand_after(operator, operands, self.cm_matrix, self.tm_matrix)
+
+    def _compute_str_widths(self, str_widths: float) -> float:
+        """Compute string widths."""
+        return str_widths / 1000
+
+    def _flush_text(self) -> None:
+        """Flush current text to output."""
+        self.output += self.text
+        if self.visitor_text is not None:
+            self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
+        self.text = ""
+        self.memo_cm = self.cm_matrix.copy()
+        self.memo_tm = self.tm_matrix.copy()
+
+    # Operation handlers
+    def _handle_operation_begin_text(self, operands: List[Any]) -> None:
+        """Handle BT (Begin Text) operation."""
+        self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        self._flush_text()
+
+    def _handle_operation_end_text(self, operands: List[Any]) -> None:
+        """Handle ET (End Text) operation."""
+        self._flush_text()
+
+    def _handle_operation_save_graphics_state(self, operands: List[Any]) -> None:
+        """Handle q (Save graphics state) operation."""
+        self.cm_stack.append(
+            (
+                self.cm_matrix,
+                self.cmap,
+                self.font_size,
+                self.char_scale,
+                self.space_scale,
+                self._space_width,
+                self.TL,
+            ),
+        )
+
+    def _handle_operation_restore_graphics_state(self, operands: List[Any]) -> None:
+        """Handle Q (Restore graphics state) operation."""
+        try:
+            (
+                self.cm_matrix,
+                self.cmap,
+                self.font_size,
+                self.char_scale,
+                self.space_scale,
+                self._space_width,
+                self.TL,
+            ) = self.cm_stack.pop()
+        except Exception:
+            self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+
+    def _handle_operation_modify_current_matrix(self, operands: List[Any]) -> None:
+        """Handle cm (Modify current matrix) operation."""
+        self._flush_text()
+        try:
+            self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
+        except Exception:
+            self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        self.memo_cm = self.cm_matrix.copy()
+        self.memo_tm = self.tm_matrix.copy()
+
+    def _handle_operation_horizontal_text_scaling(self, operands: List[Any]) -> None:
+        """Handle Tz (Set horizontal text scaling) operation."""
+        self.char_scale = float(operands[0]) / 100 if operands else 1.0
+
+    def _handle_operation_word_spacing(self, operands: List[Any]) -> None:
+        """Handle Tw (Set word spacing) operation."""
+        self.space_scale = 1.0 + float(operands[0] if operands else 0.0)
+
+    def _handle_operation_character_spacing(self, operands: List[Any]) -> None:
+        """Handle Tc (Set character spacing) operation."""
+        # Intentional no-op: the pre-refactor code had no Tc branch and ignored it too.
+
+    def _handle_operation_text_leading(self, operands: List[Any]) -> None:
+        """Handle TL (Set Text Leading) operation."""
+        scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
+        self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x
+
+    def _handle_operation_set_font(self, operands: List[Any]) -> None:
+        """Handle Tf (select font and size): flush pending text, then switch cmap."""
+        if self.text != "":
+            self._flush_text()
+        # Re-anchor the memo matrices even when no text was pending; the
+        # pre-refactor Tf branch did this unconditionally.
+        self.memo_cm = self.cm_matrix.copy()
+        self.memo_tm = self.tm_matrix.copy()
+
+        try:
+            # char_map_tuple: (font_type, float(sp_width / 2), encoding,
+            #                  map_dict, font_dict (describes the font))
+            char_map_tuple = self.cmaps[operands[0]]
+            # current cmap: (encoding, map_dict, font resource name, font_dict);
+            # the resource name is the internal name, not the real font name
+            self.cmap = (
+                char_map_tuple[2],
+                char_map_tuple[3],
+                operands[0],
+                char_map_tuple[4],
+            )
+            self._space_width = char_map_tuple[1]
+        except KeyError:  # font not found
+            self.cmap = (
+                unknown_char_map[2],
+                unknown_char_map[3],
+                f"???{operands[0]}",
+                None,
+            )
+            self._space_width = unknown_char_map[1]
+
+        # A missing or non-numeric size operand leaves font_size untouched.
+        try:
+            self.font_size = float(operands[1])
+        except Exception:
+            pass  # keep previous size
+
+    def _handle_operation_move_text_position(self, operands: List[Any]) -> None:
+        """Handle Td (Move text position) operation."""
+        # A special case is a translating only tm:
+        # tm = [1, 0, 0, 1, e, f]
+        # i.e. tm[4] += tx, tm[5] += ty.
+        tx, ty = float(operands[0]), float(operands[1])
+        self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
+        self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
+        str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
+        self._actual_str_size["str_widths"] = 0.0
+        self._handle_position_change(str_widths)
+
+    def _handle_operation_set_text_matrix(self, operands: List[Any]) -> None:
+        """Handle Tm (Set text matrix) operation."""
+        self.tm_matrix = [float(operand) for operand in operands[:6]]
+        str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
+        self._actual_str_size["str_widths"] = 0.0
+        self._handle_position_change(str_widths)
+
+    def _handle_operation_move_to_next_line(self, operands: List[Any]) -> None:
+        """Handle T* (Move to next line) operation."""
+        self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
+        self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
+        str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
+        self._actual_str_size["str_widths"] = 0.0
+        self._handle_position_change(str_widths)
+
+    def _handle_operation_show_text(self, operands: List[Any]) -> None:
+        """Handle Tj (Show text) operation."""
+        self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
+            self.text,
+            operands,
+            self.cm_matrix,
+            self.tm_matrix,
+            self.cmap,
+            self.orientations,
+            self.font_size,
+            self.rtl_dir,
+            self.visitor_text,
+            self._space_width,
+            self._actual_str_size,
+        )
+        str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
+        self._handle_position_change(str_widths)
+
+    def _handle_operation_show_text_with_positioning(self, operands: List[Any]) -> None:
+        """Handle TJ (Show text with positioning): show each string, turn wide adjustments into a space."""
+        # An adjustment counts as a word gap once it reaches 95% of the current font's space width.
+        _confirm_space_width = self._space_width * 0.95
+        if operands:
+            for op in operands[0]:
+                if isinstance(op, (str, bytes)):
+                    self._handle_operation_show_text([op])
+                if isinstance(op, (int, float, NumberObject, FloatObject)) and (
+                    abs(float(op)) >= _confirm_space_width and self.text and self.text[-1] != " "
+                ):
+                    self._handle_operation_show_text([" "])  # only when text does not already end in a space
+
+    def _handle_operation_do(self, operands: List[Any]) -> None:
+        """Handle Do (Execute XObject): extract text from non-image XObjects and append it to output."""
+        self._flush_text()
+        try:
+            if self.output and self.output[-1] != "\n":  # end the current line before the XObject text
+                self.output += "\n"
+                if self.visitor_text is not None:
+                    self.visitor_text(
+                        "\n",
+                        self.memo_cm,
+                        self.memo_tm,
+                        self.cmap[3],
+                        self.font_size,
+                    )
+        except IndexError:
+            pass
+
+        try:
+            xobj = self.resources_dict["/XObject"]  # type: ignore
+            if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
+                # Run a nested extractor over the XObject, reusing this extractor's settings and visitors
+                xform_extractor = TextExtraction(
+                    self.page_obj,
+                    xobj[operands[0]],  # type: ignore
+                    self.pdf,
+                    self.orientations,
+                    self.space_width,
+                    None,  # content_key = None for XForm objects
+                    self.visitor_operand_before,
+                    self.visitor_operand_after,
+                    self.visitor_text,
+                )
+                text = xform_extractor.extract_text()
+                self.output += text
+                if self.visitor_text is not None:
+                    self.visitor_text(
+                        text,
+                        self.memo_cm,
+                        self.memo_tm,
+                        self.cmap[3],
+                        self.font_size,
+                    )
+        except Exception as exception:  # any failure is reported as a warning instead of being raised
+            logger_warning(
+                f"Impossible to decode XFormObject {operands[0]}: {exception}",
+                __name__,
+            )
+        finally:  # runs on success and on failure
+            self.text = ""
+            self.memo_cm = self.cm_matrix.copy()
+            self.memo_tm = self.tm_matrix.copy()
+
+    def _handle_position_change(self, str_widths: float) -> None:
+        """Handle position changes for text positioning operations."""
+        try:
+            self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
+                self.text,
+                (self.cm_prev, self.tm_prev),
+                (self.cm_matrix, self.tm_matrix),
+                (self.memo_cm, self.memo_tm),
+                self.cmap,
+                self.orientations,
+                self.output,
+                self.font_size,
+                self.visitor_text,
+                str_widths,
+                self._compute_str_widths(self._actual_str_size["space_width"]),
+                self._actual_str_size["str_height"],
+            )
+            if self.text == "":
+                self.memo_cm = self.cm_matrix.copy()
+                self.memo_tm = self.tm_matrix.copy()
+        except OrientationNotFoundError:
+            return
+
+    def _handle_tj(
+        self,
+        text: str,
+        operands: List[Union[str, TextStringObject]],
+        cm_matrix: List[float],
+        tm_matrix: List[float],
+        cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
+        orientations: Tuple[int, ...],
+        font_size: float,
+        rtl_dir: bool,
+        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+        space_width: float,
+        actual_str_size: Dict[str, float],
+    ) -> Tuple[str, bool, Dict[str, float]]:
+        """Handle text showing operations."""
+        text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations)
+        if is_str_operands:
+            text += text_operands
+        else:
+            text, rtl_dir = get_display_str(
+                text,
+                cm_matrix,
+                tm_matrix,  # text matrix
+                cmap,
+                text_operands,
+                font_size,
+                rtl_dir,
+                visitor_text,
+            )
+
+        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths(
+            cmap, text_operands, font_size, space_width,
+        )
+        actual_str_size["str_widths"] += font_widths
+
+        return text, rtl_dir, actual_str_size
+
+    def _get_actual_font_widths(
+        self,
+        cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
+        text_operands: str,
+        font_size: float,
+        space_width: float,
+    ) -> Tuple[float, float, float]:
+        """Get actual font widths for text operands."""
+        font_widths: float = 0
+        font_name: str = cmap[2]
+
+        # Use the page object's font width maps
+        if font_name not in self.page_obj._font_width_maps:
+            if cmap[3] is None:
+                font_width_map: Dict[Any, float] = {}
+                space_char = " "
+                actual_space_width: float = space_width
+                font_width_map["default"] = actual_space_width * 2
+            else:
+                space_char = get_actual_str_key(" ", cmap[0], cmap[1])
+                font_width_map = build_font_width_map(cmap[3], space_width * 2)
+                actual_space_width = compute_font_width(font_width_map, space_char)
+            if actual_space_width == 0:
+                actual_space_width = space_width
+            self.page_obj._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
+
+        font_width_map = self.page_obj._font_width_maps[font_name][0]
+        space_char = self.page_obj._font_width_maps[font_name][1]
+        actual_space_width = self.page_obj._font_width_maps[font_name][2]
+
+        if text_operands:
+            for char in text_operands:
+                if char == space_char:
+                    font_widths += actual_space_width
+                    continue
+                font_widths += compute_font_width(font_width_map, char)
+
+        return (font_widths * font_size, space_width * font_size, font_size)

From f4e9285a521ce32457bfca69e30df70496930c16 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Jun 2025 21:39:08 +0200
Subject: [PATCH 2/5] Remove code duplication

---
 pypdf/_page.py | 39 ---------------------------------------
 1 file changed, 39 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 9a429a035..e079dd3c3 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -51,9 +51,6 @@
 
 from ._cmap import (
     build_char_map,
-    build_font_width_map,
-    compute_font_width,
-    get_actual_str_key,
 )
 from ._protocols import PdfCommonDocProtocol
 from ._text_extraction import (
@@ -1656,42 +1653,6 @@ def _debug_for_extract(self) -> str:  # pragma: no cover
             out += "No Font\n"
         return out
 
-    def _get_actual_font_widths(
-        self,
-        cmap: Tuple[
-            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
-        ],
-        text_operands: str,
-        font_size: float,
-        space_width: float
-    ) -> Tuple[float, float, float]:
-        font_widths: float = 0
-        font_name: str = cmap[2]
-        if font_name not in self._font_width_maps:
-            if cmap[3] is None:
-                font_width_map: Dict[Any, float] = {}
-                space_char = " "
-                actual_space_width: float = space_width
-                font_width_map["default"] = actual_space_width * 2
-            else:
-                space_char = get_actual_str_key(" ", cmap[0], cmap[1])
-                font_width_map = build_font_width_map(cmap[3], space_width * 2)
-                actual_space_width = compute_font_width(font_width_map, space_char)
-            if actual_space_width == 0:
-                actual_space_width = space_width
-            self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
-        font_width_map = self._font_width_maps[font_name][0]
-        space_char = self._font_width_maps[font_name][1]
-        actual_space_width = self._font_width_maps[font_name][2]
-
-        if text_operands:
-            for char in text_operands:
-                if char == space_char:
-                    font_widths += actual_space_width
-                    continue
-                font_widths += compute_font_width(font_width_map, char)
-        return (font_widths * font_size, space_width * font_size, font_size)
-
     def _extract_text(
         self,
         obj: Any,

From 1ed2e380da8b8f90a05bc159bca1fc4f61da0ad3 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 29 Jun 2025 21:43:45 +0200
Subject: [PATCH 3/5] Move _text_extractor into _text_extraction

---
 pypdf/_page.py                                |  2 +-
 .../{ => _text_extraction}/_text_extractor.py | 27 ++++++++++---------
 2 files changed, 16 insertions(+), 13 deletions(-)
 rename pypdf/{ => _text_extraction}/_text_extractor.py (98%)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index e079dd3c3..89a159f1c 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -56,7 +56,7 @@
 from ._text_extraction import (
     _layout_mode,
 )
-from ._text_extractor import TextExtraction
+from ._text_extraction._text_extractor import TextExtraction
 from ._utils import (
     CompressedTransformationMatrix,
     TransformationMatrixType,
diff --git a/pypdf/_text_extractor.py b/pypdf/_text_extraction/_text_extractor.py
similarity index 98%
rename from pypdf/_text_extractor.py
rename to pypdf/_text_extraction/_text_extractor.py
index 2f7ccfa56..cac68e1b5 100644
--- a/pypdf/_text_extractor.py
+++ b/pypdf/_text_extraction/_text_extractor.py
@@ -30,23 +30,16 @@
 import math
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
-from ._cmap import (
+from .._cmap import (
     build_char_map,
     build_font_width_map,
     compute_font_width,
     get_actual_str_key,
     unknown_char_map,
 )
-from ._text_extraction import (
-    OrientationNotFoundError,
-    crlf_space_check,
-    get_display_str,
-    get_text_operands,
-    mult,
-)
-from ._utils import logger_warning
-from .constants import PageAttributes as PG
-from .generic import (
+from .._utils import logger_warning
+from ..constants import PageAttributes as PG
+from ..generic import (
     ContentStream,
     DictionaryObject,
     FloatObject,
@@ -54,6 +47,13 @@
     NumberObject,
     TextStringObject,
 )
+from . import (
+    OrientationNotFoundError,
+    crlf_space_check,
+    get_display_str,
+    get_text_operands,
+    mult,
+)
 
 
 class TextExtraction:
@@ -511,7 +511,10 @@ def _handle_tj(
             )
 
         font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths(
-            cmap, text_operands, font_size, space_width,
+            cmap,
+            text_operands,
+            font_size,
+            space_width,
         )
         actual_str_size["str_widths"] += font_widths
 

From 2c643d8a27501ddfbba253aef73630b615c8cd25 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Fri, 4 Jul 2025 22:01:52 +0200
Subject: [PATCH 4/5] Reduce diff

---
 pypdf/_text_extraction/_text_extractor.py | 81 ++++++++++++-----------
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/pypdf/_text_extraction/_text_extractor.py b/pypdf/_text_extraction/_text_extractor.py
index eb65fca60..8f6b66a1e 100644
--- a/pypdf/_text_extraction/_text_extractor.py
+++ b/pypdf/_text_extraction/_text_extractor.py
@@ -480,46 +480,6 @@ def _handle_position_change(self, str_widths: float) -> None:
         except OrientationNotFoundError:
             return
 
-    def _handle_tj(
-        self,
-        text: str,
-        operands: List[Union[str, TextStringObject]],
-        cm_matrix: List[float],
-        tm_matrix: List[float],
-        cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
-        orientations: Tuple[int, ...],
-        font_size: float,
-        rtl_dir: bool,
-        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
-        space_width: float,
-        actual_str_size: Dict[str, float],
-    ) -> Tuple[str, bool, Dict[str, float]]:
-        """Handle text showing operations."""
-        text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations)
-        if is_str_operands:
-            text += text_operands
-        else:
-            text, rtl_dir = get_display_str(
-                text,
-                cm_matrix,
-                tm_matrix,  # text matrix
-                cmap,
-                text_operands,
-                font_size,
-                rtl_dir,
-                visitor_text,
-            )
-
-        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths(
-            cmap,
-            text_operands,
-            font_size,
-            space_width,
-        )
-        actual_str_size["str_widths"] += font_widths
-
-        return text, rtl_dir, actual_str_size
-
     def _get_actual_font_widths(
         self,
         cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
@@ -559,3 +519,44 @@ def _get_actual_font_widths(
 
         return (font_widths * font_size, space_width * font_size, font_size)
 
+
+
+    def _handle_tj(
+        self,
+        text: str,
+        operands: List[Union[str, TextStringObject]],
+        cm_matrix: List[float],
+        tm_matrix: List[float],
+        cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
+        orientations: Tuple[int, ...],
+        font_size: float,
+        rtl_dir: bool,
+        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+        space_width: float,
+        actual_str_size: Dict[str, float],
+    ) -> Tuple[str, bool, Dict[str, float]]:
+        """Handle text showing operations."""
+        text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations)
+        if is_str_operands:
+            text += text_operands
+        else:
+            text, rtl_dir = get_display_str(
+                text,
+                cm_matrix,
+                tm_matrix,  # text matrix
+                cmap,
+                text_operands,
+                font_size,
+                rtl_dir,
+                visitor_text,
+            )
+
+        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths(
+            cmap,
+            text_operands,
+            font_size,
+            space_width,
+        )
+        actual_str_size["str_widths"] += font_widths
+
+        return text, rtl_dir, actual_str_size

From c93aadd31c39b78159954bb983be38c405331d99 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Fri, 4 Jul 2025 22:08:15 +0200
Subject: [PATCH 5/5] Reduce diff

---
 pypdf/_text_extraction/_text_extractor.py | 90 +++++++++++------------
 1 file changed, 43 insertions(+), 47 deletions(-)

diff --git a/pypdf/_text_extraction/_text_extractor.py b/pypdf/_text_extraction/_text_extractor.py
index 8f6b66a1e..891ae058f 100644
--- a/pypdf/_text_extraction/_text_extractor.py
+++ b/pypdf/_text_extraction/_text_extractor.py
@@ -81,55 +81,60 @@ def __init__(
         self.page_obj = page_obj  # Reference to the PageObject for font width maps
         self.obj = obj
         self.pdf = pdf
-        self.orientations = orientations
+
         self.space_width = space_width
         self.content_key = content_key
         self.visitor_operand_before = visitor_operand_before
         self.visitor_operand_after = visitor_operand_after
-        self.visitor_text = visitor_text
-
-        # Text state
-        self.text: str = ""
-        self.output: str = ""
-        self.rtl_dir: bool = False  # right-to-left
 
         # Matrix state
         self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-        self.cm_stack: List[Tuple[Any, ...]] = []
-
-        # Previous matrices for tracking changes
+        self.cm_stack: List[
+            Tuple[
+                List[float],
+                Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
+                float,
+                float,
+                float,
+                float,
+                float,
+            ]
+        ] = []
+
+        # Store the last modified matrices; can be an intermediate position
         self.cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         self.tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
 
-        # Memo matrices for visitor callbacks
+        # Store the position at the beginning of building the text
         self.memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         self.memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
 
         # Font and text scaling state
-        self.char_scale: float = 1.0
-        self.space_scale: float = 1.0
+        self.char_scale = 1.0
+        self.space_scale = 1.0
         self._space_width: float = 500.0  # will be set correctly at first Tf
-        self.TL: float = 0.0
-        self.font_size: float = 12.0  # init just in case
+        self.TL = 0.0
+        self.font_size = 12.0  # init just in case
+
+        # Text state
+        self.text: str = ""
+        self.output: str = ""
+        self.rtl_dir: bool = False  # right-to-left
 
-        # Character map state
         self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = (
             "charmap",
             {},
             "NotInitialized",
             None,
         )  # (encoding, CMAP, font resource name, font)
+        self.orientations: Tuple[int, ...] = orientations
+        self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = visitor_text  # caller's callback
+        self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {}
 
         # Actual string size tracking
         self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}
 
-        # Character maps for fonts
-        self.cmaps: Dict[
-            str,
-            Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject],
-        ] = {}
-
         # Resources dictionary
         self.resources_dict: Optional[DictionaryObject] = None
 
@@ -231,8 +236,7 @@ def _process_operation(self, operator: bytes, operands: List[Any]) -> None:
         if self.visitor_operand_after is not None:
             self.visitor_operand_after(operator, operands, self.cm_matrix, self.tm_matrix)
 
-    def _compute_str_widths(self, str_widths: float) -> float:
-        """Compute string widths."""
+    def compute_str_widths(self, str_widths: float) -> float:
         return str_widths / 1000
 
     def _flush_text(self) -> None:
@@ -355,14 +359,14 @@ def _handle_operation_move_text_position(self, operands: List[Any]) -> None:
         tx, ty = float(operands[0]), float(operands[1])
         self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
         self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
-        str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
+        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
         self._actual_str_size["str_widths"] = 0.0
         self._handle_position_change(str_widths)
 
     def _handle_operation_set_text_matrix(self, operands: List[Any]) -> None:
         """Handle Tm (Set text matrix) operation."""
         self.tm_matrix = [float(operand) for operand in operands[:6]]
-        str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
+        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
         self._actual_str_size["str_widths"] = 0.0
         self._handle_position_change(str_widths)
 
@@ -370,7 +374,7 @@ def _handle_operation_move_to_next_line(self, operands: List[Any]) -> None:
         """Handle T* (Move to next line) operation."""
         self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
         self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
-        str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
+        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
         self._actual_str_size["str_widths"] = 0.0
         self._handle_position_change(str_widths)
 
@@ -389,7 +393,7 @@ def _handle_operation_show_text(self, operands: List[Any]) -> None:
             self._space_width,
             self._actual_str_size,
         )
-        str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
+        str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
         self._handle_position_change(str_widths)
 
     def _handle_operation_show_text_with_positioning(self, operands: List[Any]) -> None:
@@ -471,7 +475,7 @@ def _handle_position_change(self, str_widths: float) -> None:
                 self.font_size,
                 self.visitor_text,
                 str_widths,
-                self._compute_str_widths(self._actual_str_size["space_width"]),
+                self.compute_str_widths(self._actual_str_size["space_width"]),
                 self._actual_str_size["str_height"],
             )
             if self.text == "":
@@ -482,16 +486,15 @@ def _handle_position_change(self, str_widths: float) -> None:
 
     def _get_actual_font_widths(
         self,
-        cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
+        cmap: Tuple[
+            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+        ],
         text_operands: str,
         font_size: float,
         space_width: float,
     ) -> Tuple[float, float, float]:
-        """Get actual font widths for text operands."""
         font_widths: float = 0
         font_name: str = cmap[2]
-
-        # Use the page object's font width maps
         if font_name not in self.page_obj._font_width_maps:
             if cmap[3] is None:
                 font_width_map: Dict[Any, float] = {}
@@ -505,7 +508,6 @@ def _get_actual_font_widths(
             if actual_space_width == 0:
                 actual_space_width = space_width
             self.page_obj._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
-
         font_width_map = self.page_obj._font_width_maps[font_name][0]
         space_char = self.page_obj._font_width_maps[font_name][1]
         actual_space_width = self.page_obj._font_width_maps[font_name][2]
@@ -516,18 +518,17 @@ def _get_actual_font_widths(
                     font_widths += actual_space_width
                     continue
                 font_widths += compute_font_width(font_width_map, char)
-
         return (font_widths * font_size, space_width * font_size, font_size)
 
-
-
     def _handle_tj(
         self,
         text: str,
         operands: List[Union[str, TextStringObject]],
         cm_matrix: List[float],
         tm_matrix: List[float],
-        cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
+        cmap: Tuple[
+            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+        ],
         orientations: Tuple[int, ...],
         font_size: float,
         rtl_dir: bool,
@@ -535,8 +536,8 @@ def _handle_tj(
         space_width: float,
         actual_str_size: Dict[str, float],
     ) -> Tuple[str, bool, Dict[str, float]]:
-        """Handle text showing operations."""
-        text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations)
+        text_operands, is_str_operands = get_text_operands(
+            operands, cm_matrix, tm_matrix, cmap, orientations)
         if is_str_operands:
             text += text_operands
         else:
@@ -550,13 +551,8 @@ def _handle_tj(
                 rtl_dir,
                 visitor_text,
             )
-
-        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths(
-            cmap,
-            text_operands,
-            font_size,
-            space_width,
-        )
+        font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
+            self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
         actual_str_size["str_widths"] += font_widths
 
         return text, rtl_dir, actual_str_size