From 15c5e99b77848050adfb8c365c018e6706536dab Mon Sep 17 00:00:00 2001 From: PJBrs Date: Sun, 14 Sep 2025 18:59:33 +0200 Subject: [PATCH 01/17] MAINT: _writer: let _update_field_annotation return appearance stream This patch lets the _update_field_annotation method return an appearance stream instead of None, so that this method can be separated out of _writer.py later on. --- pypdf/_writer.py | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index f4e199986..eef94c1de 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -874,7 +874,6 @@ def _add_apstream_object( object_name: str, x_offset: float, y_offset: float, - font_res: Optional[DictionaryObject] = None ) -> None: """ Adds an appearance stream to the page content in the form of @@ -886,17 +885,25 @@ def _add_apstream_object( object_name: The name of the appearance stream. x_offset: The horizontal offset for the appearance stream. y_offset: The vertical offset for the appearance stream. - font_res: The appearance stream's font resource (if given). """ - # Prepare XObject resource dictionary on the page + # Prepare XObject resource dictionary on the page. This currently + # only deals with font resources, but can easily be adapted to also + # include other resources. pg_res = cast(DictionaryObject, page[PG.RESOURCES]) - if font_res is not None: - font_name = font_res["/BaseFont"] # [/"Name"] often also exists, but is deprecated + if "/Resources" in appearance_stream_obj: + ap_stream_res = cast(DictionaryObject, appearance_stream_obj["/Resources"]) + # No need to check "if "/Font" in ap_stream_res", because the only reason this + # code runs would be if we are flattening form fields, and the associated code + # either adds a Font resource or no resource at all. This probably needs to + # change if we want to use this method to flatten markup annotations. 
+ ap_stream_font_dict = cast(DictionaryObject, ap_stream_res["/Font"]) if "/Font" not in pg_res: pg_res[NameObject("/Font")] = DictionaryObject() - pg_ft_res = cast(DictionaryObject, pg_res[NameObject("/Font")]) - if font_name not in pg_ft_res: - pg_ft_res[NameObject(font_name)] = font_res + pg_font_res = cast(DictionaryObject, pg_res["/Font"]) + # Merge fonts from the appearance stream into the page's font resources + for font_name, font_ref in ap_stream_font_dict.items(): + if font_name not in pg_font_res: + pg_font_res[font_name] = font_ref # Always add the resolved stream object to the writer to get a new IndirectObject. # This ensures we have a valid IndirectObject managed by *this* writer. xobject_ref = self._add_object(appearance_stream_obj) @@ -922,8 +929,7 @@ def _update_field_annotation( annotation: DictionaryObject, font_name: str = "", font_size: float = -1, - flatten: bool = False, - ) -> None: + ) -> StreamObject: # Calculate rectangle dimensions _rct = cast(RectangleObject, annotation[AA.Rect]) rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1]))) @@ -1065,9 +1071,7 @@ def _update_field_annotation( self._objects[n - 1] = dct dct.indirect_reference = IndirectObject(n, 0, self) - if flatten: - field_name = self._get_qualified_field_name(annotation) - self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res) + return dct FFBITS_NUL = FA.FfBits(0) @@ -1139,6 +1143,7 @@ def update_page_form_field_values( ).get_object() for field, value in fields.items(): + rct = cast(RectangleObject, annotation[AA.Rect]) if not ( self._get_qualified_field_name(parent_annotation) == field or parent_annotation.get("/T", None) == field @@ -1175,7 +1180,6 @@ def update_page_form_field_values( if flatten and appearance_stream_obj is not None: # We basically copy the entire appearance stream, which should be an XObject that # is already registered. No need to add font resources. 
- rct = cast(RectangleObject, annotation[AA.Rect]) self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1]) elif ( parent_annotation.get(FA.FT) == "/Tx" @@ -1183,11 +1187,14 @@ def update_page_form_field_values( ): # textbox if isinstance(value, tuple): - self._update_field_annotation( - page, parent_annotation, annotation, value[1], value[2], flatten=flatten + dct = self._update_field_annotation( + page, parent_annotation, annotation, value[1], value[2] ) else: - self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten) + dct = self._update_field_annotation(page, parent_annotation, annotation) + if flatten: + field_name = self._get_qualified_field_name(annotation) + self._add_apstream_object(page, dct, field_name, rct[0], rct[1]) elif ( annotation.get(FA.FT) == "/Sig" ): # deprecated # not implemented yet From f44a155c3fba0a625ca0471f5d2c4b803cdaa6b0 Mon Sep 17 00:00:00 2001 From: PJBrs Date: Sun, 14 Sep 2025 20:24:39 +0200 Subject: [PATCH 02/17] MAINT: _writer: refactor update_page_form_field_values Add a couple of comments to the update_page_form_field_values method, and move the flatten call so that it runs after the appearance stream has been obtained. Underlying logic: First set the field value, then get its appearance stream, and, if it has one, flatten it if appropriate. --- pypdf/_writer.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index eef94c1de..f61e9e9fd 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1156,6 +1156,7 @@ def update_page_form_field_values( del parent_annotation["/I"] if flags: annotation[NameObject(FA.Ff)] = NumberObject(flags) + # Set the field value if not (value is None and flatten): # Only change values if given by user and not flattening. 
if isinstance(value, list): lst = ArrayObject(TextStringObject(v) for v in value) @@ -1166,6 +1167,7 @@ def update_page_form_field_values( ) else: parent_annotation[NameObject(FA.V)] = TextStringObject(value) + # Get or create the field's appearance stream if parent_annotation.get(FA.FT) == "/Btn": # Checkbox button (no /FT found in Radio widgets) v = NameObject(value) @@ -1177,28 +1179,25 @@ def update_page_form_field_values( # other cases will be updated through the for loop annotation[NameObject(AA.AS)] = v annotation[NameObject(FA.V)] = v - if flatten and appearance_stream_obj is not None: - # We basically copy the entire appearance stream, which should be an XObject that - # is already registered. No need to add font resources. - self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1]) elif ( parent_annotation.get(FA.FT) == "/Tx" or parent_annotation.get(FA.FT) == "/Ch" ): # textbox if isinstance(value, tuple): - dct = self._update_field_annotation( + appearance_stream_obj = self._update_field_annotation( page, parent_annotation, annotation, value[1], value[2] ) else: - dct = self._update_field_annotation(page, parent_annotation, annotation) - if flatten: - field_name = self._get_qualified_field_name(annotation) - self._add_apstream_object(page, dct, field_name, rct[0], rct[1]) + appearance_stream_obj = self._update_field_annotation( + page, parent_annotation, annotation + ) elif ( annotation.get(FA.FT) == "/Sig" ): # deprecated # not implemented yet logger_warning("Signature forms not implemented yet", __name__) + if flatten and appearance_stream_obj is not None: + self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1]) def reattach_fields( self, page: Optional[PageObject] = None From baabffb89b0ae0ac51ece139185cada66a10cc81 Mon Sep 17 00:00:00 2001 From: PJBrs Date: Mon, 15 Sep 2025 13:39:50 +0200 Subject: [PATCH 03/17] MAINT: Refactor code from _writer to _appearance_stream This patch introduces a new module - 
_appearance_stream - and copies two methods from _writer to this new module. Currently, these methods are needed to develop an appearance stream for a text annotation. They are: update_field_annotation (renamed from _update_field_annotation) generate_appearance_stream The update_field_annotation was a PdfWriter method, which means that the current code needs some refactoring, since it now has a circular import of PdfWriter. Other than changing self to writer in update_field_annotation, changing the code in PdfWriter to call update_field_annotation from _appearance_stream, and avoiding a little bit of casting in one place, this patch changes nothing. In a future change, we might want to make a class TextAppearanceStream based on generate_appearance_stream, with .from_annotation(Annotation) as a class method (based on update_field_annotation). scale_text would also be a method in this class. --- pypdf/_writer.py | 221 ++++------------------------ pypdf/generic/_appearance_stream.py | 186 +++++++++++++++++++++++ 2 files changed, 211 insertions(+), 196 deletions(-) create mode 100644 pypdf/generic/_appearance_stream.py diff --git a/pypdf/_writer.py b/pypdf/_writer.py index f61e9e9fd..ba0759ff1 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -48,7 +48,6 @@ cast, ) -from ._cmap import _default_fonts_space_width, build_char_map_from_dict from ._doc_common import DocumentInformation, PdfDocCommon from ._encryption import EncryptAlgorithm, Encryption from ._page import PageObject, Transformation @@ -85,7 +84,6 @@ BooleanObject, ByteStringObject, ContentStream, - DecodedStreamObject, Destination, DictionaryObject, EmbeddedFile, @@ -107,6 +105,7 @@ hex_to_rgb, is_null_or_none, ) +from .generic._appearance_stream import update_field_annotation from .pagerange import PageRange, PageRangeSpec from .types import ( AnnotationSubtype, @@ -119,7 +118,6 @@ from .xmp import XmpInformation ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all() 
-DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12 class ObjectDeletionFlag(enum.IntFlag): @@ -922,157 +920,6 @@ def _add_apstream_object( xobject_drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode() self._merge_content_stream_to_page(page, xobject_drawing_commands) - def _update_field_annotation( - self, - page: PageObject, - field: DictionaryObject, - annotation: DictionaryObject, - font_name: str = "", - font_size: float = -1, - ) -> StreamObject: - # Calculate rectangle dimensions - _rct = cast(RectangleObject, annotation[AA.Rect]) - rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1]))) - - # Extract font information - da = annotation.get_inherited( - AA.DA, - cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get( - AA.DA, None - ), - ) - if da is None: - da = TextStringObject("/Helv 0 Tf 0 g") - else: - da = da.get_object() - font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") - font_properties = [x for x in font_properties if x != ""] - if font_name: - font_properties[font_properties.index("Tf") - 2] = font_name - else: - font_name = font_properties[font_properties.index("Tf") - 2] - font_height = ( - font_size - if font_size >= 0 - else float(font_properties[font_properties.index("Tf") - 1]) - ) - if font_height == 0: - if field.get(FA.Ff, 0) & FA.FfBits.Multiline: - font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE - else: - font_height = rct.height - 2 - font_properties[font_properties.index("Tf") - 1] = str(font_height) - da = " ".join(font_properties) - y_offset = rct.height - 1 - font_height - - # Retrieve font information from local DR ... 
- dr: Any = cast( - DictionaryObject, - cast( - DictionaryObject, - annotation.get_inherited( - "/DR", - cast( - DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM] - ).get("/DR", DictionaryObject()), - ), - ).get_object(), - ) - dr = dr.get("/Font", DictionaryObject()).get_object() - # _default_fonts_space_width keys is the list of Standard fonts - if font_name not in dr and font_name not in _default_fonts_space_width: - # ...or AcroForm dictionary - dr = cast( - dict[Any, Any], - cast( - DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM] - ).get("/DR", {}), - ) - dr = dr.get_object().get("/Font", DictionaryObject()).get_object() - font_res = dr.get(font_name, None) - if not is_null_or_none(font_res): - font_res = cast(DictionaryObject, font_res.get_object()) - _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( - 200, font_res - ) - try: # remove width stored in -1 key - del font_map[-1] - except KeyError: - pass - font_full_rev: dict[str, bytes] - if isinstance(font_encoding, str): - font_full_rev = { - v: k.encode(font_encoding) for k, v in font_map.items() - } - else: - font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()} - font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} - for key, value in font_map.items(): - font_full_rev[value] = font_encoding_rev.get(key, key) - else: - logger_warning(f"Font dictionary for {font_name} not found.", __name__) - font_full_rev = {} - - # Retrieve field text and selected values - field_flags = field.get(FA.Ff, 0) - if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: - txt = "\n".join(annotation.get_inherited(FA.Opt, [])) - sel = field.get("/V", []) - if not isinstance(sel, list): - sel = [sel] - else: # /Tx - txt = field.get("/V", "") - sel = [] - # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) - txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") - # Generate appearance stream - 
ap_stream = generate_appearance_stream( - txt, sel, da, font_full_rev, rct, font_height, y_offset - ) - - # Create appearance dictionary - dct = DecodedStreamObject.initialize_from_dictionary( - { - NameObject("/Type"): NameObject("/XObject"), - NameObject("/Subtype"): NameObject("/Form"), - NameObject("/BBox"): rct, - "__streamdata__": ByteStringObject(ap_stream), - "/Length": 0, - } - ) - if AA.AP in annotation: - for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items(): - if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: - dct[k] = v - - # Update Resources with font information if necessary - if font_res is not None: - dct[NameObject("/Resources")] = DictionaryObject( - { - NameObject("/Font"): DictionaryObject( - { - NameObject(font_name): getattr( - font_res, "indirect_reference", font_res - ) - } - ) - } - ) - if AA.AP not in annotation: - annotation[NameObject(AA.AP)] = DictionaryObject( - {NameObject("/N"): self._add_object(dct)} - ) - elif "/N" not in cast(DictionaryObject, annotation[AA.AP]): - cast(DictionaryObject, annotation[NameObject(AA.AP)])[ - NameObject("/N") - ] = self._add_object(dct) - else: # [/AP][/N] exists - n = annotation[AA.AP]["/N"].indirect_reference.idnum # type: ignore - self._objects[n - 1] = dct - dct.indirect_reference = IndirectObject(n, 0, self) - - return dct - FFBITS_NUL = FA.FfBits(0) def update_page_form_field_values( @@ -1167,31 +1014,46 @@ def update_page_form_field_values( ) else: parent_annotation[NameObject(FA.V)] = TextStringObject(value) - # Get or create the field's appearance stream + # Get or create the field's appearance stream object if parent_annotation.get(FA.FT) == "/Btn": - # Checkbox button (no /FT found in Radio widgets) + # Checkbox button (no /FT found in Radio widgets); + # We can find the associated appearance stream object + # within the annotation. 
v = NameObject(value) ap = cast(DictionaryObject, annotation[NameObject(AA.AP)]) normal_ap = cast(DictionaryObject, ap["/N"]) if v not in normal_ap: v = NameObject("/Off") appearance_stream_obj = normal_ap.get(v) - # other cases will be updated through the for loop + # Other cases will be updated through the for loop annotation[NameObject(AA.AS)] = v annotation[NameObject(FA.V)] = v elif ( parent_annotation.get(FA.FT) == "/Tx" or parent_annotation.get(FA.FT) == "/Ch" ): - # textbox + # Textbox; we need to generate the appearance stream object if isinstance(value, tuple): - appearance_stream_obj = self._update_field_annotation( - page, parent_annotation, annotation, value[1], value[2] + appearance_stream_obj = update_field_annotation( + af, page, parent_annotation, annotation, value[1], value[2] ) else: - appearance_stream_obj = self._update_field_annotation( - page, parent_annotation, annotation - ) + appearance_stream_obj = update_field_annotation( + af, page, parent_annotation, annotation + ) + # Add the appearance stream object + if AA.AP not in annotation: + annotation[NameObject(AA.AP)] = DictionaryObject( + {NameObject("/N"): self._add_object(appearance_stream_obj)} + ) + elif "/N" not in (ap:= cast(DictionaryObject, annotation[AA.AP])): + cast(DictionaryObject, annotation[NameObject(AA.AP)])[ + NameObject("/N") + ] = self._add_object(appearance_stream_obj) + else: # [/AP][/N] exists + n = annotation[AA.AP]["/N"].indirect_reference.idnum # type: ignore + self._objects[n - 1] = appearance_stream_obj + appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self) elif ( annotation.get(FA.FT) == "/Sig" ): # deprecated # not implemented yet @@ -3441,36 +3303,3 @@ def _create_outline_item( format_flag += OutlineFontFlag.bold outline_item.update({NameObject("/F"): NumberObject(format_flag)}) return outline_item - - -def generate_appearance_stream( - txt: str, - sel: list[str], - da: str, - font_full_rev: dict[str, bytes], - rct: RectangleObject, - 
font_height: float, - y_offset: float, -) -> bytes: - ap_stream = f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode() - for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")): - if line in sel: - # may be improved but cannot find how to get fill working => replaced with lined box - ap_stream += ( - f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n" - f"0.5 0.5 0.5 rg s\n{da}\n" - ).encode() - if line_number == 0: - ap_stream += f"2 {y_offset} Td\n".encode() - else: - # Td is a relative translation - ap_stream += f"0 {- font_height * 1.4} Td\n".encode() - enc_line: list[bytes] = [ - font_full_rev.get(c, c.encode("utf-16-be")) for c in line - ] - if any(len(c) >= 2 for c in enc_line): - ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n" - else: - ap_stream += b"(" + b"".join(enc_line) + b") Tj\n" - ap_stream += b"ET\nQ\nEMC\nQ\n" - return ap_stream diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py new file mode 100644 index 000000000..b72ace9be --- /dev/null +++ b/pypdf/generic/_appearance_stream.py @@ -0,0 +1,186 @@ +from typing import Any, cast + +from .._cmap import _default_fonts_space_width, build_char_map_from_dict +from .._page import PageObject +from .._utils import logger_warning +from ..constants import AnnotationDictionaryAttributes as AA +from ..constants import FieldDictionaryAttributes as FA +from ..generic import ( + DecodedStreamObject, + DictionaryObject, + NameObject, + RectangleObject, + StreamObject, +) +from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none + +DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12 + + +def generate_appearance_stream( + txt: str, + sel: list[str], + da: str, + font_full_rev: dict[str, bytes], + rct: RectangleObject, + font_height: float, + y_offset: float, +) -> bytes: + ap_stream = f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} 
re\nW\nBT\n{da}\n".encode() + for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")): + if line in sel: + # may be improved but cannot find how to get fill working => replaced with lined box + ap_stream += ( + f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n" + f"0.5 0.5 0.5 rg s\n{da}\n" + ).encode() + if line_number == 0: + ap_stream += f"2 {y_offset} Td\n".encode() + else: + # Td is a relative translation + ap_stream += f"0 {- font_height * 1.4} Td\n".encode() + enc_line: list[bytes] = [ + font_full_rev.get(c, c.encode("utf-16-be")) for c in line + ] + if any(len(c) >= 2 for c in enc_line): + ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n" + else: + ap_stream += b"(" + b"".join(enc_line) + b") Tj\n" + ap_stream += b"ET\nQ\nEMC\nQ\n" + return ap_stream + + +def update_field_annotation( + af: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) + page: PageObject, + field: DictionaryObject, + annotation: DictionaryObject, + font_name: str = "", + font_size: float = -1, +) -> StreamObject: + # Calculate rectangle dimensions + _rct = cast(RectangleObject, annotation[AA.Rect]) + rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1]))) + + # Extract font information + da = annotation.get_inherited( + AA.DA, + af.get( + AA.DA, None + ), + ) + if da is None: + da = TextStringObject("/Helv 0 Tf 0 g") + else: + da = da.get_object() + font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") + font_properties = [x for x in font_properties if x != ""] + if font_name: + font_properties[font_properties.index("Tf") - 2] = font_name + else: + font_name = font_properties[font_properties.index("Tf") - 2] + font_height = ( + font_size + if font_size >= 0 + else float(font_properties[font_properties.index("Tf") - 1]) + ) + if font_height == 0: + if field.get(FA.Ff, 0) & FA.FfBits.Multiline: + font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE + else: + 
font_height = rct.height - 2 + font_properties[font_properties.index("Tf") - 1] = str(font_height) + da = " ".join(font_properties) + y_offset = rct.height - 1 - font_height + + # Retrieve font information from local DR ... + dr: Any = cast( + DictionaryObject, + cast( + DictionaryObject, + annotation.get_inherited( + "/DR", + af.get("/DR", DictionaryObject()), + ), + ).get_object(), + ) + dr = dr.get("/Font", DictionaryObject()).get_object() + # _default_fonts_space_width keys is the list of Standard fonts + if font_name not in dr and font_name not in _default_fonts_space_width: + # ...or AcroForm dictionary + dr = cast( + dict[Any, Any], + af.get("/DR", {}), + ) + dr = dr.get_object().get("/Font", DictionaryObject()).get_object() + font_res = dr.get(font_name, None) + if not is_null_or_none(font_res): + font_res = cast(DictionaryObject, font_res.get_object()) + _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( + 200, font_res + ) + try: # remove width stored in -1 key + del font_map[-1] + except KeyError: + pass + font_full_rev: dict[str, bytes] + if isinstance(font_encoding, str): + font_full_rev = { + v: k.encode(font_encoding) for k, v in font_map.items() + } + else: + font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + for key, value in font_map.items(): + font_full_rev[value] = font_encoding_rev.get(key, key) + else: + logger_warning(f"Font dictionary for {font_name} not found.", __name__) + font_full_rev = {} + + # Retrieve field text and selected values + field_flags = field.get(FA.Ff, 0) + if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: + txt = "\n".join(annotation.get_inherited(FA.Opt, [])) + sel = field.get("/V", []) + if not isinstance(sel, list): + sel = [sel] + else: # /Tx + txt = field.get("/V", "") + sel = [] + # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) + txt = txt.replace("\\", 
"\\\\").replace("(", r"\(").replace(")", r"\)") + # Generate appearance stream + ap_stream = generate_appearance_stream( + txt, sel, da, font_full_rev, rct, font_height, y_offset + ) + + # Create appearance dictionary + dct = DecodedStreamObject.initialize_from_dictionary( + { + NameObject("/Type"): NameObject("/XObject"), + NameObject("/Subtype"): NameObject("/Form"), + NameObject("/BBox"): rct, + "__streamdata__": ByteStringObject(ap_stream), + "/Length": 0, + } + ) + if AA.AP in annotation: + for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items(): + if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: + dct[k] = v + + # Update Resources with font information if necessary + if font_res is not None: + dct[NameObject("/Resources")] = DictionaryObject( + { + NameObject("/Font"): DictionaryObject( + { + NameObject(font_name): getattr( + font_res, "indirect_reference", font_res + ) + } + ) + } + ) + + return dct From 58166361bfea52c35d971588cda9fa3dbe4da5ca Mon Sep 17 00:00:00 2001 From: PJBrs Date: Mon, 15 Sep 2025 21:47:11 +0200 Subject: [PATCH 04/17] MAINT: Turn the appearance stream code into a class This patch introduces the TextStreamAppearance class, with .from_text_annotation as a class method to instantiate it from a text annotation. It includes the code from generate_appearance_stream and _update_field_annotation. 
--- pypdf/_writer.py | 10 +- pypdf/generic/_appearance_stream.py | 322 ++++++++++++++-------------- 2 files changed, 169 insertions(+), 163 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index ba0759ff1..3a27dfddf 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -105,7 +105,7 @@ hex_to_rgb, is_null_or_none, ) -from .generic._appearance_stream import update_field_annotation +from .generic._appearance_stream import TextStreamAppearance from .pagerange import PageRange, PageRangeSpec from .types import ( AnnotationSubtype, @@ -1034,12 +1034,12 @@ def update_page_form_field_values( ): # Textbox; we need to generate the appearance stream object if isinstance(value, tuple): - appearance_stream_obj = update_field_annotation( - af, page, parent_annotation, annotation, value[1], value[2] + appearance_stream_obj = TextStreamAppearance.from_text_annotation( + af, parent_annotation, annotation, value[1], value[2] ) else: - appearance_stream_obj = update_field_annotation( - af, page, parent_annotation, annotation + appearance_stream_obj = TextStreamAppearance.from_text_annotation( + af, parent_annotation, annotation ) # Add the appearance stream object if AA.AP not in annotation: diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index b72ace9be..413753473 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -1,7 +1,6 @@ -from typing import Any, cast +from typing import Any, Optional, Union, cast from .._cmap import _default_fonts_space_width, build_char_map_from_dict -from .._page import PageObject from .._utils import logger_warning from ..constants import AnnotationDictionaryAttributes as AA from ..constants import FieldDictionaryAttributes as FA @@ -9,178 +8,185 @@ DecodedStreamObject, DictionaryObject, NameObject, + NumberObject, RectangleObject, - StreamObject, ) from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none DEFAULT_FONT_HEIGHT_IN_MULTILINE 
= 12 -def generate_appearance_stream( - txt: str, - sel: list[str], - da: str, - font_full_rev: dict[str, bytes], - rct: RectangleObject, - font_height: float, - y_offset: float, -) -> bytes: - ap_stream = f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode() - for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")): - if line in sel: - # may be improved but cannot find how to get fill working => replaced with lined box - ap_stream += ( - f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n" - f"0.5 0.5 0.5 rg s\n{da}\n" - ).encode() - if line_number == 0: - ap_stream += f"2 {y_offset} Td\n".encode() - else: - # Td is a relative translation - ap_stream += f"0 {- font_height * 1.4} Td\n".encode() - enc_line: list[bytes] = [ - font_full_rev.get(c, c.encode("utf-16-be")) for c in line - ] - if any(len(c) >= 2 for c in enc_line): - ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n" - else: - ap_stream += b"(" + b"".join(enc_line) + b") Tj\n" - ap_stream += b"ET\nQ\nEMC\nQ\n" - return ap_stream +class TextStreamAppearance(DecodedStreamObject): + """ + A class representing the appearance stream for a text-based form field. + This class is similar in form to the FreeText class in pypdf. 
+ """ + + def __init__( + self, + txt: str = "", + sel: Optional[list[str]] = None, + da: str = "", + font_full_rev: Optional[dict[str, bytes]] = None, + rct: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), + font_height: float = 0, + y_offset: float = 0, + ) -> None: + super().__init__() + font_full_rev = font_full_rev or {} + if isinstance(rct, tuple): + rct = RectangleObject(rct) + ap_stream = f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode() + for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")): + if sel and line in sel: + # may be improved but cannot find how to get fill working => replaced with lined box + ap_stream += ( + f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n" + f"0.5 0.5 0.5 rg s\n{da}\n" + ).encode() + if line_number == 0: + ap_stream += f"2 {y_offset} Td\n".encode() + else: + # Td is a relative translation + ap_stream += f"0 {-font_height * 1.4} Td\n".encode() + enc_line: list[bytes] = [ + font_full_rev.get(c, c.encode("utf-16-be")) for c in line + ] + if any(len(c) >= 2 for c in enc_line): + ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n" + else: + ap_stream += b"(" + b"".join(enc_line) + b") Tj\n" + ap_stream += b"ET\nQ\nEMC\nQ\n" + self[NameObject("/Type")] = NameObject("/XObject") + self[NameObject("/Subtype")] = NameObject("/Form") + self[NameObject("/BBox")] = rct + self.set_data(ByteStringObject(ap_stream)) + self[NameObject("/Length")] = NumberObject(len(ap_stream)) -def update_field_annotation( - af: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) - page: PageObject, - field: DictionaryObject, - annotation: DictionaryObject, - font_name: str = "", - font_size: float = -1, -) -> StreamObject: - # Calculate rectangle dimensions - _rct = cast(RectangleObject, annotation[AA.Rect]) - rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1]))) + @classmethod + 
def from_text_annotation( + cls, + af: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) + field: DictionaryObject, + annotation: DictionaryObject, + font_name: str = "", + font_size: float = -1, + ) -> "TextStreamAppearance": + """Creates a TextStreamAppearance object from a given text field annotation.""" - # Extract font information - da = annotation.get_inherited( - AA.DA, - af.get( - AA.DA, None - ), - ) - if da is None: - da = TextStringObject("/Helv 0 Tf 0 g") - else: - da = da.get_object() - font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") - font_properties = [x for x in font_properties if x != ""] - if font_name: - font_properties[font_properties.index("Tf") - 2] = font_name - else: - font_name = font_properties[font_properties.index("Tf") - 2] - font_height = ( - font_size - if font_size >= 0 - else float(font_properties[font_properties.index("Tf") - 1]) - ) - if font_height == 0: - if field.get(FA.Ff, 0) & FA.FfBits.Multiline: - font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE + # Calculate rectangle dimensions + _rct = cast(RectangleObject, annotation[AA.Rect]) + rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1]))) + + # Extract font information + da = annotation.get_inherited( + AA.DA, + af.get(AA.DA, None), + ) + if da is None: + da = TextStringObject("/Helv 0 Tf 0 g") + else: + da = da.get_object() + font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") + font_properties = [x for x in font_properties if x != ""] + if font_name: + font_properties[font_properties.index("Tf") - 2] = font_name else: - font_height = rct.height - 2 - font_properties[font_properties.index("Tf") - 1] = str(font_height) - da = " ".join(font_properties) - y_offset = rct.height - 1 - font_height + font_name = font_properties[font_properties.index("Tf") - 2] + font_height = ( + font_size + if font_size >= 0 + else float(font_properties[font_properties.index("Tf") - 1]) + ) + if font_height == 0: + if 
field.get(FA.Ff, 0) & FA.FfBits.Multiline: + font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE + else: + font_height = rct.height - 2 + font_properties[font_properties.index("Tf") - 1] = str(font_height) + da = " ".join(font_properties) + y_offset = rct.height - 1 - font_height - # Retrieve font information from local DR ... - dr: Any = cast( - DictionaryObject, - cast( + # Retrieve font information from local DR ... + dr: Any = cast( DictionaryObject, - annotation.get_inherited( - "/DR", - af.get("/DR", DictionaryObject()), - ), - ).get_object(), - ) - dr = dr.get("/Font", DictionaryObject()).get_object() - # _default_fonts_space_width keys is the list of Standard fonts - if font_name not in dr and font_name not in _default_fonts_space_width: - # ...or AcroForm dictionary - dr = cast( - dict[Any, Any], - af.get("/DR", {}), - ) - dr = dr.get_object().get("/Font", DictionaryObject()).get_object() - font_res = dr.get(font_name, None) - if not is_null_or_none(font_res): - font_res = cast(DictionaryObject, font_res.get_object()) - _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( - 200, font_res + cast( + DictionaryObject, + annotation.get_inherited( + "/DR", + af.get("/DR", DictionaryObject()), + ), + ).get_object(), ) - try: # remove width stored in -1 key - del font_map[-1] - except KeyError: - pass - font_full_rev: dict[str, bytes] - if isinstance(font_encoding, str): - font_full_rev = { - v: k.encode(font_encoding) for k, v in font_map.items() - } + dr = dr.get("/Font", DictionaryObject()).get_object() + # _default_fonts_space_width keys is the list of Standard fonts + if font_name not in dr and font_name not in _default_fonts_space_width: + # ...or AcroForm dictionary + dr = cast( + dict[Any, Any], + af.get("/DR", {}), + ) + dr = dr.get_object().get("/Font", DictionaryObject()).get_object() + font_res = dr.get(font_name, None) + if not is_null_or_none(font_res): + font_res = cast(DictionaryObject, font_res.get_object()) + _font_subtype, _, 
font_encoding, font_map = build_char_map_from_dict( + 200, font_res + ) + try: # remove width stored in -1 key + del font_map[-1] + except KeyError: + pass + font_full_rev: dict[str, bytes] + if isinstance(font_encoding, str): + font_full_rev = { + v: k.encode(font_encoding) for k, v in font_map.items() + } + else: + font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + for key, value in font_map.items(): + font_full_rev[value] = font_encoding_rev.get(key, key) else: - font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()} - font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} - for key, value in font_map.items(): - font_full_rev[value] = font_encoding_rev.get(key, key) - else: - logger_warning(f"Font dictionary for {font_name} not found.", __name__) - font_full_rev = {} - - # Retrieve field text and selected values - field_flags = field.get(FA.Ff, 0) - if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: - txt = "\n".join(annotation.get_inherited(FA.Opt, [])) - sel = field.get("/V", []) - if not isinstance(sel, list): - sel = [sel] - else: # /Tx - txt = field.get("/V", "") - sel = [] - # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) - txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") - # Generate appearance stream - ap_stream = generate_appearance_stream( - txt, sel, da, font_full_rev, rct, font_height, y_offset - ) + logger_warning(f"Font dictionary for {font_name} not found.", __name__) + font_full_rev = {} - # Create appearance dictionary - dct = DecodedStreamObject.initialize_from_dictionary( - { - NameObject("/Type"): NameObject("/XObject"), - NameObject("/Subtype"): NameObject("/Form"), - NameObject("/BBox"): rct, - "__streamdata__": ByteStringObject(ap_stream), - "/Length": 0, - } - ) - if AA.AP in annotation: - for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", 
{}).items(): - if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: - dct[k] = v + # Retrieve field text and selected values + field_flags = field.get(FA.Ff, 0) + if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: + txt = "\n".join(annotation.get_inherited(FA.Opt, [])) + sel = field.get("/V", []) + if not isinstance(sel, list): + sel = [sel] + else: # /Tx + txt = field.get("/V", "") + sel = [] + # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) + txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") - # Update Resources with font information if necessary - if font_res is not None: - dct[NameObject("/Resources")] = DictionaryObject( - { - NameObject("/Font"): DictionaryObject( - { - NameObject(font_name): getattr( - font_res, "indirect_reference", font_res - ) - } - ) - } + # Create the TextStreamAppearance instance + new_appearance_stream = cls( + txt, sel, da, font_full_rev, rct, font_height, y_offset ) - return dct + if AA.AP in annotation: + for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items(): + if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: + new_appearance_stream[k] = v + + # Update Resources with font information if necessary + if font_res is not None: + new_appearance_stream[NameObject("/Resources")] = DictionaryObject( + { + NameObject("/Font"): DictionaryObject( + { + NameObject(font_name): getattr( + font_res, "indirect_reference", font_res + ) + } + ) + } + ) + + return new_appearance_stream From 962c9765663d34dba75e13e06e73787d9823600f Mon Sep 17 00:00:00 2001 From: PJBrs Date: Wed, 17 Sep 2025 17:45:18 +0200 Subject: [PATCH 05/17] MAINT: _appearance_stream: Make variables more clear and readable Code in _appearance_stream used various rather cryptic variable names that, for some coders, made it hard to understand what the code was doing. 
This patch tries to clarify those variable names to make it easier to understand what's going on, and make it easier later on to add functionality. Overview of the changes: txt --> text sel --> selection da --> default_appearance font_full_rev --> font_glyph_byte_map rct --> rectangle k, v --> key, value enc_line --> encoded_line af --> acro_form dr --> document_resources / document_font_resources font_res --> font_resource Furthermore, I undid some abbreviated imports: - AnnotationDictionaryAttributes no longer as AA - FieldDictionaryAttributes no longer as FA --- pypdf/generic/_appearance_stream.py | 145 +++++++++++++++------------- 1 file changed, 76 insertions(+), 69 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 413753473..9991c4a77 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -2,8 +2,7 @@ from .._cmap import _default_fonts_space_width, build_char_map_from_dict from .._utils import logger_warning -from ..constants import AnnotationDictionaryAttributes as AA -from ..constants import FieldDictionaryAttributes as FA +from ..constants import AnnotationDictionaryAttributes, FieldDictionaryAttributes from ..generic import ( DecodedStreamObject, DictionaryObject, @@ -24,50 +23,53 @@ class TextStreamAppearance(DecodedStreamObject): def __init__( self, - txt: str = "", - sel: Optional[list[str]] = None, - da: str = "", - font_full_rev: Optional[dict[str, bytes]] = None, - rct: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), + text: str = "", + selection: Optional[list[str]] = None, + default_appearance: str = "", + font_glyph_byte_map: Optional[dict[str, bytes]] = None, + rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), font_height: float = 0, y_offset: float = 0, ) -> None: super().__init__() - font_full_rev = font_full_rev or {} - if isinstance(rct, tuple): - rct = 
RectangleObject(rct) - ap_stream = f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode() - for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")): - if sel and line in sel: + font_glyph_byte_map = font_glyph_byte_map or {} + if isinstance(rectangle, tuple): + rectangle = RectangleObject(rectangle) + ap_stream = ( + f"q\n/Tx BMC \nq\n1 1 {rectangle.width - 1} {rectangle.height - 1} " + f"re\nW\nBT\n{default_appearance}\n" + ).encode() + for line_number, line in enumerate(text.replace("\n", "\r").split("\r")): + if selection and line in selection: # may be improved but cannot find how to get fill working => replaced with lined box ap_stream += ( - f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n" - f"0.5 0.5 0.5 rg s\n{da}\n" + f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rectangle.width - 2} {font_height + 2} re\n" + f"0.5 0.5 0.5 rg s\n{default_appearance}\n" ).encode() if line_number == 0: ap_stream += f"2 {y_offset} Td\n".encode() else: # Td is a relative translation ap_stream += f"0 {-font_height * 1.4} Td\n".encode() - enc_line: list[bytes] = [ - font_full_rev.get(c, c.encode("utf-16-be")) for c in line + encoded_line: list[bytes] = [ + font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line ] - if any(len(c) >= 2 for c in enc_line): - ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n" + if any(len(c) >= 2 for c in encoded_line): + ap_stream += b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n" else: - ap_stream += b"(" + b"".join(enc_line) + b") Tj\n" + ap_stream += b"(" + b"".join(encoded_line) + b") Tj\n" ap_stream += b"ET\nQ\nEMC\nQ\n" self[NameObject("/Type")] = NameObject("/XObject") self[NameObject("/Subtype")] = NameObject("/Form") - self[NameObject("/BBox")] = rct + self[NameObject("/BBox")] = rectangle self.set_data(ByteStringObject(ap_stream)) self[NameObject("/Length")] = NumberObject(len(ap_stream)) @classmethod def 
from_text_annotation( cls, - af: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) + acro_form: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) field: DictionaryObject, annotation: DictionaryObject, font_name: str = "", @@ -76,19 +78,19 @@ def from_text_annotation( """Creates a TextStreamAppearance object from a given text field annotation.""" # Calculate rectangle dimensions - _rct = cast(RectangleObject, annotation[AA.Rect]) - rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1]))) + _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect]) + rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1]))) # Extract font information - da = annotation.get_inherited( - AA.DA, - af.get(AA.DA, None), + default_appearance = annotation.get_inherited( + AnnotationDictionaryAttributes.DA, + acro_form.get(AnnotationDictionaryAttributes.DA, None), ) - if da is None: - da = TextStringObject("/Helv 0 Tf 0 g") + if default_appearance is None: + default_appearance = TextStringObject("/Helv 0 Tf 0 g") else: - da = da.get_object() - font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") + default_appearance = default_appearance.get_object() + font_properties = default_appearance.replace("\n", " ").replace("\r", " ").split(" ") font_properties = [x for x in font_properties if x != ""] if font_name: font_properties[font_properties.index("Tf") - 2] = font_name @@ -100,89 +102,94 @@ def from_text_annotation( else float(font_properties[font_properties.index("Tf") - 1]) ) if font_height == 0: - if field.get(FA.Ff, 0) & FA.FfBits.Multiline: + if field.get(FieldDictionaryAttributes.Ff, 0) & FieldDictionaryAttributes.FfBits.Multiline: font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE else: - font_height = rct.height - 2 + font_height = rectangle.height - 2 font_properties[font_properties.index("Tf") - 1] = str(font_height) - da = " ".join(font_properties) - 
y_offset = rct.height - 1 - font_height + default_appearance = " ".join(font_properties) + y_offset = rectangle.height - 1 - font_height # Retrieve font information from local DR ... - dr: Any = cast( + document_resources: Any = cast( DictionaryObject, cast( DictionaryObject, annotation.get_inherited( "/DR", - af.get("/DR", DictionaryObject()), + acro_form.get("/DR", DictionaryObject()), ), ).get_object(), ) - dr = dr.get("/Font", DictionaryObject()).get_object() + document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object() # _default_fonts_space_width keys is the list of Standard fonts - if font_name not in dr and font_name not in _default_fonts_space_width: + if font_name not in document_font_resources and font_name not in _default_fonts_space_width: # ...or AcroForm dictionary - dr = cast( + document_resources = cast( dict[Any, Any], - af.get("/DR", {}), + acro_form.get("/DR", {}), ) - dr = dr.get_object().get("/Font", DictionaryObject()).get_object() - font_res = dr.get(font_name, None) - if not is_null_or_none(font_res): - font_res = cast(DictionaryObject, font_res.get_object()) + document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object() + font_resource = document_font_resources.get(font_name, None) + if not is_null_or_none(font_resource): + font_resource = cast(DictionaryObject, font_resource.get_object()) _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( - 200, font_res + 200, font_resource ) try: # remove width stored in -1 key del font_map[-1] except KeyError: pass - font_full_rev: dict[str, bytes] + font_glyph_byte_map: dict[str, bytes] if isinstance(font_encoding, str): - font_full_rev = { - v: k.encode(font_encoding) for k, v in font_map.items() + font_glyph_byte_map = { + value: key.encode(font_encoding) for key, value in font_map.items() } else: - font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()} - font_encoding_rev = {v: bytes((k,)) for 
k, v in font_encoding.items()} + font_glyph_byte_map = {value: bytes((key,)) for key, value in font_encoding.items()} + font_encoding_rev = {value: bytes((key,)) for key, value in font_encoding.items()} for key, value in font_map.items(): - font_full_rev[value] = font_encoding_rev.get(key, key) + font_glyph_byte_map[value] = font_encoding_rev.get(key, key) else: logger_warning(f"Font dictionary for {font_name} not found.", __name__) - font_full_rev = {} + font_glyph_byte_map = {} # Retrieve field text and selected values - field_flags = field.get(FA.Ff, 0) - if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: - txt = "\n".join(annotation.get_inherited(FA.Opt, [])) - sel = field.get("/V", []) - if not isinstance(sel, list): - sel = [sel] + field_flags = field.get(FieldDictionaryAttributes.Ff, 0) + if ( + field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and + field_flags & FieldDictionaryAttributes.FfBits.Combo == 0 + ): + text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, [])) + selection = field.get("/V", []) + if not isinstance(selection, list): + selection = [selection] else: # /Tx - txt = field.get("/V", "") - sel = [] + text = field.get("/V", "") + selection = [] # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) - txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") + text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") # Create the TextStreamAppearance instance new_appearance_stream = cls( - txt, sel, da, font_full_rev, rct, font_height, y_offset + text, selection, default_appearance, font_glyph_byte_map, rectangle, font_height, y_offset ) - if AA.AP in annotation: - for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items(): - if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: - new_appearance_stream[k] = v + if AnnotationDictionaryAttributes.AP in annotation: + for key, value in ( + cast(DictionaryObject, 
annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items() + ): + if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: + new_appearance_stream[key] = value # Update Resources with font information if necessary - if font_res is not None: + if font_resource is not None: new_appearance_stream[NameObject("/Resources")] = DictionaryObject( { NameObject("/Font"): DictionaryObject( { NameObject(font_name): getattr( - font_res, "indirect_reference", font_res + font_resource, "indirect_reference", font_resource ) } ) From d6152a28a97c04b9fd86e56be30f32fda77b9f90 Mon Sep 17 00:00:00 2001 From: PJBrs Date: Tue, 30 Sep 2025 12:34:28 +0200 Subject: [PATCH 06/17] MAINT: _appearance_stream: Rename font_height to font_size This patch removes the variable name "font_height", because it means the same thing as font size. I think that font_height was introduced previously to distinguish between a font size found in an annotation's default appearance and the size set by a user. To be consistent, also use the variable user_font_name when it pertains to a user choice, and font_name for a font name found in a default appearance. 
--- pypdf/generic/_appearance_stream.py | 37 ++++++++++++++++------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 9991c4a77..9b4c2d69c 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -12,7 +12,7 @@ ) from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none -DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12 +DEFAULT_FONT_SIZE_IN_MULTILINE = 12 class TextStreamAppearance(DecodedStreamObject): @@ -28,7 +28,7 @@ def __init__( default_appearance: str = "", font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), - font_height: float = 0, + font_size: float = 0, y_offset: float = 0, ) -> None: super().__init__() @@ -43,14 +43,14 @@ def __init__( if selection and line in selection: # may be improved but cannot find how to get fill working => replaced with lined box ap_stream += ( - f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rectangle.width - 2} {font_height + 2} re\n" + f"1 {y_offset - (line_number * font_size * 1.4) - 1} {rectangle.width - 2} {font_size + 2} re\n" f"0.5 0.5 0.5 rg s\n{default_appearance}\n" ).encode() if line_number == 0: ap_stream += f"2 {y_offset} Td\n".encode() else: # Td is a relative translation - ap_stream += f"0 {-font_height * 1.4} Td\n".encode() + ap_stream += f"0 {-font_size * 1.4} Td\n".encode() encoded_line: list[bytes] = [ font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line ] @@ -72,8 +72,8 @@ def from_text_annotation( acro_form: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) field: DictionaryObject, annotation: DictionaryObject, - font_name: str = "", - font_size: float = -1, + user_font_name: str = "", + user_font_size: float = -1, ) -> "TextStreamAppearance": """Creates a TextStreamAppearance object from a given text field annotation.""" @@ -92,23 
+92,25 @@ def from_text_annotation( default_appearance = default_appearance.get_object() font_properties = default_appearance.replace("\n", " ").replace("\r", " ").split(" ") font_properties = [x for x in font_properties if x != ""] - if font_name: - font_properties[font_properties.index("Tf") - 2] = font_name + if user_font_name: + font_name = user_font_name + font_properties[font_properties.index("Tf") - 2] = user_font_name else: font_name = font_properties[font_properties.index("Tf") - 2] - font_height = ( - font_size - if font_size >= 0 + font_size = ( + user_font_size + if user_font_size >= 0 else float(font_properties[font_properties.index("Tf") - 1]) ) - if font_height == 0: + if font_size == 0: # Only when not set and / or 0 in default appearance if field.get(FieldDictionaryAttributes.Ff, 0) & FieldDictionaryAttributes.FfBits.Multiline: - font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE + font_size = DEFAULT_FONT_SIZE_IN_MULTILINE # 12 else: - font_height = rectangle.height - 2 - font_properties[font_properties.index("Tf") - 1] = str(font_height) + font_size = rectangle.height - 2 # Set as large as possible + font_properties[font_properties.index("Tf") - 1] = str(font_size) default_appearance = " ".join(font_properties) - y_offset = rectangle.height - 1 - font_height + + y_offset = rectangle.height - 1 - font_size # Retrieve font information from local DR ... 
document_resources: Any = cast( @@ -167,12 +169,13 @@ def from_text_annotation( else: # /Tx text = field.get("/V", "") selection = [] + # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") # Create the TextStreamAppearance instance new_appearance_stream = cls( - text, selection, default_appearance, font_glyph_byte_map, rectangle, font_height, y_offset + text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, y_offset ) if AnnotationDictionaryAttributes.AP in annotation: From e6e069d53a75a33cc94b04e6bce6aacd2db6a90e Mon Sep 17 00:00:00 2001 From: PJBrs Date: Tue, 30 Sep 2025 18:02:53 +0200 Subject: [PATCH 07/17] MAINT: _appearance_stream: More comments This patch adds more comments, especially to the from_text_annotation method, in the hope that this will later ease further refactoring. --- pypdf/generic/_appearance_stream.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 9b4c2d69c..1212dd8c7 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -41,7 +41,7 @@ def __init__( ).encode() for line_number, line in enumerate(text.replace("\n", "\r").split("\r")): if selection and line in selection: - # may be improved but cannot find how to get fill working => replaced with lined box + # Might be improved, but cannot find how to get fill working => replaced with lined box ap_stream += ( f"1 {y_offset - (line_number * font_size * 1.4) - 1} {rectangle.width - 2} {font_size + 2} re\n" f"0.5 0.5 0.5 rg s\n{default_appearance}\n" @@ -81,38 +81,49 @@ def from_text_annotation( _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect]) rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1]))) - # Extract font information + # Get default 
appearance dictionary from annotation default_appearance = annotation.get_inherited( AnnotationDictionaryAttributes.DA, acro_form.get(AnnotationDictionaryAttributes.DA, None), ) if default_appearance is None: + # Create a default appearance if none was found in the annotation default_appearance = TextStringObject("/Helv 0 Tf 0 g") else: default_appearance = default_appearance.get_object() + + # Embed user-provided font name and font size in the default appearance, also + # taking into account whether the field flags indicate a multiline field. + # Uses the variable font_properties as an intermediate. font_properties = default_appearance.replace("\n", " ").replace("\r", " ").split(" ") font_properties = [x for x in font_properties if x != ""] + # Override default appearance font name with user provided font name, if given. if user_font_name: font_name = user_font_name font_properties[font_properties.index("Tf") - 2] = user_font_name else: + # Indirectly this just reads font_name from default appearance. font_name = font_properties[font_properties.index("Tf") - 2] + # Override default appearance font size with user provided font size, if given. font_size = ( user_font_size if user_font_size >= 0 else float(font_properties[font_properties.index("Tf") - 1]) ) + # Parse the field flags to find whether we need to wrap text, find whether we need to scale font size if font_size == 0: # Only when not set and / or 0 in default appearance if field.get(FieldDictionaryAttributes.Ff, 0) & FieldDictionaryAttributes.FfBits.Multiline: font_size = DEFAULT_FONT_SIZE_IN_MULTILINE # 12 else: font_size = rectangle.height - 2 # Set as large as possible font_properties[font_properties.index("Tf") - 1] = str(font_size) + # Reconstruct default appearance with user info and flags information default_appearance = " ".join(font_properties) + # Set the vertical offset y_offset = rectangle.height - 1 - font_size - # Retrieve font information from local DR ... 
+ # Try to find a resource dictionary for the font document_resources: Any = cast( DictionaryObject, cast( @@ -133,6 +144,8 @@ def from_text_annotation( ) document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object() font_resource = document_font_resources.get(font_name, None) + + # If this annotation has a font resources, get the font character map if not is_null_or_none(font_resource): font_resource = cast(DictionaryObject, font_resource.get_object()) _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( From ae06c59ac8c9b19f0ead14d9161a32911b53c7e7 Mon Sep 17 00:00:00 2001 From: PJBrs Date: Sun, 26 Oct 2025 15:02:54 +0100 Subject: [PATCH 08/17] MAINT: _writer.py: Make some variables more readable This patch aims to make a couple of variables and associated imports more readable by writing them out in full instead of having very short abbreviations. --- pypdf/_writer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 3a27dfddf..b4dd9db14 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -962,8 +962,8 @@ def update_page_form_field_values( """ if CatalogDictionary.ACRO_FORM not in self._root_object: raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object") - af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]) - if InteractiveFormDictEntries.Fields not in af: + acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]) + if InteractiveFormDictEntries.Fields not in acro_form: raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object") if isinstance(auto_regenerate, bool): self.set_need_appearances_writer(auto_regenerate) @@ -990,7 +990,7 @@ def update_page_form_field_values( ).get_object() for field, value in fields.items(): - rct = cast(RectangleObject, annotation[AA.Rect]) + rectangle = cast(RectangleObject, annotation[AA.Rect]) if not ( 
self._get_qualified_field_name(parent_annotation) == field or parent_annotation.get("/T", None) == field @@ -1035,11 +1035,11 @@ def update_page_form_field_values( # Textbox; we need to generate the appearance stream object if isinstance(value, tuple): appearance_stream_obj = TextStreamAppearance.from_text_annotation( - af, parent_annotation, annotation, value[1], value[2] + acro_form, parent_annotation, annotation, value[1], value[2] ) else: appearance_stream_obj = TextStreamAppearance.from_text_annotation( - af, parent_annotation, annotation + acro_form, parent_annotation, annotation ) # Add the appearance stream object if AA.AP not in annotation: @@ -1059,7 +1059,7 @@ def update_page_form_field_values( ): # deprecated # not implemented yet logger_warning("Signature forms not implemented yet", __name__) if flatten and appearance_stream_obj is not None: - self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1]) + self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1]) def reattach_fields( self, page: Optional[PageObject] = None From 008aaa2258e2280c9ef860b38b2ef56dc6fba50a Mon Sep 17 00:00:00 2001 From: PJBrs Date: Thu, 18 Sep 2025 17:56:04 +0200 Subject: [PATCH 09/17] MAINT: _appearance_stream: Factor out generation of text appearance This patch makes the code for producing the appearance stream data into a separate method. --- pypdf/generic/_appearance_stream.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 1212dd8c7..5b58d8656 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -21,7 +21,7 @@ class TextStreamAppearance(DecodedStreamObject): This class is similar in form to the FreeText class in pypdf. 
""" - def __init__( + def _generate_appearance_stream_data( self, text: str = "", selection: Optional[list[str]] = None, @@ -30,8 +30,7 @@ def __init__( rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), font_size: float = 0, y_offset: float = 0, - ) -> None: - super().__init__() + ) -> bytes: font_glyph_byte_map = font_glyph_byte_map or {} if isinstance(rectangle, tuple): rectangle = RectangleObject(rectangle) @@ -59,12 +58,32 @@ def __init__( else: ap_stream += b"(" + b"".join(encoded_line) + b") Tj\n" ap_stream += b"ET\nQ\nEMC\nQ\n" + return ap_stream + + def __init__( + self, + text: str = "", + selection: Optional[list[str]] = None, + default_appearance: str = "", + font_glyph_byte_map: Optional[dict[str, bytes]] = None, + rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), + font_size: float = 0, + y_offset: float = 0, + ) -> None: + super().__init__() + font_glyph_byte_map = font_glyph_byte_map or {} + if isinstance(rectangle, tuple): + rectangle = RectangleObject(rectangle) + + ap_stream_data = self._generate_appearance_stream_data( + text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, y_offset, + ) self[NameObject("/Type")] = NameObject("/XObject") self[NameObject("/Subtype")] = NameObject("/Form") self[NameObject("/BBox")] = rectangle - self.set_data(ByteStringObject(ap_stream)) - self[NameObject("/Length")] = NumberObject(len(ap_stream)) + self.set_data(ByteStringObject(ap_stream_data)) + self[NameObject("/Length")] = NumberObject(len(ap_stream_data)) @classmethod def from_text_annotation( From a4c634e8ecb662a9dfcc821585e5b5ccd6a9e3df Mon Sep 17 00:00:00 2001 From: PJBrs Date: Fri, 19 Sep 2025 10:26:16 +0200 Subject: [PATCH 10/17] MAINT: _appearance_stream: Move y_offset calculation The y_offset calculation occurs very early on in the code, necessitating carrying it across various methods. This patch simplifies that logic. 
--- pypdf/generic/_appearance_stream.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 5b58d8656..1de5c80ec 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -29,15 +29,19 @@ def _generate_appearance_stream_data( font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), font_size: float = 0, - y_offset: float = 0, ) -> bytes: font_glyph_byte_map = font_glyph_byte_map or {} if isinstance(rectangle, tuple): rectangle = RectangleObject(rectangle) + + # Set the vertical offset + y_offset = rectangle.height - 1 - font_size + ap_stream = ( f"q\n/Tx BMC \nq\n1 1 {rectangle.width - 1} {rectangle.height - 1} " f"re\nW\nBT\n{default_appearance}\n" ).encode() + for line_number, line in enumerate(text.replace("\n", "\r").split("\r")): if selection and line in selection: # Might be improved, but cannot find how to get fill working => replaced with lined box @@ -68,7 +72,6 @@ def __init__( font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), font_size: float = 0, - y_offset: float = 0, ) -> None: super().__init__() font_glyph_byte_map = font_glyph_byte_map or {} @@ -76,7 +79,7 @@ def __init__( rectangle = RectangleObject(rectangle) ap_stream_data = self._generate_appearance_stream_data( - text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, y_offset, + text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, ) self[NameObject("/Type")] = NameObject("/XObject") @@ -139,9 +142,6 @@ def from_text_annotation( # Reconstruct default appearance with user info and flags information default_appearance = " ".join(font_properties) - # Set the vertical offset - y_offset = rectangle.height - 1 - 
font_size - # Try to find a resource dictionary for the font document_resources: Any = cast( DictionaryObject, @@ -207,7 +207,7 @@ def from_text_annotation( # Create the TextStreamAppearance instance new_appearance_stream = cls( - text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, y_offset + text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size ) if AnnotationDictionaryAttributes.AP in annotation: From ef5d23bcbfda0abcff5074e210398855a5814cf2 Mon Sep 17 00:00:00 2001 From: PJBrs Date: Fri, 19 Sep 2025 11:24:00 +0200 Subject: [PATCH 11/17] MAINT: _appearance_stream: Move multiline parsing This moves parsing the multiline field flag to the place where the other field flags are parsed, and moves the consequences for font size elsewhere. --- pypdf/generic/_appearance_stream.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 1de5c80ec..c040bfab4 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -1,3 +1,4 @@ +import re from typing import Any, Optional, Union, cast from .._cmap import _default_fonts_space_width, build_char_map_from_dict @@ -29,11 +30,20 @@ def _generate_appearance_stream_data( font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), font_size: float = 0, + multiline: bool = False ) -> bytes: font_glyph_byte_map = font_glyph_byte_map or {} if isinstance(rectangle, tuple): rectangle = RectangleObject(rectangle) + # If font_size is 0, apply the logic for multiline or large-as-possible font + if font_size == 0: + if multiline: + font_size = DEFAULT_FONT_SIZE_IN_MULTILINE + else: + font_size = rectangle.height - 2 + default_appearance = re.sub(r"0.0 Tf", str(font_size) + r" Tf", default_appearance) + # Set the vertical offset y_offset = 
rectangle.height - 1 - font_size @@ -72,6 +82,7 @@ def __init__( font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), font_size: float = 0, + multiline: bool = False ) -> None: super().__init__() font_glyph_byte_map = font_glyph_byte_map or {} @@ -79,7 +90,7 @@ def __init__( rectangle = RectangleObject(rectangle) ap_stream_data = self._generate_appearance_stream_data( - text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, + text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, multiline ) self[NameObject("/Type")] = NameObject("/XObject") @@ -132,12 +143,6 @@ def from_text_annotation( if user_font_size >= 0 else float(font_properties[font_properties.index("Tf") - 1]) ) - # Parse the field flags to find whether we need to wrap text, find whether we need to scale font size - if font_size == 0: # Only when not set and / or 0 in default appearance - if field.get(FieldDictionaryAttributes.Ff, 0) & FieldDictionaryAttributes.FfBits.Multiline: - font_size = DEFAULT_FONT_SIZE_IN_MULTILINE # 12 - else: - font_size = rectangle.height - 2 # Set as large as possible font_properties[font_properties.index("Tf") - 1] = str(font_size) # Reconstruct default appearance with user info and flags information default_appearance = " ".join(font_properties) @@ -188,8 +193,11 @@ def from_text_annotation( logger_warning(f"Font dictionary for {font_name} not found.", __name__) font_glyph_byte_map = {} - # Retrieve field text and selected values + # Retrieve field text, selected values and formatting information + multiline = False field_flags = field.get(FieldDictionaryAttributes.Ff, 0) + if field_flags & FieldDictionaryAttributes.FfBits.Multiline: + multiline = True if ( field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and field_flags & FieldDictionaryAttributes.FfBits.Combo == 0 @@ -207,7 +215,7 @@ def from_text_annotation( # Create the 
TextStreamAppearance instance new_appearance_stream = cls( - text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size + text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, multiline ) if AnnotationDictionaryAttributes.AP in annotation: From d964c43687f7257c68d3a007e659786c8667da6c Mon Sep 17 00:00:00 2001 From: PJBrs Date: Fri, 19 Sep 2025 16:41:12 +0200 Subject: [PATCH 12/17] MAINT: _appearance_stream: Don't pass default_appearance Instead of passing around default appearance, construct it from given font name, size and color. Also, having a default appearance as an argument for a text stream appearance seems less "natural" than just passing font name, size and color. This patch also represents a small number of simplifications that improve test coverage. --- pypdf/generic/_appearance_stream.py | 70 +++++++++++++++++------------ tests/test_writer.py | 2 +- 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index c040bfab4..36888ad2b 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -26,10 +26,11 @@ def _generate_appearance_stream_data( self, text: str = "", selection: Optional[list[str]] = None, - default_appearance: str = "", font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), - font_size: float = 0, + font_name: str = "/Helv", + font_size: float = 0.0, + font_color: str = "0 g", multiline: bool = False ) -> bytes: font_glyph_byte_map = font_glyph_byte_map or {} @@ -42,11 +43,11 @@ def _generate_appearance_stream_data( font_size = DEFAULT_FONT_SIZE_IN_MULTILINE else: font_size = rectangle.height - 2 - default_appearance = re.sub(r"0.0 Tf", str(font_size) + r" Tf", default_appearance) # Set the vertical offset y_offset = rectangle.height - 1 - font_size + default_appearance = f"{font_name} 
{font_size} Tf {font_color}" ap_stream = ( f"q\n/Tx BMC \nq\n1 1 {rectangle.width - 1} {rectangle.height - 1} " f"re\nW\nBT\n{default_appearance}\n" @@ -78,10 +79,11 @@ def __init__( self, text: str = "", selection: Optional[list[str]] = None, - default_appearance: str = "", font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), - font_size: float = 0, + font_name: str = "/Helv", + font_size: float = 0.0, + font_color: str = "0 g", multiline: bool = False ) -> None: super().__init__() @@ -90,12 +92,19 @@ def __init__( rectangle = RectangleObject(rectangle) ap_stream_data = self._generate_appearance_stream_data( - text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, multiline + text, + selection, + font_glyph_byte_map, + rectangle, + font_name, + font_size, + font_color, + multiline ) self[NameObject("/Type")] = NameObject("/XObject") self[NameObject("/Subtype")] = NameObject("/Form") - self[NameObject("/BBox")] = rectangle + self[NameObject("/BBox")] = RectangleObject(rectangle) self.set_data(ByteStringObject(ap_stream_data)) self[NameObject("/Length")] = NumberObject(len(ap_stream_data)) @@ -119,33 +128,31 @@ def from_text_annotation( AnnotationDictionaryAttributes.DA, acro_form.get(AnnotationDictionaryAttributes.DA, None), ) - if default_appearance is None: + if not default_appearance: # Create a default appearance if none was found in the annotation default_appearance = TextStringObject("/Helv 0 Tf 0 g") else: default_appearance = default_appearance.get_object() - # Embed user-provided font name and font size in the default appearance, also - # taking into account whether the field flags indicate a multiline field. - # Uses the variable font_properties as an intermediate. 
- font_properties = default_appearance.replace("\n", " ").replace("\r", " ").split(" ") - font_properties = [x for x in font_properties if x != ""] - # Override default appearance font name with user provided font name, if given. + # Derive font name, size and color from the default appearance. Also set + # user-provided font name and font size in the default appearance, if given. + # For a font name, this presumes that we can find an associated font resource + # dictionary. Uses the variable font_properties as an intermediate. + # As per the PDF spec: + # "At a minimum, the string [that is, default_appearance] shall include a Tf (text + # font) operator along with its two operands, font and size" (Section 12.7.4.3 + # "Variable text" of the PDF 2.0 specification). + font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop] + font_name = font_properties.pop(font_properties.index("Tf") - 2) + font_size = float(font_properties.pop(font_properties.index("Tf") - 1)) + font_properties.remove("Tf") + font_color = " ".join(font_properties) + # Determine the font name to use, prioritizing the user's input if user_font_name: font_name = user_font_name - font_properties[font_properties.index("Tf") - 2] = user_font_name - else: - # Indirectly this just reads font_name from default appearance. - font_name = font_properties[font_properties.index("Tf") - 2] - # Override default appearance font size with user provided font size, if given. 
- font_size = ( - user_font_size - if user_font_size >= 0 - else float(font_properties[font_properties.index("Tf") - 1]) - ) - font_properties[font_properties.index("Tf") - 1] = str(font_size) - # Reconstruct default appearance with user info and flags information - default_appearance = " ".join(font_properties) + # Determine the font size to use, prioritizing the user's input + if user_font_size > 0: + font_size = user_font_size # Try to find a resource dictionary for the font document_resources: Any = cast( @@ -215,7 +222,14 @@ def from_text_annotation( # Create the TextStreamAppearance instance new_appearance_stream = cls( - text, selection, default_appearance, font_glyph_byte_map, rectangle, font_size, multiline + text, + selection, + font_glyph_byte_map, + rectangle, + font_name, + font_size, + font_color, + multiline ) if AnnotationDictionaryAttributes.AP in annotation: diff --git a/tests/test_writer.py b/tests/test_writer.py index ce8d72c5f..c76f76a44 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2406,7 +2406,7 @@ def test_selfont(): b"Text_1" in writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"].get_data() ) assert ( - b"/F3 12 Tf" + b"/F3 12.0 Tf" in writer.pages[0]["/Annots"][2].get_object()["/AP"]["/N"].get_data() ) assert ( From 71ecbfae6e50dab74066f01ae60432a9a529d563 Mon Sep 17 00:00:00 2001 From: PJBrs Date: Sun, 21 Sep 2025 21:07:12 +0200 Subject: [PATCH 13/17] MAINT: _appearance_stream: Move font_resource parsing Move the font resource parsing code to TextStreamAppearance, in the hope that, later, one might be able to generate a TextStreamAppearance directly. I wonder, though, where the necessary font resource would come from. 
--- pypdf/generic/_appearance_stream.py | 87 ++++++++++++++--------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 36888ad2b..cb016502a 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -26,8 +26,8 @@ def _generate_appearance_stream_data( self, text: str = "", selection: Optional[list[str]] = None, - font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), + font_glyph_byte_map: Optional[dict[str, bytes]] = None, font_name: str = "/Helv", font_size: float = 0.0, font_color: str = "0 g", @@ -46,8 +46,8 @@ def _generate_appearance_stream_data( # Set the vertical offset y_offset = rectangle.height - 1 - font_size - default_appearance = f"{font_name} {font_size} Tf {font_color}" + ap_stream = ( f"q\n/Tx BMC \nq\n1 1 {rectangle.width - 1} {rectangle.height - 1} " f"re\nW\nBT\n{default_appearance}\n" @@ -79,23 +79,44 @@ def __init__( self, text: str = "", selection: Optional[list[str]] = None, - font_glyph_byte_map: Optional[dict[str, bytes]] = None, rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), + font_resource: Optional[DictionaryObject] = None, font_name: str = "/Helv", font_size: float = 0.0, font_color: str = "0 g", multiline: bool = False ) -> None: super().__init__() - font_glyph_byte_map = font_glyph_byte_map or {} - if isinstance(rectangle, tuple): - rectangle = RectangleObject(rectangle) + + # If a font resource was added, get the font character map + if font_resource: + font_resource = cast(DictionaryObject, font_resource.get_object()) + _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( + 200, font_resource + ) + try: # remove width stored in -1 key + del font_map[-1] + except KeyError: + pass + font_glyph_byte_map: dict[str, bytes] + if 
isinstance(font_encoding, str): + font_glyph_byte_map = { + v: k.encode(font_encoding) for k, v in font_map.items() + } + else: + font_glyph_byte_map = {v: bytes((k,)) for k, v in font_encoding.items()} + font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + for key, value in font_map.items(): + font_glyph_byte_map[value] = font_encoding_rev.get(key, key) + else: + logger_warning(f"Font dictionary for {font_name} not found.", __name__) + font_glyph_byte_map = {} ap_stream_data = self._generate_appearance_stream_data( text, selection, - font_glyph_byte_map, rectangle, + font_glyph_byte_map, font_name, font_size, font_color, @@ -107,6 +128,19 @@ def __init__( self[NameObject("/BBox")] = RectangleObject(rectangle) self.set_data(ByteStringObject(ap_stream_data)) self[NameObject("/Length")] = NumberObject(len(ap_stream_data)) + # Update Resources with font information if necessary + if font_resource is not None: + self[NameObject("/Resources")] = DictionaryObject( + { + NameObject("/Font"): DictionaryObject( + { + NameObject(font_name): getattr( + font_resource, "indirect_reference", font_resource + ) + } + ) + } + ) @classmethod def from_text_annotation( @@ -175,30 +209,8 @@ def from_text_annotation( ) document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object() font_resource = document_font_resources.get(font_name, None) - - # If this annotation has a font resources, get the font character map if not is_null_or_none(font_resource): font_resource = cast(DictionaryObject, font_resource.get_object()) - _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( - 200, font_resource - ) - try: # remove width stored in -1 key - del font_map[-1] - except KeyError: - pass - font_glyph_byte_map: dict[str, bytes] - if isinstance(font_encoding, str): - font_glyph_byte_map = { - value: key.encode(font_encoding) for key, value in font_map.items() - } - else: - font_glyph_byte_map = {value: bytes((key,)) for 
key, value in font_encoding.items()} - font_encoding_rev = {value: bytes((key,)) for key, value in font_encoding.items()} - for key, value in font_map.items(): - font_glyph_byte_map[value] = font_encoding_rev.get(key, key) - else: - logger_warning(f"Font dictionary for {font_name} not found.", __name__) - font_glyph_byte_map = {} # Retrieve field text, selected values and formatting information multiline = False @@ -224,14 +236,13 @@ def from_text_annotation( new_appearance_stream = cls( text, selection, - font_glyph_byte_map, rectangle, + font_resource, font_name, font_size, font_color, multiline ) - if AnnotationDictionaryAttributes.AP in annotation: for key, value in ( cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items() @@ -239,18 +250,4 @@ def from_text_annotation( if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: new_appearance_stream[key] = value - # Update Resources with font information if necessary - if font_resource is not None: - new_appearance_stream[NameObject("/Resources")] = DictionaryObject( - { - NameObject("/Font"): DictionaryObject( - { - NameObject(font_name): getattr( - font_resource, "indirect_reference", font_resource - ) - } - ) - } - ) - return new_appearance_stream From 1a0629b2f9c3fc60680c2578a8508adfd05b7b8a Mon Sep 17 00:00:00 2001 From: PJBrs Date: Sat, 20 Sep 2025 14:57:25 +0200 Subject: [PATCH 14/17] MAINT: _appearance_stream: Document all methods --- pypdf/generic/_appearance_stream.py | 71 +++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index cb016502a..aa2e9fbc2 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -19,9 +19,11 @@ class TextStreamAppearance(DecodedStreamObject): """ A class representing the appearance stream for a text-based form field. - This class is similar in form to the FreeText class in pypdf. 
- """ + This class generates the content stream (the `ap_stream_data`) that dictates + how text is rendered within a form field's bounding box. It handles properties + like font, font size, color, multiline text, and text selection highlighting. + """ def _generate_appearance_stream_data( self, text: str = "", @@ -33,6 +35,31 @@ def _generate_appearance_stream_data( font_color: str = "0 g", multiline: bool = False ) -> bytes: + """ + Generates the raw bytes of the PDF appearance stream for a text field. + + This private method assembles the PDF content stream operators to draw + the provided text within the specified rectangle. It handles text positioning, + font application, color, and special formatting like selected text. + + Args: + text: The text to be rendered in the form field. + selection: An optional list of strings that should be highlighted as selected. + font_glyph_byte_map: An optional dictionary mapping characters to their + byte representation for glyph encoding. + rectangle: The bounding box of the form field. Can be a `RectangleObject` + or a tuple of four floats (x1, y1, x2, y2). + font_name: The name of the font resource to use (e.g., "/Helv"). + font_size: The font size. If 0, it is automatically calculated + based on whether the field is multiline or not. + font_color: The color to apply to the font, represented as a PDF + graphics state string (e.g., "0 g" for black). + multiline: A boolean indicating if the text field is multiline. + + Returns: + A byte string containing the PDF content stream data. + + """ font_glyph_byte_map = font_glyph_byte_map or {} if isinstance(rectangle, tuple): rectangle = RectangleObject(rectangle) @@ -86,6 +113,25 @@ def __init__( font_color: str = "0 g", multiline: bool = False ) -> None: + """ + Initializes a TextStreamAppearance object. + + This constructor creates a new PDF stream object configured as an XObject + of subtype Form. It uses the `_generate_appearance_stream_data` method to generate + the content for the stream. 
+ + Args: + text: The text to be rendered in the form field. + selection: An optional list of strings that should be highlighted as selected. + rectangle: The bounding box of the form field. Can be a `RectangleObject` + or a tuple of four floats (x1, y1, x2, y2). + font_resource: An optional variable that represents a PDF font dictionary. + font_name: The name of the font resource, e.g., "/Helv". + font_size: The font size. If 0, it's auto-calculated. + font_color: The font color string. + multiline: A boolean indicating if the text field is multiline. + + """ super().__init__() # If a font resource was added, get the font character map @@ -151,8 +197,27 @@ def from_text_annotation( user_font_name: str = "", user_font_size: float = -1, ) -> "TextStreamAppearance": - """Creates a TextStreamAppearance object from a given text field annotation.""" + """ + Creates a TextStreamAppearance object from a text field annotation. + + This class method is a factory for creating a `TextStreamAppearance` + instance by extracting all necessary information (bounding box, font, + text content, etc.) from the PDF field and annotation dictionaries. + It respects inheritance for properties like default appearance (`/DA`). + + Args: + acro_form: The root AcroForm dictionary from the PDF catalog. + field: The field dictionary object. + annotation: The widget annotation dictionary object associated with the field. + user_font_name: An optional user-provided font name to override the + default. Defaults to an empty string. + user_font_size: An optional user-provided font size to override the + default. A value of -1 indicates no override. + + Returns: + A new `TextStreamAppearance` instance configured for the given field. 
+ """ # Calculate rectangle dimensions _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect]) rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1]))) From bd08e0310a2021cefbd4f7f589be116f3587acbc Mon Sep 17 00:00:00 2001 From: PJBrs Date: Sun, 26 Oct 2025 16:39:41 +0100 Subject: [PATCH 15/17] MAINT: _appearance_stream: Shorten some code Some code had many lines with one brace; reduce. --- pypdf/generic/_appearance_stream.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index aa2e9fbc2..95f7bcf2c 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -176,17 +176,11 @@ def __init__( self[NameObject("/Length")] = NumberObject(len(ap_stream_data)) # Update Resources with font information if necessary if font_resource is not None: - self[NameObject("/Resources")] = DictionaryObject( - { - NameObject("/Font"): DictionaryObject( - { - NameObject(font_name): getattr( - font_resource, "indirect_reference", font_resource - ) - } - ) - } - ) + self[NameObject("/Resources")] = DictionaryObject({ + NameObject("/Font"): DictionaryObject({ + NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource) + }) + }) @classmethod def from_text_annotation( From d1f01f0f2960ad11ed32b36e4b953172490592cd Mon Sep 17 00:00:00 2001 From: PJBrs Date: Wed, 5 Nov 2025 13:45:59 +0100 Subject: [PATCH 16/17] MAINT: _appearance_stream: Change multiline to is_multiline --- pypdf/generic/_appearance_stream.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index 95f7bcf2c..fe860f783 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -33,7 +33,7 @@ def _generate_appearance_stream_data( font_name: str 
= "/Helv", font_size: float = 0.0, font_color: str = "0 g", - multiline: bool = False + is_multiline: bool = False ) -> bytes: """ Generates the raw bytes of the PDF appearance stream for a text field. @@ -54,7 +54,7 @@ def _generate_appearance_stream_data( based on whether the field is multiline or not. font_color: The color to apply to the font, represented as a PDF graphics state string (e.g., "0 g" for black). - multiline: A boolean indicating if the text field is multiline. + is_multiline: A boolean indicating if the text field is multiline. Returns: A byte string containing the PDF content stream data. @@ -66,7 +66,7 @@ def _generate_appearance_stream_data( # If font_size is 0, apply the logic for multiline or large-as-possible font if font_size == 0: - if multiline: + if is_multiline: font_size = DEFAULT_FONT_SIZE_IN_MULTILINE else: font_size = rectangle.height - 2 @@ -111,7 +111,7 @@ def __init__( font_name: str = "/Helv", font_size: float = 0.0, font_color: str = "0 g", - multiline: bool = False + is_multiline: bool = False ) -> None: """ Initializes a TextStreamAppearance object. @@ -129,7 +129,7 @@ def __init__( font_name: The name of the font resource, e.g., "/Helv". font_size: The font size. If 0, it's auto-calculated. font_color: The font color string. - multiline: A boolean indicating if the text field is multiline. + is_multiline: A boolean indicating if the text field is multiline. 
""" super().__init__() @@ -166,7 +166,7 @@ def __init__( font_name, font_size, font_color, - multiline + is_multiline ) self[NameObject("/Type")] = NameObject("/XObject") @@ -272,10 +272,10 @@ def from_text_annotation( font_resource = cast(DictionaryObject, font_resource.get_object()) # Retrieve field text, selected values and formatting information - multiline = False + is_multiline = False field_flags = field.get(FieldDictionaryAttributes.Ff, 0) if field_flags & FieldDictionaryAttributes.FfBits.Multiline: - multiline = True + is_multiline = True if ( field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and field_flags & FieldDictionaryAttributes.FfBits.Combo == 0 @@ -300,7 +300,7 @@ def from_text_annotation( font_name, font_size, font_color, - multiline + is_multiline ) if AnnotationDictionaryAttributes.AP in annotation: for key, value in ( From 8497da3661651a9b3c01bce2948c204135612653 Mon Sep 17 00:00:00 2001 From: PJBrs Date: Wed, 5 Nov 2025 13:46:57 +0100 Subject: [PATCH 17/17] MAINT: Newline after docstring --- pypdf/generic/_appearance_stream.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py index fe860f783..43e9c1657 100644 --- a/pypdf/generic/_appearance_stream.py +++ b/pypdf/generic/_appearance_stream.py @@ -24,6 +24,7 @@ class TextStreamAppearance(DecodedStreamObject): how text is rendered within a form field's bounding box. It handles properties like font, font size, color, multiline text, and text selection highlighting. """ + def _generate_appearance_stream_data( self, text: str = "",