From e9ea6867ba0ecf87b926f569168c3e48a279e740 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 4 Jun 2023 19:07:52 +0200 Subject: [PATCH 1/4] STY : same attributes between PdfReader and PdfWriter Provide the same interface to access root, info and id in both classes, so that common code can be shared between them --- pypdf/_page_labels.py | 14 +++--- pypdf/_protocols.py | 8 ++++ pypdf/_reader.py | 107 ++++++++++++++++++++++++------------------ pypdf/_writer.py | 22 ++++----- 4 files changed, 87 insertions(+), 64 deletions(-) diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py index 000820fca..a7383ac37 100644 --- a/pypdf/_page_labels.py +++ b/pypdf/_page_labels.py @@ -11,12 +11,12 @@ Example 1 --------- ->>> reader.trailer["/Root"]["/PageLabels"]["/Nums"] +>>> reader._root_object["/PageLabels"]["/Nums"] [0, IndirectObject(18, 0, 139929798197504), 8, IndirectObject(19, 0, 139929798197504)] ->>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][1]) +>>> reader.get_object(reader._root_object["/PageLabels"]["/Nums"][1]) {'/S': '/r'} ->>> reader.get_object(reader.trailer["/Root"]["/PageLabels"]["/Nums"][3]) +>>> reader.get_object(reader._root_object["/PageLabels"]["/Nums"][3]) {'/S': '/D'} Example 2 @@ -57,7 +57,7 @@ aa to zz for the next 26, and so on) """ -from typing import Iterator, Optional, Tuple +from typing import Iterator, Optional, Tuple, cast from ._protocols import PdfReaderProtocol from ._utils import logger_warning @@ -127,10 +127,10 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str: Returns: The label of the page, e.g. "iv" or "4". """ - root = reader.trailer["/Root"] + root = cast(DictionaryObject, reader._root_object) if "/PageLabels" not in root: return str(index + 1) # Fallback - number_tree = root["/PageLabels"] + number_tree = cast(DictionaryObject, root["/PageLabels"]) if "/Nums" in number_tree: # [Nums] shall be an array of the form # [ key 1 value 1 key 2 value 2 ... 
key n value n ] @@ -139,7 +139,7 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str: # The keys shall be sorted in numerical order, # analogously to the arrangement of keys in a name tree # as described in 7.9.6, "Name Trees." - nums = number_tree["/Nums"] + nums = cast(ArrayObject, number_tree["/Nums"]) i = 0 value = None start_index = 0 diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index c6f2bbebd..bf1f8b17e 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -59,6 +59,10 @@ def pages(self) -> List[Any]: def trailer(self) -> Dict[str, Any]: ... + @property + def _root_object(self) -> PdfObjectProtocol: + ... + def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]: ... @@ -83,3 +87,7 @@ def pages(self) -> List[Any]: @property def pdf_header(self) -> bytes: ... + + @property + def _root_object(self) -> PdfObjectProtocol: + ... diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 3bf9909a5..102617903 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -34,6 +34,7 @@ from datetime import datetime from io import BytesIO, UnsupportedOperation from pathlib import Path +from types import NoneType from typing import ( Any, Callable, @@ -329,8 +330,7 @@ def __init__( # Some documents may not have a /ID, use two empty # byte strings instead. Solves # https://github.com/py-pdf/pypdf/issues/608 - id_entry = self.trailer.get(TK.ID) - id1_entry = id_entry[0].get_object().original_bytes if id_entry else b"" + id1_entry = self._ID[0].get_object().original_bytes if self._ID else b"" encrypt_entry = cast( DictionaryObject, self.trailer[TK.ENCRYPT].get_object() ) @@ -348,6 +348,34 @@ def __init__( elif password is not None: raise PdfReadError("Not encrypted file") + @property + def _root_object(self) -> DictionaryObject: + """Provide access to "/Root". 
standardized with PdfWriter.""" + return cast(DictionaryObject, self.trailer[TK.ROOT]) + + @property + def _info(self) -> Optional[DictionaryObject]: + """ + Provide access to "/Info". standardized with PdfWriter. + + Returns: + /Info Dictionary ; None if the entry does not exist + + """ + info = self.trailer.get(TK.INFO, None) + return None if info is None else cast(DictionaryObject, info.get_object()) + + @property + def _ID(self) -> Optional[ArrayObject]: + """ + Provide access to "/ID". standardized with PdfWriter. + + Returns: + /ID array ; None if the entry does not exist + """ + id = self.trailer.get(TK.ID, None) + return None if id is None else cast(ArrayObject, id.get_object()) + @property def pdf_header(self) -> str: """ @@ -375,9 +403,9 @@ def metadata(self) -> Optional[DocumentInformation]: """ if TK.INFO not in self.trailer: return None - obj = self.trailer[TK.INFO] + obj = self._info retval = DocumentInformation() - if isinstance(obj, type(None)): + if obj is None: raise PdfReadError( "trailer not found or does not point to document information directory" ) @@ -408,7 +436,7 @@ def xmp_metadata(self) -> Optional[XmpInformation]: """XMP (Extensible Metadata Platform) data.""" try: self._override_encryption = True - return self.trailer[TK.ROOT].xmp_metadata # type: ignore + return self._root_object.xmp_metadata # type: ignore finally: self._override_encryption = False @@ -446,7 +474,7 @@ def _get_num_pages(self) -> int: # the PDF file's page count is used in this case. Otherwise, # the original method (flattened page count) is used. 
if self.is_encrypted: - return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore + return self._root_object["/Pages"]["/Count"] # type: ignore else: if self.flattened_pages is None: self._flatten() @@ -546,10 +574,9 @@ def get_fields( field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict()) if retval is None: retval = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) # get the AcroForm tree - if CD.ACRO_FORM in catalog: - tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM]) + if CD.ACRO_FORM in self._root_object: + tree = cast(Optional[TreeObject], self._root_object[CD.ACRO_FORM]) else: return None if tree is None: @@ -739,13 +766,12 @@ def _get_named_destinations( """ if retval is None: retval = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) # get the name tree - if CA.DESTS in catalog: - tree = cast(TreeObject, catalog[CA.DESTS]) - elif CA.NAMES in catalog: - names = cast(DictionaryObject, catalog[CA.NAMES]) + if CA.DESTS in self._root_object: + tree = cast(TreeObject, self._root_object[CA.DESTS]) + elif CA.NAMES in self._root_object: + names = cast(DictionaryObject, self._root_object[CA.NAMES]) if CA.DESTS in names: tree = cast(TreeObject, names[CA.DESTS]) @@ -825,11 +851,10 @@ def _get_outline( ) -> OutlineType: if outline is None: outline = [] - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) # get the outline dictionary and named destinations - if CO.OUTLINES in catalog: - lines = cast(DictionaryObject, catalog[CO.OUTLINES]) + if CO.OUTLINES in self._root_object: + lines = cast(DictionaryObject, self._root_object[CO.OUTLINES]) if isinstance(lines, NullObject): return outline @@ -882,9 +907,8 @@ def threads(self) -> Optional[ArrayObject]: It's an array of dictionaries with "/F" and "/I" properties or None if there are no articles. 
""" - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - if CO.THREADS in catalog: - return cast("ArrayObject", catalog[CO.THREADS]) + if CO.THREADS in self._root_object: + return cast("ArrayObject", self._root_object[CO.THREADS]) else: return None @@ -1097,9 +1121,8 @@ def page_layout(self) -> Optional[str]: * - /TwoPageRight - Show two pages at a time, odd-numbered pages on the right """ - trailer = cast(DictionaryObject, self.trailer[TK.ROOT]) - if CD.PAGE_LAYOUT in trailer: - return cast(NameObject, trailer[CD.PAGE_LAYOUT]) + if CD.PAGE_LAYOUT in self._root_object: + return cast(NameObject, self._root_object[CD.PAGE_LAYOUT]) return None def getPageLayout(self) -> Optional[str]: # deprecated @@ -1143,7 +1166,7 @@ def page_mode(self) -> Optional[PagemodeType]: - Show attachments panel """ try: - return self.trailer[TK.ROOT]["/PageMode"] # type: ignore + return self._root_object["/PageMode"] # type: ignore except KeyError: return None @@ -1183,7 +1206,7 @@ def _flatten( if pages is None: # Fix issue 327: set flattened_pages attribute only for # decrypted file - catalog = self.trailer[TK.ROOT].get_object() + catalog = self._root_object.get_object() pages = catalog["/Pages"].get_object() # type: ignore self.flattened_pages = [] @@ -2073,12 +2096,11 @@ def isEncrypted(self) -> bool: # deprecated def xfa(self) -> Optional[Dict[str, Any]]: tree: Optional[TreeObject] = None retval: Dict[str, Any] = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - if "/AcroForm" not in catalog or not catalog["/AcroForm"]: + if isinstance(self._root_object.get("/AcroForm", None), (NoneType, NullObject)): return None - tree = cast(TreeObject, catalog["/AcroForm"]) + tree = cast(TreeObject, self._root_object["/AcroForm"]) if "/XFA" in tree: fields = cast(ArrayObject, tree["/XFA"]) @@ -2103,13 +2125,10 @@ def add_form_topname(self, name: str) -> Optional[DictionaryObject]: Returns: The created object. ``None`` means no object was created. 
""" - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - if "/AcroForm" not in catalog or not isinstance( - catalog["/AcroForm"], DictionaryObject - ): + if isinstance(self._root_object.get("/AcroForm", None), (NoneType, NullObject)): return None - acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) + + acroform = cast(DictionaryObject, self._root_object["/AcroForm"]) if "/Fields" not in acroform: # TODO: :No error returns but may be extended for XFA Forms return None @@ -2145,13 +2164,10 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: Returns: The modified object. ``None`` means no object was modified. """ - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - if "/AcroForm" not in catalog or not isinstance( - catalog["/AcroForm"], DictionaryObject - ): + if isinstance(self._root_object.get("/AcroForm", None), (NoneType, NullObject)): return None - acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) + + acroform = cast(DictionaryObject, self._root_object["/AcroForm"]) if "/Fields" not in acroform: return None @@ -2178,14 +2194,14 @@ def _list_attachments(self) -> List[str]: Returns: list of filenames """ - catalog = cast(DictionaryObject, self.trailer["/Root"]) - # From the catalog get the embedded file names try: filenames = cast( ArrayObject, cast( DictionaryObject, - cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"], + cast(DictionaryObject, self._root_object["/Names"])[ + "/EmbeddedFiles" + ], )["/Names"], ) except KeyError: @@ -2220,14 +2236,15 @@ def _get_attachments( dictionary of filename -> Union[bytestring or List[ByteString]] if the filename exists multiple times a List of the different version will be provided """ - catalog = cast(DictionaryObject, self.trailer["/Root"]) # From the catalog get the embedded file names try: filenames = cast( ArrayObject, cast( DictionaryObject, - cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"], + cast(DictionaryObject, 
self._root_object["/Names"])[ + "/EmbeddedFiles" + ], )["/Names"], ) except KeyError: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index aa42c0f27..0b205c770 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -898,7 +898,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None: Args: reader: PdfReader from the document root should be copied. """ - self._root_object = cast(DictionaryObject, reader.trailer[TK.ROOT].clone(self)) + self._root_object = cast(DictionaryObject, reader._root_object.clone(self)) self._root = self._root_object.indirect_reference # type: ignore[assignment] self._pages = self._root_object.raw_get("/Pages") self._flatten() @@ -989,11 +989,10 @@ def clone_document_from_reader( document. """ self.clone_reader_document_root(reader) - self._info = reader.trailer[TK.INFO].clone(self).indirect_reference # type: ignore - try: - self._ID = cast(ArrayObject, reader.trailer[TK.ID].clone(self)) # type: ignore - except KeyError: - pass + self._info = reader._info.clone(self).indirect_reference # type: ignore + _i = reader._ID + if _i is not None: + self._ID = _i if callable(after_page_append): for page in cast( ArrayObject, cast(DictionaryObject, self._pages.get_object())["/Kids"] @@ -2781,10 +2780,9 @@ def merge( else: outline_item_typ = self.get_outline_root() - _ro = cast("DictionaryObject", reader.trailer[TK.ROOT]) - if import_outline and CO.OUTLINES in _ro: + if import_outline and CO.OUTLINES in reader._root_object: outline = self._get_filtered_outline( - _ro.get(CO.OUTLINES, None), srcpages, reader + reader._root_object.get(CO.OUTLINES, None), srcpages, reader ) self._insert_filtered_outline( outline, outline_item_typ, None @@ -2799,12 +2797,12 @@ def merge( pag[NameObject("/Annots")] = lst self.clean_page(pag) - if "/AcroForm" in cast(DictionaryObject, reader.trailer["/Root"]): + if "/AcroForm" in cast(DictionaryObject, reader._root_object): if "/AcroForm" not in self._root_object: self._root_object[NameObject("/AcroForm")] = 
self._add_object( cast( DictionaryObject, - cast(DictionaryObject, reader.trailer["/Root"])["/AcroForm"], + reader._root_object["/AcroForm"], ).clone(self, False, ("/Fields",)) ) arr = ArrayObject() @@ -2815,7 +2813,7 @@ def merge( ) trslat = self._id_translated[id(reader)] try: - for f in reader.trailer["/Root"]["/AcroForm"]["/Fields"]: # type: ignore + for f in reader._root_object["/AcroForm"]["/Fields"]: # type: ignore try: ind = IndirectObject(trslat[f.idnum], 0, self) if ind not in arr: From b282a8fc0c21221bf3d877bcf816b2cb51d6e7b4 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 4 Jun 2023 19:24:57 +0200 Subject: [PATCH 2/4] fix NoneType --- pypdf/_reader.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 102617903..e838b5e41 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -34,7 +34,6 @@ from datetime import datetime from io import BytesIO, UnsupportedOperation from pathlib import Path -from types import NoneType from typing import ( Any, Callable, @@ -2097,10 +2096,10 @@ def xfa(self) -> Optional[Dict[str, Any]]: tree: Optional[TreeObject] = None retval: Dict[str, Any] = {} - if isinstance(self._root_object.get("/AcroForm", None), (NoneType, NullObject)): + if isinstance(self.root_object.get("/AcroForm", None), (type(None), NullObject)): return None - tree = cast(TreeObject, self._root_object["/AcroForm"]) + tree = cast(TreeObject, self.root_object["/AcroForm"]) if "/XFA" in tree: fields = cast(ArrayObject, tree["/XFA"]) @@ -2125,10 +2124,10 @@ def add_form_topname(self, name: str) -> Optional[DictionaryObject]: Returns: The created object. ``None`` means no object was created. 
""" - if isinstance(self._root_object.get("/AcroForm", None), (NoneType, NullObject)): + if isinstance(self.root_object.get("/AcroForm", None), (type(None), NullObject)): return None - acroform = cast(DictionaryObject, self._root_object["/AcroForm"]) + acroform = cast(DictionaryObject, self.root_object["/AcroForm"]) if "/Fields" not in acroform: # TODO: :No error returns but may be extended for XFA Forms return None @@ -2164,10 +2163,10 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: Returns: The modified object. ``None`` means no object was modified. """ - if isinstance(self._root_object.get("/AcroForm", None), (NoneType, NullObject)): + if isinstance(self.root_object.get("/AcroForm", None), (type(None), NullObject)): return None - acroform = cast(DictionaryObject, self._root_object["/AcroForm"]) + acroform = cast(DictionaryObject, self.root_object["/AcroForm"]) if "/Fields" not in acroform: return None From d98bccf9c7e0cfb96484856cda47fe4cccafe308 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 4 Jun 2023 19:27:14 +0200 Subject: [PATCH 3/4] STY : replace _root_object with root_object the interface should be public --- pypdf/_page_labels.py | 8 +-- pypdf/_protocols.py | 4 +- pypdf/_reader.py | 50 +++++++++-------- pypdf/_writer.py | 113 ++++++++++++++++++++------------------- tests/test_javascript.py | 12 ++--- tests/test_writer.py | 14 ++--- 6 files changed, 106 insertions(+), 95 deletions(-) diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py index a7383ac37..c3a9294ab 100644 --- a/pypdf/_page_labels.py +++ b/pypdf/_page_labels.py @@ -11,12 +11,12 @@ Example 1 --------- ->>> reader._root_object["/PageLabels"]["/Nums"] +>>> reader.root_object["/PageLabels"]["/Nums"] [0, IndirectObject(18, 0, 139929798197504), 8, IndirectObject(19, 0, 139929798197504)] ->>> reader.get_object(reader._root_object["/PageLabels"]["/Nums"][1]) +>>> 
reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1]) {'/S': '/r'} ->>> reader.get_object(reader._root_object["/PageLabels"]["/Nums"][3]) +>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3]) {'/S': '/D'} Example 2 @@ -127,7 +127,7 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str: Returns: The label of the page, e.g. "iv" or "4". """ - root = cast(DictionaryObject, reader._root_object) + root = cast(DictionaryObject, reader.root_object) if "/PageLabels" not in root: return str(index + 1) # Fallback number_tree = cast(DictionaryObject, root["/PageLabels"]) diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index bf1f8b17e..a906b54eb 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -60,7 +60,7 @@ def trailer(self) -> Dict[str, Any]: ... @property - def _root_object(self) -> PdfObjectProtocol: + def root_object(self) -> PdfObjectProtocol: ... def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]: @@ -89,5 +89,5 @@ def pdf_header(self) -> bytes: ... @property - def _root_object(self) -> PdfObjectProtocol: + def root_object(self) -> PdfObjectProtocol: ... diff --git a/pypdf/_reader.py b/pypdf/_reader.py index e838b5e41..a2d1d6d55 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -348,7 +348,7 @@ def __init__( raise PdfReadError("Not encrypted file") @property - def _root_object(self) -> DictionaryObject: + def root_object(self) -> DictionaryObject: """Provide access to "/Root". standardized with PdfWriter.""" return cast(DictionaryObject, self.trailer[TK.ROOT]) @@ -435,7 +435,7 @@ def xmp_metadata(self) -> Optional[XmpInformation]: """XMP (Extensible Metadata Platform) data.""" try: self._override_encryption = True - return self._root_object.xmp_metadata # type: ignore + return self.root_object.xmp_metadata # type: ignore finally: self._override_encryption = False @@ -473,7 +473,7 @@ def _get_num_pages(self) -> int: # the PDF file's page count is used in this case. 
Otherwise, # the original method (flattened page count) is used. if self.is_encrypted: - return self._root_object["/Pages"]["/Count"] # type: ignore + return self.root_object["/Pages"]["/Count"] # type: ignore else: if self.flattened_pages is None: self._flatten() @@ -574,8 +574,8 @@ def get_fields( if retval is None: retval = {} # get the AcroForm tree - if CD.ACRO_FORM in self._root_object: - tree = cast(Optional[TreeObject], self._root_object[CD.ACRO_FORM]) + if CD.ACRO_FORM in self.root_object: + tree = cast(Optional[TreeObject], self.root_object[CD.ACRO_FORM]) else: return None if tree is None: @@ -767,10 +767,10 @@ def _get_named_destinations( retval = {} # get the name tree - if CA.DESTS in self._root_object: - tree = cast(TreeObject, self._root_object[CA.DESTS]) - elif CA.NAMES in self._root_object: - names = cast(DictionaryObject, self._root_object[CA.NAMES]) + if CA.DESTS in self.root_object: + tree = cast(TreeObject, self.root_object[CA.DESTS]) + elif CA.NAMES in self.root_object: + names = cast(DictionaryObject, self.root_object[CA.NAMES]) if CA.DESTS in names: tree = cast(TreeObject, names[CA.DESTS]) @@ -852,8 +852,8 @@ def _get_outline( outline = [] # get the outline dictionary and named destinations - if CO.OUTLINES in self._root_object: - lines = cast(DictionaryObject, self._root_object[CO.OUTLINES]) + if CO.OUTLINES in self.root_object: + lines = cast(DictionaryObject, self.root_object[CO.OUTLINES]) if isinstance(lines, NullObject): return outline @@ -906,8 +906,8 @@ def threads(self) -> Optional[ArrayObject]: It's an array of dictionaries with "/F" and "/I" properties or None if there are no articles. 
""" - if CO.THREADS in self._root_object: - return cast("ArrayObject", self._root_object[CO.THREADS]) + if CO.THREADS in self.root_object: + return cast("ArrayObject", self.root_object[CO.THREADS]) else: return None @@ -1120,8 +1120,8 @@ def page_layout(self) -> Optional[str]: * - /TwoPageRight - Show two pages at a time, odd-numbered pages on the right """ - if CD.PAGE_LAYOUT in self._root_object: - return cast(NameObject, self._root_object[CD.PAGE_LAYOUT]) + if CD.PAGE_LAYOUT in self.root_object: + return cast(NameObject, self.root_object[CD.PAGE_LAYOUT]) return None def getPageLayout(self) -> Optional[str]: # deprecated @@ -1165,7 +1165,7 @@ def page_mode(self) -> Optional[PagemodeType]: - Show attachments panel """ try: - return self._root_object["/PageMode"] # type: ignore + return self.root_object["/PageMode"] # type: ignore except KeyError: return None @@ -1205,7 +1205,7 @@ def _flatten( if pages is None: # Fix issue 327: set flattened_pages attribute only for # decrypted file - catalog = self._root_object.get_object() + catalog = self.root_object.get_object() pages = catalog["/Pages"].get_object() # type: ignore self.flattened_pages = [] @@ -2096,7 +2096,9 @@ def xfa(self) -> Optional[Dict[str, Any]]: tree: Optional[TreeObject] = None retval: Dict[str, Any] = {} - if isinstance(self.root_object.get("/AcroForm", None), (type(None), NullObject)): + if isinstance( + self.root_object.get("/AcroForm", None), (type(None), NullObject) + ): return None tree = cast(TreeObject, self.root_object["/AcroForm"]) @@ -2124,7 +2126,9 @@ def add_form_topname(self, name: str) -> Optional[DictionaryObject]: Returns: The created object. ``None`` means no object was created. 
""" - if isinstance(self.root_object.get("/AcroForm", None), (type(None), NullObject)): + if isinstance( + self.root_object.get("/AcroForm", None), (type(None), NullObject) + ): return None acroform = cast(DictionaryObject, self.root_object["/AcroForm"]) @@ -2163,7 +2167,9 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: Returns: The modified object. ``None`` means no object was modified. """ - if isinstance(self.root_object.get("/AcroForm", None), (type(None), NullObject)): + if isinstance( + self.root_object.get("/AcroForm", None), (type(None), NullObject) + ): return None acroform = cast(DictionaryObject, self.root_object["/AcroForm"]) @@ -2198,7 +2204,7 @@ def _list_attachments(self) -> List[str]: ArrayObject, cast( DictionaryObject, - cast(DictionaryObject, self._root_object["/Names"])[ + cast(DictionaryObject, self.root_object["/Names"])[ "/EmbeddedFiles" ], )["/Names"], @@ -2241,7 +2247,7 @@ def _get_attachments( ArrayObject, cast( DictionaryObject, - cast(DictionaryObject, self._root_object["/Names"])[ + cast(DictionaryObject, self.root_object["/Names"])[ "/EmbeddedFiles" ], )["/Names"], diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 0b205c770..66d6536d3 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -195,14 +195,14 @@ def __init__( self._info = self._add_object(info) # root object - self._root_object = DictionaryObject() - self._root_object.update( + self.root_object = DictionaryObject() + self.root_object.update( { NameObject(PA.TYPE): NameObject(CO.CATALOG), NameObject(CO.PAGES): self._pages, } ) - self._root = self._add_object(self._root_object) + self._root = self._add_object(self.root_object) if clone_from is not None: if not isinstance(clone_from, PdfReader): @@ -226,6 +226,11 @@ def __exit__( if self.fileobj: self.write(self.fileobj) + @property + def _root_object(self) -> DictionaryObject: + deprecate_with_replacement("_root_object", "root_object") + return self.root_object + @property def pdf_header(self) 
-> bytes: """ @@ -337,13 +342,13 @@ def set_need_appearances_writer(self) -> None: # http://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf try: # get the AcroForm tree - if CatalogDictionary.ACRO_FORM not in self._root_object: - self._root_object[ + if CatalogDictionary.ACRO_FORM not in self.root_object: + self.root_object[ NameObject(CatalogDictionary.ACRO_FORM) ] = self._add_object(DictionaryObject()) need_appearances = NameObject(InteractiveFormDictEntries.NeedAppearances) - self._root_object[CatalogDictionary.ACRO_FORM][need_appearances] = BooleanObject(True) # type: ignore + self.root_object[CatalogDictionary.ACRO_FORM][need_appearances] = BooleanObject(True) # type: ignore except Exception as exc: logger.error("set_need_appearances_writer() catch : %s", repr(exc)) @@ -564,9 +569,9 @@ def open_destination( Raises: Exception: If a destination is invalid. """ - if "/OpenAction" not in self._root_object: + if "/OpenAction" not in self.root_object: return None - oa = self._root_object["/OpenAction"] + oa = self.root_object["/OpenAction"] if isinstance(oa, (str, bytes)): return create_string_object(str(oa)) elif isinstance(oa, ArrayObject): @@ -584,15 +589,15 @@ def open_destination( def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None: if dest is None: try: - del self._root_object["/OpenAction"] + del self.root_object["/OpenAction"] except KeyError: pass elif isinstance(dest, str): - self._root_object[NameObject("/OpenAction")] = TextStringObject(dest) + self.root_object[NameObject("/OpenAction")] = TextStringObject(dest) elif isinstance(dest, Destination): - self._root_object[NameObject("/OpenAction")] = dest.dest_array + self.root_object[NameObject("/OpenAction")] = dest.dest_array elif isinstance(dest, PageObject): - self._root_object[NameObject("/OpenAction")] = Destination( + self.root_object[NameObject("/OpenAction")] = Destination( "Opening", dest.indirect_reference if dest.indirect_reference is not 
None @@ -611,9 +616,9 @@ def add_js(self, javascript: str) -> None: # Example: This will launch the print window when the PDF is opened. """ # Names / JavaScript prefered to be able to add multiple scripts - if "/Names" not in self._root_object: - self._root_object[NameObject(CA.NAMES)] = DictionaryObject() - names = cast(DictionaryObject, self._root_object[CA.NAMES]) + if "/Names" not in self.root_object: + self.root_object[NameObject(CA.NAMES)] = DictionaryObject() + names = cast(DictionaryObject, self.root_object[CA.NAMES]) if "/JavaScript" not in names: names[NameObject("/JavaScript")] = DictionaryObject( {NameObject("/Names"): ArrayObject()} @@ -712,21 +717,21 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: # >> # endobj - if CA.NAMES not in self._root_object: - self._root_object[NameObject(CA.NAMES)] = self._add_object( + if CA.NAMES not in self.root_object: + self.root_object[NameObject(CA.NAMES)] = self._add_object( DictionaryObject() ) - if "/EmbeddedFiles" not in cast(DictionaryObject, self._root_object[CA.NAMES]): + if "/EmbeddedFiles" not in cast(DictionaryObject, self.root_object[CA.NAMES]): embedded_files_names_dictionary = DictionaryObject( {NameObject(CA.NAMES): ArrayObject()} ) - cast(DictionaryObject, self._root_object[CA.NAMES])[ + cast(DictionaryObject, self.root_object[CA.NAMES])[ NameObject("/EmbeddedFiles") ] = self._add_object(embedded_files_names_dictionary) else: embedded_files_names_dictionary = cast( DictionaryObject, - cast(DictionaryObject, self._root_object[CA.NAMES])["/EmbeddedFiles"], + cast(DictionaryObject, self.root_object[CA.NAMES])["/EmbeddedFiles"], ) cast(ArrayObject, embedded_files_names_dictionary[CA.NAMES]).extend( [create_string_object(filename), filespec] @@ -898,15 +903,15 @@ def clone_reader_document_root(self, reader: PdfReader) -> None: Args: reader: PdfReader from the document root should be copied. 
""" - self._root_object = cast(DictionaryObject, reader._root_object.clone(self)) - self._root = self._root_object.indirect_reference # type: ignore[assignment] - self._pages = self._root_object.raw_get("/Pages") + self.root_object = cast(DictionaryObject, reader.root_object.clone(self)) + self._root = self.root_object.indirect_reference # type: ignore[assignment] + self._pages = self.root_object.raw_get("/Pages") self._flatten() for p in self.flattened_pages: o = p.get_object() self._objects[p.idnum - 1] = PageObject(self, p) self._objects[p.idnum - 1].update(o.items()) - self._root_object[NameObject("/Pages")][ # type: ignore[index] + self.root_object[NameObject("/Pages")][ # type: ignore[index] NameObject("/Kids") ] = self.flattened_pages del self.flattened_pages @@ -937,7 +942,7 @@ def _flatten( if inherit is None: inherit = {} if pages is None: - pages = cast(DictionaryObject, self._root_object["/Pages"]) + pages = cast(DictionaryObject, self.root_object["/Pages"]) self.flattened_pages = ArrayObject() assert pages is not None # hint for mypy @@ -965,7 +970,7 @@ def _flatten( if attr_in not in pages: pages[attr_in] = value pages[NameObject("/Parent")] = cast( - IndirectObject, self._root_object.raw_get("/Pages") + IndirectObject, self.root_object.raw_get("/Pages") ) self.flattened_pages.append(indirect_reference) @@ -1143,7 +1148,7 @@ def write_stream(self, stream: StreamType) -> None: ) if not self._root: - self._root = self._add_object(self._root_object) + self._root = self._add_object(self.root_object) self._sweep_indirect_references(self._root) @@ -1432,9 +1437,9 @@ def getReference(self, obj: PdfObject) -> IndirectObject: # deprecated return self.get_reference(obj) def get_outline_root(self) -> TreeObject: - if CO.OUTLINES in self._root_object: + if CO.OUTLINES in self.root_object: # TABLE 3.25 Entries in the catalog dictionary - outline = cast(TreeObject, self._root_object[CO.OUTLINES]) + outline = cast(TreeObject, self.root_object[CO.OUTLINES]) idnum = 
self._objects.index(outline) + 1 outline_ref = IndirectObject(idnum, 0, self) assert outline_ref.get_object() == outline @@ -1442,7 +1447,7 @@ def get_outline_root(self) -> TreeObject: outline = TreeObject() outline.update({}) outline_ref = self._add_object(outline) - self._root_object[NameObject(CO.OUTLINES)] = outline_ref + self.root_object[NameObject(CO.OUTLINES)] = outline_ref return outline @@ -1456,12 +1461,12 @@ def get_threads_root(self) -> ArrayObject: An array (possibly empty) of Dictionaries with ``/F`` and ``/I`` properties. """ - if CO.THREADS in self._root_object: + if CO.THREADS in self.root_object: # TABLE 3.25 Entries in the catalog dictionary - threads = cast(ArrayObject, self._root_object[CO.THREADS]) + threads = cast(ArrayObject, self.root_object[CO.THREADS]) else: threads = ArrayObject() - self._root_object[NameObject(CO.THREADS)] = threads + self.root_object[NameObject(CO.THREADS)] = threads return threads @property @@ -1485,10 +1490,10 @@ def getOutlineRoot(self) -> TreeObject: # deprecated return self.get_outline_root() def get_named_dest_root(self) -> ArrayObject: - if CA.NAMES in self._root_object and isinstance( - self._root_object[CA.NAMES], DictionaryObject + if CA.NAMES in self.root_object and isinstance( + self.root_object[CA.NAMES], DictionaryObject ): - names = cast(DictionaryObject, self._root_object[CA.NAMES]) + names = cast(DictionaryObject, self.root_object[CA.NAMES]) names_ref = names.indirect_reference if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject): # 3.6.3 Name Dictionary (PDF spec 1.7) @@ -1510,7 +1515,7 @@ def get_named_dest_root(self) -> ArrayObject: else: names = DictionaryObject() names_ref = self._add_object(names) - self._root_object[NameObject(CA.NAMES)] = names_ref + self.root_object[NameObject(CA.NAMES)] = names_ref dests = DictionaryObject() dests_ref = self._add_object(dests) names[NameObject(CA.DESTS)] = dests_ref @@ -2260,7 +2265,7 @@ def addLink( def _get_page_layout(self) -> 
Optional[LayoutType]: try: - return cast(LayoutType, self._root_object["/PageLayout"]) + return cast(LayoutType, self.root_object["/PageLayout"]) except KeyError: return None @@ -2305,7 +2310,7 @@ def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None: __name__, ) layout = NameObject(layout) - self._root_object.update({NameObject("/PageLayout"): layout}) + self.root_object.update({NameObject("/PageLayout"): layout}) def set_page_layout(self, layout: LayoutType) -> None: """ @@ -2405,7 +2410,7 @@ def pageLayout(self, layout: LayoutType) -> None: # deprecated def _get_page_mode(self) -> Optional[PagemodeType]: try: - return cast(PagemodeType, self._root_object["/PageMode"]) + return cast(PagemodeType, self.root_object["/PageMode"]) except KeyError: return None @@ -2432,7 +2437,7 @@ def set_page_mode(self, mode: PagemodeType) -> None: f"Mode should be one of: {', '.join(self._valid_modes)}", __name__ ) mode_name = NameObject(mode) - self._root_object.update({NameObject("/PageMode"): mode_name}) + self.root_object.update({NameObject("/PageMode"): mode_name}) def setPageMode(self, mode: PagemodeType) -> None: # deprecated """ @@ -2780,9 +2785,9 @@ def merge( else: outline_item_typ = self.get_outline_root() - if import_outline and CO.OUTLINES in reader._root_object: + if import_outline and CO.OUTLINES in reader.root_object: outline = self._get_filtered_outline( - reader._root_object.get(CO.OUTLINES, None), srcpages, reader + reader.root_object.get(CO.OUTLINES, None), srcpages, reader ) self._insert_filtered_outline( outline, outline_item_typ, None @@ -2797,23 +2802,23 @@ def merge( pag[NameObject("/Annots")] = lst self.clean_page(pag) - if "/AcroForm" in cast(DictionaryObject, reader._root_object): - if "/AcroForm" not in self._root_object: - self._root_object[NameObject("/AcroForm")] = self._add_object( + if "/AcroForm" in cast(DictionaryObject, reader.root_object): + if "/AcroForm" not in self.root_object: + self.root_object[NameObject("/AcroForm")] 
= self._add_object( cast( DictionaryObject, - reader._root_object["/AcroForm"], + reader.root_object["/AcroForm"], ).clone(self, False, ("/Fields",)) ) arr = ArrayObject() else: arr = cast( ArrayObject, - cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"], + cast(DictionaryObject, self.root_object["/AcroForm"])["/Fields"], ) trslat = self._id_translated[id(reader)] try: - for f in reader._root_object["/AcroForm"]["/Fields"]: # type: ignore + for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore try: ind = IndirectObject(trslat[f.idnum], 0, self) if ind not in arr: @@ -2824,7 +2829,7 @@ def merge( pass except KeyError: # for /Acroform or /Fields are not existing arr = self._add_object(ArrayObject()) - cast(DictionaryObject, self._root_object["/AcroForm"])[ + cast(DictionaryObject, self.root_object["/AcroForm"])[ NameObject("/Fields") ] = arr @@ -3238,15 +3243,15 @@ def _set_page_label( if start != 0: new_page_label[NameObject("/St")] = NumberObject(start) - if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object: + if NameObject(CatalogDictionary.PAGE_LABELS) not in self.root_object: nums = ArrayObject() nums_insert(NumberObject(0), default_page_label, nums) page_labels = TreeObject() page_labels[NameObject("/Nums")] = nums - self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels + self.root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels page_labels = cast( - TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] + TreeObject, self.root_object[NameObject(CatalogDictionary.PAGE_LABELS)] ) nums = cast(ArrayObject, page_labels[NameObject("/Nums")]) @@ -3257,7 +3262,7 @@ def _set_page_label( nums_insert(NumberObject(page_index_to + 1), default_page_label, nums) page_labels[NameObject("/Nums")] = nums - self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels + self.root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels def _pdf_objectify(obj: 
Union[Dict[str, Any], str, int, List[Any]]) -> PdfObject: diff --git a/tests/test_javascript.py b/tests/test_javascript.py index 37edb218a..42f2d2f81 100644 --- a/tests/test_javascript.py +++ b/tests/test_javascript.py @@ -24,19 +24,19 @@ def test_add_js(pdf_file_writer): pdf_file_writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") assert ( - "/Names" in pdf_file_writer._root_object + "/Names" in pdf_file_writer.root_object ), "add_js should add a name catalog in the root object." assert ( - "/JavaScript" in pdf_file_writer._root_object["/Names"] + "/JavaScript" in pdf_file_writer.root_object["/Names"] ), "add_js should add a JavaScript name tree under the name catalog." def test_added_js(pdf_file_writer): def get_javascript_name() -> Any: - assert "/Names" in pdf_file_writer._root_object - assert "/JavaScript" in pdf_file_writer._root_object["/Names"] - assert "/Names" in pdf_file_writer._root_object["/Names"]["/JavaScript"] - return pdf_file_writer._root_object["/Names"]["/JavaScript"]["/Names"][ + assert "/Names" in pdf_file_writer.root_object + assert "/JavaScript" in pdf_file_writer.root_object["/Names"] + assert "/Names" in pdf_file_writer.root_object["/Names"]["/JavaScript"] + return pdf_file_writer.root_object["/Names"]["/JavaScript"]["/Names"][ -2 ] # return -2 in order to get the latest javascript diff --git a/tests/test_writer.py b/tests/test_writer.py index 80a3158aa..af2ca71d9 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -911,7 +911,7 @@ def test_startup_dest(): assert pdf_file_writer.open_destination is None pdf_file_writer.open_destination = pdf_file_writer.pages[9] # checked also using Acrobrat to verify the good page is opened - op = pdf_file_writer._root_object["/OpenAction"] + op = pdf_file_writer.root_object["/OpenAction"] assert op[0] == pdf_file_writer.pages[9].indirect_reference assert op[1] == "/Fit" op = pdf_file_writer.open_destination @@ -921,16 +921,16 @@ def test_startup_dest(): assert 
pdf_file_writer.open_destination == op # irrelevant, just for coverage - pdf_file_writer._root_object[NameObject("/OpenAction")][0] = NumberObject(0) + pdf_file_writer.root_object[NameObject("/OpenAction")][0] = NumberObject(0) pdf_file_writer.open_destination with pytest.raises(Exception) as exc: - del pdf_file_writer._root_object[NameObject("/OpenAction")][0] + del pdf_file_writer.root_object[NameObject("/OpenAction")][0] pdf_file_writer.open_destination assert "Invalid Destination" in str(exc.value) pdf_file_writer.open_destination = "Test" # checked also using Acrobrat to verify open_destination - op = pdf_file_writer._root_object["/OpenAction"] + op = pdf_file_writer.root_object["/OpenAction"] assert isinstance(op, TextStringObject) assert op == "Test" op = pdf_file_writer.open_destination @@ -938,10 +938,10 @@ def test_startup_dest(): assert op == "Test" # irrelevant, this is just for coverage - pdf_file_writer._root_object[NameObject("/OpenAction")] = NumberObject(0) + pdf_file_writer.root_object[NameObject("/OpenAction")] = NumberObject(0) assert pdf_file_writer.open_destination is None pdf_file_writer.open_destination = None - assert "/OpenAction" not in pdf_file_writer._root_object + assert "/OpenAction" not in pdf_file_writer.root_object pdf_file_writer.open_destination = None @@ -1025,7 +1025,7 @@ def test_append_multiple(): reader, [0, 0, 0] ) # to demonstre multiple insertion of same page at once writer.append(reader, [0, 0, 0]) # second pack - pages = writer._root_object["/Pages"]["/Kids"] + pages = writer.root_object["/Pages"]["/Kids"] assert pages[0] not in pages[1:] # page not repeated assert pages[-1] not in pages[0:-1] # page not repeated From 7b102b73d6c5b5c17733f7b700a4c14919a2dd11 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 4 Jun 2023 21:28:59 +0200 Subject: [PATCH 4/4] add test --- tests/test_writer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_writer.py 
b/tests/test_writer.py index af2ca71d9..46d3128cd 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1339,3 +1339,8 @@ def test_iss1767(): name = "iss1723.pdf" in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) PdfWriter(clone_from=in_pdf) + + +def test_deprecate_root_object(): + with pytest.warns(DeprecationWarning), PdfWriter() as writer: + writer._root_object