Skip to content

Commit

Permalink
ENH: Add capability to remove /Info from PDF (#2820)
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz authored Sep 14, 2024
1 parent 78baa8f commit a790532
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 24 deletions.
24 changes: 24 additions & 0 deletions docs/user/metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,30 @@ writer.add_metadata(
}
)

# Clear all entries but keep the (now empty) /Info entry in the PDF
writer.metadata = {}

# Replace all entries with a new set of entries
writer.metadata = {
"/Author": "Martin",
"/Producer": "Libre Writer",
}

# Save the new PDF to a file
with open("meta-pdf.pdf", "wb") as f:
writer.write(f)
```

## Removing the metadata entry

```python
from pypdf import PdfWriter

writer = PdfWriter("example.pdf")

# Remove Metadata (/Info entry)
writer.metadata = None

# Save the new PDF to a file
with open("meta-pdf.pdf", "wb") as f:
writer.write(f)
Expand Down
85 changes: 63 additions & 22 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
)

from ._cmap import _default_fonts_space_width, build_char_map_from_dict
from ._doc_common import PdfDocCommon
from ._doc_common import DocumentInformation, PdfDocCommon
from ._encryption import EncryptAlgorithm, Encryption
from ._page import PageObject
from ._page_labels import nums_clear_range, nums_insert, nums_next
Expand Down Expand Up @@ -194,7 +194,7 @@ def __init__(
"""

self._ID: Union[ArrayObject, None] = None
self._info_obj: PdfObject
self._info_obj: Optional[PdfObject]

if self.incremental:
if isinstance(fileobj, (str, Path)):
Expand Down Expand Up @@ -309,13 +309,26 @@ def _info(self) -> Optional[DictionaryObject]:
Returns:
/Info Dictionary; None if the entry does not exist
"""
return cast(DictionaryObject, self._info_obj.get_object())
return (
None
if self._info_obj is None
else cast(DictionaryObject, self._info_obj.get_object())
)

@_info.setter
def _info(self, value: Union[IndirectObject, DictionaryObject]) -> None:
obj = cast(DictionaryObject, self._info_obj.get_object())
obj.clear()
obj.update(cast(DictionaryObject, value.get_object()))
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
if value is None:
try:
self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore
except (KeyError, AttributeError):
pass
self._info_obj = None
else:
if self._info_obj is None:
self._info_obj = self._add_object(DictionaryObject())
obj = cast(DictionaryObject, self._info_obj.get_object())
obj.clear()
obj.update(cast(DictionaryObject, value.get_object()))

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
Expand Down Expand Up @@ -1186,6 +1199,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
self._objects = [None] * cast(int, reader.trailer["/Size"])
else:
self._objects.clear()
self._info_obj = None
self._root_object = reader.root_object.clone(self)
self._pages = self._root_object.raw_get("/Pages")

Expand Down Expand Up @@ -1226,22 +1240,21 @@ def clone_document_from_reader(
document.
"""
self.clone_reader_document_root(reader)
if TK.INFO in reader.trailer:
inf = reader._info
if self.incremental:
if inf is not None:
self._info_obj = cast(
IndirectObject, inf.clone(self).indirect_reference
)
self._original_hash[
cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1
] = cast(DictionaryObject, self._info_obj.get_object()).hash_bin()
elif inf is not None:
self._info_obj = self._add_object(
DictionaryObject(cast(DictionaryObject, inf.get_object()))
inf = reader._info
if self.incremental:
if inf is not None:
self._info_obj = cast(
IndirectObject, inf.clone(self).indirect_reference
)
else:
self._info_obj = self._add_object(DictionaryObject())
assert isinstance(self._info, DictionaryObject), "for mypy"
self._original_hash[
self._info_obj.indirect_reference.idnum - 1
] = self._info.hash_bin()
elif inf is not None:
self._info_obj = self._add_object(
DictionaryObject(cast(DictionaryObject, inf.get_object()))
)
# else: _info_obj = None done in clone_reader_document_root()

try:
self._ID = cast(ArrayObject, reader._ID).clone(self)
Expand Down Expand Up @@ -1547,6 +1560,34 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
trailer.write_to_stream(stream)
stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Document information dictionary (/Info) of the PDF file, if it exists.

    Reading delegates to the parent class implementation.

    Assigning a dictionary discards the current entries and stores the
    given ones instead; assigning ``None`` removes the /Info entry from
    the PDF.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this property.
    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    # ``None`` means: remove the /Info entry altogether.
    if value is None:
        self._info = None
        return

    # Begin with an empty dictionary: create one when there is none yet,
    # otherwise wipe the entries currently stored.
    if self._info is None:
        self._info = DictionaryObject()
    else:
        # NOTE(review): if ``value`` is the very dictionary returned by
        # ``self._info``, this clear() empties it before it is read below
        # -- confirm callers never pass that object.
        self._info.clear()

    # Populate the (now empty) dictionary with the new entries.
    self.add_metadata(value)

def add_metadata(self, infos: Dict[str, Any]) -> None:
"""
Add custom metadata to the output.
Expand Down
36 changes: 34 additions & 2 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1795,9 +1795,33 @@ def test_missing_info():

writer = PdfWriter(clone_from=reader)
assert len(writer.pages) == len(reader.pages)
assert writer.metadata is None
b = BytesIO()
writer.write(b)
assert b"/Info" not in b.getvalue()

reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
writer._info = reader._info
writer.metadata = reader.metadata
assert dict(writer._info) == dict(reader._info)
assert writer.metadata == reader.metadata
b = BytesIO()
writer.write(b)
assert b"/Info" in b.getvalue()

writer.metadata = {}
writer._info = DictionaryObject() # for code coverage
b = BytesIO()
writer.write(b)
assert b"/Info" in b.getvalue()
assert writer.metadata == {}

writer.metadata = None
writer.metadata = None # for code coverage
assert writer.metadata is None
assert PdfWriter().metadata == {"/Producer": "pypdf"}
b = BytesIO()
writer.write(b)
assert b"/Info" not in b.getvalue()


@pytest.mark.enable_socket()
Expand Down Expand Up @@ -2417,6 +2441,8 @@ def test_increment_writer(caplog):
writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True)
# 1 object is modified: page 0 inherits MediaBox so is changed
assert len(writer.list_objects_in_increment()) == 1
b = BytesIO()
writer.write(b)

writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False)
# 1 object is modified: page 0 inherits MediaBox so is changed
Expand All @@ -2438,7 +2464,13 @@ def test_increment_writer(caplog):

# clone without info
writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True)
assert len(writer.list_objects_in_increment()) == 0
assert writer.metadata is None
writer.metadata = {}
assert writer.metadata == {}
assert len(writer.list_objects_in_increment()) == 1
assert writer._info == {}
writer.metadata = None
assert len(writer.list_objects_in_increment()) == 0
assert writer.metadata is None
b = BytesIO()
writer.write(b)

0 comments on commit a790532

Please sign in to comment.