Skip to content

Commit

Permalink
ENH: compress PDF files by merging identical objects
Browse files Browse the repository at this point in the history
add compress_identical_objects()
discovered in py-pdf#2728
closes py-pdf#2794
closes py-pdf#2768
  • Loading branch information
pubpub-zz committed Aug 11, 2024
1 parent 5abd590 commit 8d30c88
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 114 deletions.
233 changes: 119 additions & 114 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import collections
import decimal
import enum
import hashlib
import re
import sys
import uuid
from io import BytesIO, FileIO, IOBase
from pathlib import Path
Expand All @@ -40,7 +40,6 @@
IO,
Any,
Callable,
Deque,
Dict,
Iterable,
List,
Expand Down Expand Up @@ -157,12 +156,17 @@ def __init__(
clone_from: Union[None, PdfReader, StrByteType, Path] = None,
) -> None:
self._header = b"%PDF-1.3"
self._objects: List[PdfObject] = []
self._objects: List[Optional[PdfObject]] = []
"""The indirect objects in the PDF."""

self._idnum_hash: Dict[bytes, IndirectObject] = {}
"""Maps hash values of indirect objects to their IndirectObject instances."""
"""Maps hash values of indirect objects to the list of IndirectObjects.
This is used for compression
"""
self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}

"""List of translations already done.
dict[id(pdf)][(idnum, generation)]
"""
self._id_translated: Dict[int, Dict[int, int]] = {}

# The root of our page tree node.
Expand Down Expand Up @@ -371,10 +375,13 @@ def get_object(
indirect_reference: Union[int, IndirectObject],
) -> PdfObject:
if isinstance(indirect_reference, int):
return self._objects[indirect_reference - 1]
if indirect_reference.pdf != self:
obj = self._objects[indirect_reference - 1]
elif indirect_reference.pdf != self:
raise ValueError("pdf must be self")
return self._objects[indirect_reference.idnum - 1]
else:
obj = self._objects[indirect_reference.idnum - 1]
assert obj is not None
return obj

def _replace_object(
self,
Expand All @@ -393,6 +400,7 @@ def _replace_object(
obj = obj.clone(self)
self._objects[indirect_reference - 1] = obj
obj.indirect_reference = IndirectObject(indirect_reference, gen, self)
assert obj is None
return self._objects[indirect_reference - 1]

def _add_page(
Expand Down Expand Up @@ -1246,10 +1254,10 @@ def write_stream(self, stream: StreamType) -> None:
if not self._root:
self._root = self._add_object(self._root_object)

self._sweep_indirect_references(self._root)
# no longer used: self._sweep_indirect_references(self._root)

object_positions = self._write_pdf_structure(stream)
xref_location = self._write_xref_table(stream, object_positions)
object_positions, free_objects = self._write_pdf_structure(stream)
xref_location = self._write_xref_table(stream, object_positions, free_objects)
self._write_trailer(stream, xref_location)

def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
Expand Down Expand Up @@ -1282,8 +1290,9 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:

return my_file, stream

def _write_pdf_structure(self, stream: StreamType) -> List[int]:
def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
object_positions = []
free_objects = [] # will contain list of all free entries
stream.write(self.pdf_header.encode() + b"\n")
stream.write(b"%\xE2\xE3\xCF\xD3\n")

Expand All @@ -1296,15 +1305,26 @@ def _write_pdf_structure(self, stream: StreamType) -> List[int]:
obj = self._encryption.encrypt_object(obj, idnum, 0)
obj.write_to_stream(stream)
stream.write(b"\nendobj\n")
return object_positions

def _write_xref_table(self, stream: StreamType, object_positions: List[int]) -> int:
else:
object_positions.append(-1)
free_objects.append(i + 1)
free_objects.append(0) # add 0 to loop iaw PDF spec
return object_positions, free_objects

def _write_xref_table(
self, stream: StreamType, object_positions: List[int], free_objects: List[int]
) -> int:
xref_location = stream.tell()
stream.write(b"xref\n")
stream.write(f"0 {len(self._objects) + 1}\n".encode())
stream.write(f"{0:0>10} {65535:0>5} f \n".encode())
stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())
free_idx = 1
for offset in object_positions:
stream.write(f"{offset:0>10} {0:0>5} n \n".encode())
if offset > 0:
stream.write(f"{offset:0>10} {0:0>5} n \n".encode())
else:
stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode())
free_idx += 1
return xref_location

def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
Expand Down Expand Up @@ -1349,6 +1369,73 @@ def add_metadata(self, infos: Dict[str, Any]) -> None:
assert isinstance(self._info, DictionaryObject)
self._info.update(args)

def compress_identical_objects(self, verbose: Union[int, bool] = -1) -> None:
    """
    Merge the indirect objects of this writer that have the same hash.

    Every entry of ``self._objects`` is hashed with ``hash_value()``. The
    first object seen for a given hash is kept; the slot of each later
    object with the same hash is set to ``None`` and the references to it
    found in the remaining dictionaries/arrays are redirected to the kept
    object. ``self._idnum_hash`` is rebuilt by this call.

    Recommended to be used just before writing the output.

    Args:
        verbose: provide some progress information on ``sys.stderr``.
            int : frequency of progress update; disabled if negative
            bool : True => 100 ; False => -1
    """

    def replace_in_obj(
        obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
    ) -> None:
        # Only dictionaries and arrays can contain references; any other
        # object is a leaf and is left untouched.
        if isinstance(obj, DictionaryObject):
            key_val = obj.items()
        elif isinstance(obj, ArrayObject):
            key_val = enumerate(obj)  # type: ignore
        else:
            return
        assert isinstance(obj, (DictionaryObject, ArrayObject))
        for k, v in key_val:
            if isinstance(v, IndirectObject) and v in crossref:
                # reference to a merged object: point to the kept one
                obj[k] = crossref[v]
            else:
                # direct child: look for references nested inside it
                replace_in_obj(v, crossref)

    # bool is a subclass of int, so it has to be tested first; otherwise
    # True/False would be used as the progress periods 1/0.
    if isinstance(verbose, bool):
        cpt_init = 100 if verbose else -1
    else:
        cpt_init = verbose

    # _idnum_hash: dict[hash] = (1st_ind_obj, [other_indir_objs, ...])
    self._idnum_hash = {}
    cpt = cpt_init
    # look for similar objects
    for idx, obj in enumerate(self._objects):
        if obj is None:
            continue
        assert isinstance(obj.indirect_reference, IndirectObject)
        h = obj.hash_value()
        if cpt == 0:
            print("+", end="", file=sys.stderr)  # noqa: T201
            cpt = cpt_init
        cpt -= 1
        if h in self._idnum_hash:
            # duplicate: remember its reference and free its slot
            self._idnum_hash[h][1].append(obj.indirect_reference)
            self._objects[idx] = None
        else:
            self._idnum_hash[h] = (obj.indirect_reference, [])

    # map each merged (removed) reference to the reference that was kept
    cnv_rev: Dict[IndirectObject, IndirectObject] = {
        dup: kept for kept, dups in self._idnum_hash.values() for dup in dups
    }
    cpt = cpt_init

    # replace reference to merged objects
    for obj in self._objects:
        if isinstance(obj, (DictionaryObject, ArrayObject)):
            if cpt == 0:
                print(".", end="", file=sys.stderr)  # noqa: T201
                cpt = cpt_init
            cpt -= 1
            replace_in_obj(obj, cnv_rev)

def _sweep_indirect_references(
self,
root: Union[
Expand All @@ -1363,7 +1450,7 @@ def _sweep_indirect_references(
TextStringObject,
NullObject,
],
) -> None:
) -> None: # deprecated
"""
Resolving any circular references to Page objects.
Expand All @@ -1379,73 +1466,15 @@ def _sweep_indirect_references(
Args:
root: The root of the PDF object tree to sweep.
"""
stack: Deque[
Tuple[
Any,
Optional[Any],
Any,
List[PdfObject],
]
] = collections.deque()
discovered = []
parent = None
grant_parents: List[PdfObject] = []
key_or_id = None

# Start from root
stack.append((root, parent, key_or_id, grant_parents))

while len(stack):
data, parent, key_or_id, grant_parents = stack.pop()

# Build stack for a processing depth-first
if isinstance(data, (ArrayObject, DictionaryObject)):
for key, value in data.items():
stack.append(
(
value,
data,
key,
grant_parents + [parent] if parent is not None else [],
)
)
elif isinstance(data, IndirectObject) and data.pdf != self:
data = self._resolve_indirect_object(data)

if str(data) not in discovered:
discovered.append(str(data))
stack.append((data.get_object(), None, None, []))

# Check if data has a parent and if it is a dict or
# an array update the value
if isinstance(parent, (DictionaryObject, ArrayObject)):
if isinstance(data, StreamObject):
# a dictionary value is a stream; streams must be indirect
# objects, so we need to change this value.
data = self._resolve_indirect_object(self._add_object(data))

update_hashes = []

# Data changed and thus the hash value changed
if parent[key_or_id] != data:
update_hashes = [parent.hash_value()] + [
grant_parent.hash_value() for grant_parent in grant_parents
]
parent[key_or_id] = data

# Update old hash value to new hash value
for old_hash in update_hashes:
indirect_reference = self._idnum_hash.pop(old_hash, None)

if indirect_reference is not None:
indirect_reference_obj = indirect_reference.get_object()

if indirect_reference_obj is not None:
self._idnum_hash[
indirect_reference_obj.hash_value()
] = indirect_reference
deprecate_with_replacement(
"_sweep_indirect_references",
"no replacement, please report to dev team if this warning is observed",
"5.0.0",
)

def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject:
def _resolve_indirect_object(
self, data: IndirectObject
) -> IndirectObject: # deprecated
"""
Resolves an indirect object to an indirect object in this PDF file.
Expand All @@ -1470,36 +1499,12 @@ def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject:
Raises:
ValueError: If the input stream is closed.
"""
if hasattr(data.pdf, "stream") and data.pdf.stream.closed:
raise ValueError(f"I/O operation on closed file: {data.pdf.stream.name}")

if data.pdf == self:
return data

# Get real object indirect object
real_obj = data.pdf.get_object(data)

if real_obj is None:
logger_warning(
f"Unable to resolve [{data.__class__.__name__}: {data}], "
"returning NullObject instead",
__name__,
)
real_obj = NullObject()

hash_value = real_obj.hash_value()

# Check if object is handled
if hash_value in self._idnum_hash:
return self._idnum_hash[hash_value]

if data.pdf == self:
self._idnum_hash[hash_value] = IndirectObject(data.idnum, 0, self)
# This is new object in this pdf
else:
self._idnum_hash[hash_value] = self._add_object(real_obj)

return self._idnum_hash[hash_value]
deprecate_with_replacement(
"_resolve_indirect_object",
"no replacement, please report to dev team if this warning is observed",
"5.0.0",
)
return IndirectObject(0, 0, self)

def get_reference(self, obj: PdfObject) -> IndirectObject:
idnum = self._objects.index(obj) + 1
Expand Down
3 changes: 3 additions & 0 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,9 @@ def __init__(self, idnum: int, generation: int, pdf: Any) -> None: # PdfReader
self.generation = generation
self.pdf = pdf

def __hash__(self) -> int:
    """Hash on the object number, the generation and the identity of the owning pdf."""
    identity = (self.idnum, self.generation, id(self.pdf))
    return hash(identity)

def clone(
self,
pdf_dest: PdfWriterProtocol,
Expand Down

0 comments on commit 8d30c88

Please sign in to comment.