Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,15 +1347,19 @@ def read(self, stream: StreamType) -> None:
for gen, xref_entry in self.xref.items():
if gen == 65535:
continue
for id in xref_entry:
xref_k = sorted(
xref_entry.keys()
) # keys must be processed in ascending order to prevent damage
for id in xref_k:
stream.seek(xref_entry[id], 0)
try:
pid, _pgen = self.read_object_header(stream)
except ValueError:
break
if pid == id - self.xref_index:
self._zero_xref(gen)
break
# fixing index item per item is required for revised PDF.
self.xref[gen][pid] = self.xref[gen][id]
del self.xref[gen][id]
# if not, then either it's just plain wrong, or the
# non-zero-index is actually correct
stream.seek(loc, 0) # return to where it was
Expand Down Expand Up @@ -1750,11 +1754,6 @@ def _read_xref_subsections(
elif self.strict:
raise PdfReadError(f"Unknown xref type: {xref_type}")

def _zero_xref(self, generation: int) -> None:
    """Re-key one generation of the xref table, shifting ids down.

    Replaces ``self.xref[generation]`` with a new dict in which every key
    has ``self.xref_index`` subtracted from it; the stored values are
    carried over unchanged.

    Args:
        generation: key into ``self.xref`` selecting the mapping to rebuild.
    """
    shifted = {}
    # Work from a snapshot of the current entries, then swap in the new dict.
    for obj_id, entry in list(self.xref[generation].items()):
        shifted[obj_id - self.xref_index] = entry
    self.xref[generation] = shifted

def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]:
i = 0
while True:
Expand Down
8 changes: 8 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1136,3 +1136,11 @@ def test_reader(caplog):
# ...and now no more required
reader.pages[0].extract_text()
assert caplog.text == ""


def test_zeroing_xref():
    """Smoke test for issue #328: reading the sample PDF must not raise."""
    # iss #328
    pdf_bytes = get_pdf_from_url(
        "https://github.com/py-pdf/PyPDF2/files/9066120/UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf",
        name="UTA_OSHA.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    # No assertion on the count itself: the test only requires that
    # accessing the page list and taking its length completes.
    len(reader.pages)