From 2dc76c0812bded2ceb2d219d4922092e0dbe8f05 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 28 Aug 2022 23:59:30 +0200 Subject: [PATCH 01/24] ENH : Process XRefStm fixes #1295 includes test file adjustment --- PyPDF2/_reader.py | 19 ++++++++++++++++++- tests/test_merger.py | 2 +- tests/test_xmp.py | 4 ++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index f9d201b12..0cd4a7790 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1400,7 +1400,14 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: pass else: self.xref[generation][num] = offset - self.xref_free_entry[generation][num] = entry_type_b == b"f" + try: + self.xref_free_entry[generation][num] = entry_type_b == b"f" + except Exception: + pass + try: + self.xref_free_entry[65535][num] = entry_type_b == b"f" + except Exception: + pass cnt += 1 num += 1 read_non_whitespace(stream) @@ -1438,6 +1445,11 @@ def _read_xref_tables_and_trailers( for key in trailer_keys: if key in xrefstream and key not in self.trailer: self.trailer[NameObject(key)] = xrefstream.raw_get(key) + if "/XRefStm" in xrefstream: + p = stream.tell() + stream.seek(int(xrefstream["/XRefStm"]) + 1, 0) + self._read_pdf15_xref_stream(stream) + stream.seek(p, 0) if "/Prev" in xrefstream: startxref = cast(int, xrefstream["/Prev"]) else: @@ -1453,6 +1465,11 @@ def _read_xref(self, stream: StreamType) -> Optional[int]: for key, value in new_trailer.items(): if key not in self.trailer: self.trailer[key] = value + if "/XRefStm" in new_trailer: + p = stream.tell() + stream.seek(int(new_trailer["/XRefStm"]) + 1, 0) + self._read_pdf15_xref_stream(stream) + stream.seek(p, 0) if "/Prev" in new_trailer: startxref = new_trailer["/Prev"] return startxref diff --git a/tests/test_merger.py b/tests/test_merger.py index 2cce04122..a550c56d4 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -345,7 +345,7 @@ def test_sweep_indirect_list_newobj_is_None(caplog): merger.append(reader) merger.write("tmp-merger-do-not-commit.pdf") merger.close() - assert "Object 21 0 not defined." in caplog.text + # used to be: assert "Object 21 0 not defined." in caplog.text reader2 = PdfReader("tmp-merger-do-not-commit.pdf") reader2.pages diff --git a/tests/test_xmp.py b/tests/test_xmp.py index a53b27b0e..35a797078 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -172,10 +172,10 @@ def test_dc_subject(): def test_issue585(): url = "https://github.com/mstamy2/PyPDF2/files/5536984/test.pdf" name = "mstamy2-5536984.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) with pytest.raises(PdfReadError) as exc: + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.xmp_metadata - assert exc.value.args[0].startswith("XML in XmpInformation was invalid") + assert exc.value.args[0].startswith("Stream length not defined") # def test_getter_bag(): From 1c767d27ffb7c84e89088d78d75177c0c6e0592d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 09:33:55 +0200 Subject: [PATCH 02/24] mypy --- PyPDF2/_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 0cd4a7790..6ad0bc3c4 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1447,7 +1447,7 @@ def _read_xref_tables_and_trailers( self.trailer[NameObject(key)] = xrefstream.raw_get(key) if "/XRefStm" in xrefstream: p = stream.tell() - stream.seek(int(xrefstream["/XRefStm"]) + 1, 0) + stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0) self._read_pdf15_xref_stream(stream) stream.seek(p, 0) if "/Prev" in xrefstream: @@ -1467,7 +1467,7 @@ def _read_xref(self, stream: StreamType) -> Optional[int]: self.trailer[key] = value if "/XRefStm" in new_trailer: p = stream.tell() - stream.seek(int(new_trailer["/XRefStm"]) + 1, 0) + stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) self._read_pdf15_xref_stream(stream) stream.seek(p, 0) if "/Prev" in new_trailer: From 147b69e4c9c21581485ce974d195d0918979df51 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 13:32:14 +0200 Subject: [PATCH 03/24] ROB : cope with xref starting on \r\n fixes #1279 / Status_v1_Reviewers-Guide.pdf --- PyPDF2/_reader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 6ad0bc3c4..f1de4f6b1 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1430,6 +1430,8 @@ def _read_xref_tables_and_trailers( # load the xref table stream.seek(startxref, 0) x = stream.read(1) + if x in b"\r\n": + x = stream.read(1) if x == b"x": startxref = self._read_xref(stream) elif xref_issue_nr: From bc4cbc8e321a24fd554c099cd6651d6d92b77027 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 13:47:44 +0200 Subject: [PATCH 04/24] ROB : escaped octal code followed by decimal int fixes #1294 and may be others --- PyPDF2/generic/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyPDF2/generic/_utils.py b/PyPDF2/generic/_utils.py index 1d4b492ec..bb1aa8a22 100644 --- a/PyPDF2/generic/_utils.py +++ b/PyPDF2/generic/_utils.py @@ -79,7 +79,7 @@ def read_string_from_stream( try: tok = escape_dict[tok] except KeyError: - if tok.isdigit(): + if tok in b"01234567": # "The number ddd may consist of one, two, or three # octal digits; high-order overflow shall be ignored. # Three octal digits shall be used, with leading zeros @@ -87,7 +87,7 @@ def read_string_from_stream( # a digit." (PDF reference 7.3.4.2, p 16) for _ in range(2): ntok = stream.read(1) - if ntok.isdigit(): + if ntok in b"01234567": tok += ntok else: stream.seek(-1, 1) # ntok has to be analysed From 7bc1691f56fcb6e06984797016ab3ae6f6563b17 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 14:01:05 +0200 Subject: [PATCH 05/24] PERF: simplify criteria --- PyPDF2/generic/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyPDF2/generic/_utils.py b/PyPDF2/generic/_utils.py index bb1aa8a22..ec2182f0c 100644 --- a/PyPDF2/generic/_utils.py +++ b/PyPDF2/generic/_utils.py @@ -79,7 +79,7 @@ def read_string_from_stream( try: tok = escape_dict[tok] except KeyError: - if tok in b"01234567": + if tok >= b"0" and tok <= b"7": # "The number ddd may consist of one, two, or three # octal digits; high-order overflow shall be ignored. # Three octal digits shall be used, with leading zeros @@ -87,7 +87,7 @@ def read_string_from_stream( # a digit." (PDF reference 7.3.4.2, p 16) for _ in range(2): ntok = stream.read(1) - if ntok in b"01234567": + if ntok >= b"0" and ntok <= b"7": tok += ntok else: stream.seek(-1, 1) # ntok has to be analysed From bb8e317289d4fa596dd65b539c3189ea1acf9fab Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 14:01:51 +0200 Subject: [PATCH 06/24] add test --- tests/test_utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 10a6a19fc..040b5df64 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,6 +5,7 @@ import pytest import PyPDF2._utils +from PyPDF2 import PdfReader from PyPDF2._utils import ( _get_max_pdf_version_header, deprecate_bookmark, @@ -19,6 +20,8 @@ ) from PyPDF2.errors import PdfReadError, PdfStreamError +from . import get_pdf_from_url + TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent RESOURCE_ROOT = PROJECT_ROOT / "resources" @@ -243,3 +246,13 @@ def foo(old_param=1, baz=2): "old_param is deprecated. Use new_param instead." ) assert exc.value.args[0] == expected_msg + + +def test_escapedcode_followed_by_int(): + # iss #1294 + url = "https://github.com/timedegree/playground_files/raw/main/%E8%AE%BA%E6%96%87/AN%20EXACT%20ANALYTICAL%20SOLUTION%20OF%20KEPLER'S%20EQUATION.pdf" + name = "keppler.pdf" + + reader = PdfReader(io.BytesIO(get_pdf_from_url(url, name=name))) + for page in reader.pages: + page.extract_text() From e6c1d7aafcace341a62ba13533f6a4c8424ea9db Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:21:22 +0200 Subject: [PATCH 07/24] ROB : cope with some corrupted entries in xref table fix #1292 --- PyPDF2/_reader.py | 27 ++++++++++++++++++++++++--- tests/test_reader.py | 19 +++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index f9d201b12..8d3d36ddf 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1385,10 +1385,31 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: if line[-1] in b"0123456789t": stream.seek(-1, 1) - offset_b, generation_b = line[:16].split(b" ") - entry_type_b = line[17:18] + try: + offset_b, generation_b = line[:16].split(b" ") + entry_type_b = line[17:18] + + offset, generation = int(offset_b), int(generation_b) + except Exception: + # if something wrong occured + f = re.search( + f"({num}) (\\d+) obj".encode(), bytes(stream.getbuffer()) + ) + if f is None: + logger_warning( + f"entry {num} in Xref table invalid; object not found", + __name__, + ) + generation = 65535 + offset = -1 + else: + logger_warning( + f"entry {num} in Xref table invalid but object found", + __name__, + ) + generation = int(f.group(2)) + offset = f.start() - offset, generation = int(offset_b), int(generation_b) if generation not in self.xref: self.xref[generation] = {} self.xref_free_entry[generation] = {} diff --git a/tests/test_reader.py b/tests/test_reader.py index 12f956d0a..723a4aef5 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1094,3 +1094,22 @@ def test_wrong_password_error(): def test_get_page_number_by_indirect(): reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") reader._get_page_number_by_indirect(1) + + +def test_corrupted_xref_table(): + # issue #1292 + url = "https://github.com/py-pdf/PyPDF2/files/9444747/BreezeManual.orig.pdf" + name = "BreezeMan1.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader.pages[0].extract_text() + + url = "https://github.com/py-pdf/PyPDF2/files/9444748/BreezeManual.failed.pdf" + name = "BreezeMan2.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + try: + reader.pages[0].extract_text() + except Exception as e: + pass + # Exception normal + else: + raise Exception("page 0 should not be corrupted") From 55d58ae512be8c587e52f96f4e841878fe042fb4 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:24:20 +0200 Subject: [PATCH 08/24] flake8 --- tests/test_reader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 723a4aef5..314de4c8c 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1108,8 +1108,7 @@ def test_corrupted_xref_table(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) try: reader.pages[0].extract_text() - except Exception as e: - pass - # Exception normal + except Exception: + pass # Exception normal else: raise Exception("page 0 should not be corrupted") From 0a342b122578c2f1b56c8da4b2a4c8072891d9f1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:44:10 +0200 Subject: [PATCH 09/24] mypy + improv --- PyPDF2/_reader.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 8d3d36ddf..dbb0e0c0b 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1392,9 +1392,15 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: offset, generation = int(offset_b), int(generation_b) except Exception: # if something wrong occured - f = re.search( - f"({num}) (\\d+) obj".encode(), bytes(stream.getbuffer()) - ) + if hasattr(stream, "getbuffer"): + buf = bytes(stream.getbuffer()) + else: + p = stream.tell() + stream.seek(0, 0) + buf = stream.read(-1) + streal.seek(p) + + f = re.search(f"{num}\\s+(\\d+)\\s+obj".encode(), buf) if f is None: logger_warning( f"entry {num} in Xref table invalid; object not found", @@ -1407,7 +1413,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: f"entry {num} in Xref table invalid but object found", __name__, ) - generation = int(f.group(2)) + generation = int(f.group(1)) offset = f.start() if generation not in self.xref: From 0bb8079984a523bcd64a18c3c3b25d97ae86b754 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:49:27 +0200 Subject: [PATCH 10/24] typo --- PyPDF2/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index dbb0e0c0b..03158ded6 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1398,7 +1398,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: p = stream.tell() stream.seek(0, 0) buf = stream.read(-1) - streal.seek(p) + stream.seek(p) f = re.search(f"{num}\\s+(\\d+)\\s+obj".encode(), buf) if f is None: From 1bf4b6104116f48001bc442559db802af9a01496 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:58:15 +0200 Subject: [PATCH 11/24] mypy2 --- PyPDF2/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 03158ded6..19baf7477 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1392,7 +1392,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: offset, generation = int(offset_b), int(generation_b) except Exception: # if something wrong occured - if hasattr(stream, "getbuffer"): + if isistance(stream, BytesIO) or hasattr(stream, "getbuffer"): buf = bytes(stream.getbuffer()) else: p = stream.tell() From 964079e05b36a5e40f29cae9db1d5e0ce35c029b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:59:57 +0200 Subject: [PATCH 12/24] typo --- PyPDF2/_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 19baf7477..e61482fd2 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1392,7 +1392,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: offset, generation = int(offset_b), int(generation_b) except Exception: # if something wrong occured - if isistance(stream, BytesIO) or hasattr(stream, "getbuffer"): + if isinstance(stream, BytesIO) or hasattr(stream, "getbuffer"): buf = bytes(stream.getbuffer()) else: p = stream.tell() From 041ea874a83ea876f898bc79b98d70394175af54 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 16:07:48 +0200 Subject: [PATCH 13/24] mypy 3 --- PyPDF2/_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index e61482fd2..47dbae1c8 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1392,8 +1392,8 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: offset, generation = int(offset_b), int(generation_b) except Exception: # if something wrong occured - if isinstance(stream, BytesIO) or hasattr(stream, "getbuffer"): - buf = bytes(stream.getbuffer()) + if hasattr(stream, "getbuffer"): + buf = bytes(stream.getbuffer()) # type: ignore else: p = stream.tell() stream.seek(0, 0) From 9e97efc0b17d997c53450bea03f790e2d25998ed Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 31 Aug 2022 21:52:27 +0200 Subject: [PATCH 14/24] ROB : extend xref autorepair cases * if chained xref/trailer are not good * if the object header ('id' 'gen' obj) or if the object is not present in the xref table, will search the file for the object. fixes #1273 --- PyPDF2/_reader.py | 74 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index f1de4f6b1..e7269f89a 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1129,7 +1129,25 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: return NullObject() start = self.xref[indirect_reference.generation][indirect_reference.idnum] self.stream.seek(start, 0) - idnum, generation = self.read_object_header(self.stream) + try: + idnum, generation = self.read_object_header(self.stream) + except Exception as e: + m = re.search( + rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), + bytes(self.stream.getbuffer()), + ) + if m is not None: + logger_warning( + f"Object ID {indirect_reference.idnum},{indirect_reference.generation} ref repaired", + __name__, + ) + self.xref[indirect_reference.generation][ + indirect_reference.idnum + ] = (m.start(0) + 1) + self.stream.seek(m.start(0) + 1) + idnum, generation = self.read_object_header(self.stream) + else: + idnum = -1 # exception will be raised below if idnum != indirect_reference.idnum and self.xref_index: # Xref table probably had bad indexes due to not being zero-indexed if self.strict: @@ -1161,13 +1179,42 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: retval, indirect_reference.idnum, indirect_reference.generation ) else: - logger_warning( - f"Object {indirect_reference.idnum} {indirect_reference.generation} " - "not defined.", - __name__, + m = re.search( + rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), + bytes(self.stream.getbuffer()), ) - if self.strict: - raise PdfReadError("Could not find object.") + if m is not None: + logger_warning( + f"Object {indirect_reference.idnum} {indirect_reference.generation} found", + __name__, + ) + if indirect_reference.generation not in self.xref: + self.xref[indirect_reference.generation] = {} + self.xref[indirect_reference.generation][indirect_reference.idnum] = ( + m.start(0) + 1 + ) + self.stream.seek(m.end(0) + 1) + skip_over_whitespace(self.stream) + self.stream.seek(-1, 1) + retval = read_object(self.stream, self) # type: ignore + + # override encryption is used for the /Encrypt dictionary + if not self._override_encryption and self._encryption is not None: + # if we don't have the encryption key: + if not self._encryption.is_decrypted(): + raise FileNotDecryptedError("File has not been decrypted") + # otherwise, decrypt here... + retval = cast(PdfObject, retval) + retval = self._encryption.decrypt_object( + retval, indirect_reference.idnum, indirect_reference.generation + ) + else: + logger_warning( + f"Object {indirect_reference.idnum} {indirect_reference.generation} not defined.", + __name__, + ) + if self.strict: + raise PdfReadError("Could not find object.") self.cache_indirect_object( indirect_reference.generation, indirect_reference.idnum, retval ) @@ -1441,8 +1488,17 @@ def _read_xref_tables_and_trailers( except Exception: xref_issue_nr = 0 elif x.isdigit(): - xrefstream = self._read_pdf15_xref_stream(stream) - + try: + xrefstream = self._read_pdf15_xref_stream(stream) + except Exception as e: + if TK.ROOT in self.trailer: + logger_warning( + f"Previous trailer can not be read {e.args}", + __name__, + ) + break + else: + raise PdfReadError(f"trailer can not be read {e.args}") trailer_keys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID for key in trailer_keys: if key in xrefstream and key not in self.trailer: From 9854643665a6784d6a1c2bce302780adb9b0053b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 31 Aug 2022 23:12:35 +0200 Subject: [PATCH 15/24] adjust and extend test --- tests/test_reader.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 314de4c8c..291f8ffe6 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1103,12 +1103,16 @@ def test_corrupted_xref_table(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.pages[0].extract_text() + # slightly different url = "https://github.com/py-pdf/PyPDF2/files/9444748/BreezeManual.failed.pdf" name = "BreezeMan2.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - try: - reader.pages[0].extract_text() - except Exception: - pass # Exception normal - else: - raise Exception("page 0 should not be corrupted") + + +def test_reader(): + # iss #1273 + url = "https://github.com/py-pdf/PyPDF2/files/9464742/shiv_resume.pdf" + name = "shiv_resume.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader.pages[0].extract_text() + # TODO : rerun a second time the extraction to see there is no log the second time From 1051f65d90ca6b57909a8d63c8d66bdce65bd1cd Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 31 Aug 2022 23:31:24 +0200 Subject: [PATCH 16/24] ROB : improve extraction discard non readable XRef object to try to do your best --- PyPDF2/_reader.py | 8 +++++++- tests/test_xmp.py | 6 ++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index c28698203..bf43653ca 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1553,7 +1553,13 @@ def _read_xref(self, stream: StreamType) -> Optional[int]: if "/XRefStm" in new_trailer: p = stream.tell() stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) - self._read_pdf15_xref_stream(stream) + try: + self._read_pdf15_xref_stream(stream) + except Exception as e: + logger_warning( + f"XRef object at {new_trailer['/XRefStm']} can not be read, some object may be missing", + __name__, + ) stream.seek(p, 0) if "/Prev" in new_trailer: startxref = new_trailer["/Prev"] diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 35a797078..144e5d054 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -172,10 +172,8 @@ def test_dc_subject(): def test_issue585(): url = "https://github.com/mstamy2/PyPDF2/files/5536984/test.pdf" name = "mstamy2-5536984.pdf" - with pytest.raises(PdfReadError) as exc: - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - reader.xmp_metadata - assert exc.value.args[0].startswith("Stream length not defined") + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader.xmp_metadata # def test_getter_bag(): From 8cc00ae48c497bf2fb64ace7d7c5e375b71256b8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 31 Aug 2022 23:45:31 +0200 Subject: [PATCH 17/24] flake8 + merge --- PyPDF2/_reader.py | 8 +++++--- tests/test_xmp.py | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index af326319f..477c139c0 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -327,7 +327,9 @@ def metadata(self) -> Optional[DocumentInformation]: obj = self.trailer[TK.INFO] retval = DocumentInformation() if isinstance(obj, type(None)): - raise PdfReadError("trailer not found or does not point to document information directory") + raise PdfReadError( + "trailer not found or does not point to document information directory" + ) retval.update(obj) # type: ignore return retval @@ -1133,7 +1135,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: self.stream.seek(start, 0) try: idnum, generation = self.read_object_header(self.stream) - except Exception as e: + except Exception: m = re.search( rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), bytes(self.stream.getbuffer()), @@ -1557,7 +1559,7 @@ def _read_xref(self, stream: StreamType) -> Optional[int]: stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) try: self._read_pdf15_xref_stream(stream) - except Exception as e: + except Exception: logger_warning( f"XRef object at {new_trailer['/XRefStm']} can not be read, some object may be missing", __name__, diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 144e5d054..cd8385234 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -7,7 +7,6 @@ import PyPDF2.generic import PyPDF2.xmp from PyPDF2 import PdfReader -from PyPDF2.errors import PdfReadError from . import get_pdf_from_url From 26678b17c15b8e420e9456aa991561db02f05328 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 1 Sep 2022 00:18:55 +0200 Subject: [PATCH 18/24] cope with stream without getbuffer() + restore tst_xmp --- PyPDF2/_reader.py | 16 ++++++++++++++-- tests/test_xmp.py | 7 +++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 477c139c0..bfc47ee33 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1136,9 +1136,15 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: try: idnum, generation = self.read_object_header(self.stream) except Exception: + if hasattr(self.stream, "getbuffer"): + buf = bytes(self.stream.getbuffer()) + else: + p = self.stream.tell() + buf = self.stream.read(-1) + self.stream.seek(p, 0) m = re.search( rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), - bytes(self.stream.getbuffer()), + buf, ) if m is not None: logger_warning( @@ -1183,9 +1189,15 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: retval, indirect_reference.idnum, indirect_reference.generation ) else: + if hasattr(self.stream, "getbuffer"): + buf = bytes(self.stream.getbuffer()) + else: + p = self.stream.tell() + buf = self.stream.read(-1) + self.stream.seek(p, 0) m = re.search( rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), - bytes(self.stream.getbuffer()), + buf, ) if m is not None: logger_warning( diff --git a/tests/test_xmp.py b/tests/test_xmp.py index cd8385234..62e49bd37 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -7,6 +7,7 @@ import PyPDF2.generic import PyPDF2.xmp from PyPDF2 import PdfReader +from PyPDF2.errors import PdfReadError from . import get_pdf_from_url @@ -171,8 +172,10 @@ def test_dc_subject(): def test_issue585(): url = "https://github.com/mstamy2/PyPDF2/files/5536984/test.pdf" name = "mstamy2-5536984.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - reader.xmp_metadata + with pytest.raises(PdfReadError) as exc: + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader.xmp_metadata + assert exc.value.args[0].startswith("XML in XmpInformation was invalid") # def test_getter_bag(): From 44bf5bdd6c29952543120a5a3478ce975c01e101 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 1 Sep 2022 00:32:52 +0200 Subject: [PATCH 19/24] mypy --- PyPDF2/_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index bfc47ee33..558d386b6 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -1137,7 +1137,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: idnum, generation = self.read_object_header(self.stream) except Exception: if hasattr(self.stream, "getbuffer"): - buf = bytes(self.stream.getbuffer()) + buf = bytes(self.stream.getbuffer()) # type: ignore else: p = self.stream.tell() buf = self.stream.read(-1) @@ -1190,7 +1190,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: ) else: if hasattr(self.stream, "getbuffer"): - buf = bytes(self.stream.getbuffer()) + buf = bytes(self.stream.getbuffer()) # type: ignore else: p = self.stream.tell() buf = self.stream.read(-1) From 1b17f72e35f2e1d4b4ea8178a5a4ad8e5b9e4b43 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 1 Sep 2022 09:38:34 +0200 Subject: [PATCH 20/24] extend test complete TODO test --- tests/test_reader.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 910604064..20e237342 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -101,13 +101,15 @@ def test_read_metadata(pdf_path, expected): @pytest.mark.parametrize( - "pdf_path", - [EXTERNAL_ROOT / "017-unreadable-meta-data/unreadablemetadata.pdf"] + "pdf_path", [EXTERNAL_ROOT / "017-unreadable-meta-data/unreadablemetadata.pdf"] ) def test_broken_meta_data(pdf_path): - with open(pdf_path, 'rb') as f: + with open(pdf_path, "rb") as f: reader = PdfReader(f) - with pytest.raises(PdfReadError, match=r"trailer not found or does not point to document information directory"): + with pytest.raises( + PdfReadError, + match=r"trailer not found or does not point to document information directory", + ): reader.metadata @@ -1119,10 +1121,18 @@ def test_corrupted_xref_table(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) -def test_reader(): +def test_reader(caplog): # iss #1273 url = "https://github.com/py-pdf/PyPDF2/files/9464742/shiv_resume.pdf" name = "shiv_resume.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + assert "Previous trailer can not be read" in caplog.text + caplog.clear() + # first call requires some reparations... reader.pages[0].extract_text() - # TODO : rerun a second time the extraction to see there is no log the second time + assert "repaired" in caplog.text + assert "found" in caplog.text + caplog.clear() + # ...and now no more required + reader.pages[0].extract_text() + assert caplog.text == "" From b71c071519b2668a6f67e676be58ec4b92bd65ff Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 3 Sep 2022 09:41:41 +0200 Subject: [PATCH 21/24] Update tests/test_reader.py Co-authored-by: Martin Thoma --- tests/test_reader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 20e237342..a50a52e4d 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1115,7 +1115,6 @@ def test_corrupted_xref_table(): name = "BreezeMan1.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.pages[0].extract_text() - # slightly different url = "https://github.com/py-pdf/PyPDF2/files/9444748/BreezeManual.failed.pdf" name = "BreezeMan2.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) From cfa33eed1432b2970bdfa8367bad1f0c4ec258b6 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 3 Sep 2022 09:47:33 +0200 Subject: [PATCH 22/24] Update tests/test_reader.py Co-authored-by: Martin Thoma --- tests/test_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index a50a52e4d..96cc77234 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1118,7 +1118,7 @@ def test_corrupted_xref_table(): url = "https://github.com/py-pdf/PyPDF2/files/9444748/BreezeManual.failed.pdf" name = "BreezeMan2.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - + reader.pages[0].extract_text() def test_reader(caplog): # iss #1273 From 27fbc7f7df91302c903c08c63133c977de7090a5 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 3 Sep 2022 09:52:42 +0200 Subject: [PATCH 23/24] Flake8: Add missing newline --- tests/test_reader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_reader.py b/tests/test_reader.py index 96cc77234..fde3f3ffe 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1120,6 +1120,7 @@ def test_corrupted_xref_table(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.pages[0].extract_text() + def test_reader(caplog): # iss #1273 url = "https://github.com/py-pdf/PyPDF2/files/9464742/shiv_resume.pdf" From 4edf6f8b03c69be380c63e8ee6d2b4b137cb7918 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 3 Sep 2022 10:20:34 +0200 Subject: [PATCH 24/24] rollback iaw review --- tests/test_xmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 62e49bd37..a53b27b0e 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -172,8 +172,8 @@ def test_dc_subject(): def test_issue585(): url = "https://github.com/mstamy2/PyPDF2/files/5536984/test.pdf" name = "mstamy2-5536984.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) with pytest.raises(PdfReadError) as exc: - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.xmp_metadata assert exc.value.args[0].startswith("XML in XmpInformation was invalid")