From b449664b0396163d809b5335b48c68fe460d4b20 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 3 May 2024 23:04:15 +0200 Subject: [PATCH 01/42] ROB: improve inline image extraction closes #2598 --- pypdf/generic/_data_structures.py | 99 +++++------- pypdf/generic/_image_inline.py | 242 ++++++++++++++++++++++++++++++ tests/test_images.py | 12 ++ 3 files changed, 294 insertions(+), 59 deletions(-) create mode 100644 pypdf/generic/_image_inline.py diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 3ca761403..c70f5421a 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -49,7 +49,6 @@ from .._protocols import PdfReaderProtocol, PdfWriterProtocol, XmpInformationProtocol from .._utils import ( - WHITESPACES, StreamType, b_, deprecate_no_replacement, @@ -81,6 +80,13 @@ TextStringObject, ) from ._fit import Fit +from ._image_inline import ( + extract_inline_A85, + extract_inline_AHex, + extract_inline_DCT, + extract_inline_default, + extract_inline_RL, +) from ._utils import read_hex_string_from_stream, read_string_from_stream if sys.version_info >= (3, 11): @@ -1152,65 +1158,40 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # left at beginning of ID tmp = stream.read(3) assert tmp[:2] == b"ID" - data = BytesIO() - # Read the inline image, while checking for EI (End Image) operator. - while True: - # Read 8 kB at a time and check if the chunk contains the E operator. - buf = stream.read(8192) - # We have reached the end of the stream, but haven't found the EI operator. - if not buf: - raise PdfReadError("Unexpected end of stream") - loc = buf.find( - b"E" - ) # we can not look straight for "EI" because it may not have been loaded in the buffer - - if loc == -1: - data.write(buf) + filtr = settings.get("/F", "not set") + # print("inline", stream.tell(),filtr,"*",settings) + if isinstance(filtr, list): + filtr = filtr[0] # used forencoding + if filtr == "AHx": + data = extract_inline_AHex(stream) + elif filtr == "A85": + data = extract_inline_A85(stream) + elif filtr == "RL": + data = extract_inline_RL(stream) + elif filtr == "DCT": + data = extract_inline_DCT(stream) + elif filtr == "not set": + cs = settings["/CS"] + if cs == "/I" or cs == "/G": + lcs = 1 + elif cs == "/RGB": + lcs = 3 + elif cs == "/CMYK": + lcs = 4 else: - # Write out everything before the E. - data.write(buf[0:loc]) - - # Seek back in the stream to read the E next. - stream.seek(loc - len(buf), 1) - tok = stream.read(1) # E of "EI" - # Check for End Image - tok2 = stream.read(1) # I of "EI" - if tok2 != b"I": - stream.seek(-1, 1) - data.write(tok) - continue - # for further debug : print("!!!!",buf[loc-1:loc+10]) - info = tok + tok2 - tok3 = stream.read( - 1 - ) # possible space after "EI" may not been loaded in buf - if tok3 not in WHITESPACES: - stream.seek(-2, 1) # to step back on I - data.write(tok) - elif buf[loc - 1 : loc] in WHITESPACES: # and tok3 in WHITESPACES: - # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required. - while tok3 in WHITESPACES: - # needed ???? : info += tok3 - tok3 = stream.read(1) - stream.seek(-1, 1) - # we do not insert EI - break - else: # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES: - # Data can contain [!\s]EI[\s], so check for Q or EMC operator is required to have 4 chars. - while tok3 in WHITESPACES: - info += tok3 - tok3 = stream.read(1) - stream.seek(-1, 1) - if tok3 == b"Q": - break - elif tok3 == b"E": - ope = stream.read(3) - stream.seek(-3, 1) - if ope == b"EMC": - break - else: - data.write(info) - return {"settings": settings, "data": data.getvalue()} + raise PdfReadError("Invalid CS value:", cs) + data = stream.read( + cast(int, settings["/W"]) * cast(int, settings["/H"]) * lcs + ) + ei = read_non_whitespace(stream) + ei += stream.read(1) + stream.seek(-2, 1) + else: + data = extract_inline_default(stream) + + ei = stream.read(2) + assert ei == b"EI" + return {"settings": settings, "data": data} # This overrides the parent method: def get_data(self) -> bytes: diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py new file mode 100644 index 000000000..4c1ab1b62 --- /dev/null +++ b/pypdf/generic/_image_inline.py @@ -0,0 +1,242 @@ +# Copyright (c) 2024, PubPub-ZZ +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import logging +from io import BytesIO + +from .._utils import ( + WHITESPACES, + StreamType, + read_non_whitespace, +) +from ..errors import PdfReadError + +logger = logging.getLogger(__name__) + +BUFFER_SIZE = 8192 + + +def extract_inline_AHex(stream: StreamType) -> bytes: + """ + Extract HexEncoded Stream from Inline Image. + the stream will be moved onto the EI + """ + data: bytes = b"" + # Read data until delimiter > and EI as backup + # ignoring backup. + while True: + buf = stream.read(BUFFER_SIZE) + if not buf: + raise PdfReadError("Unexpected end of stream") + loc = buf.find(b">") + if loc >= 0: # found > + data += buf[: (loc + 1)] + stream.seek(-BUFFER_SIZE + loc + 1) + break + loc = buf.find(b"EI") + if loc >= 0: # found EI + stream.seek(-BUFFER_SIZE + loc - 1, 1) + c = stream.read(1) + while c in WHITESPACES: + stream.seek(-2, 1) + c = stream.read(1) + loc -= 1 + data += buf[:loc] + else: # > nor EI found + data += buf[:-1] + stream.seek(-1, 1) + + ei = read_non_whitespace(stream) + ei += stream.read(1) + stream.seek(-2, 1) + if ei != b"EI": + raise PdfReadError("EI stream not found") + return data + + +def extract_inline_A85(stream: StreamType) -> bytes: + """ + Extract A85 Stream from Inline Image. + the stream will be moved onto the EI + """ + data: bytes = b"" + # Read data up to delimiter ~> + # see §3.3.2 from PDF ref 1.7 + while True: + buf = stream.read(BUFFER_SIZE) + if not buf: + raise PdfReadError("Unexpected end of stream") + loc = buf.find(b"~>") + if loc >= 0: # found! + data += buf[: loc + 2] + stream.seek(-BUFFER_SIZE + loc + 2, 1) + break + data += buf[:-1] # back by one char in case of in the middle of ~> + stream.seek(-1, 1) + + ei = read_non_whitespace(stream) + ei += stream.read(1) + stream.seek(-2, 1) + if ei != b"EI": + raise PdfReadError("EI stream not found") + return data + + +def extract_inline_RL(stream: StreamType) -> bytes: + """ + Extract RL Stream from Inline Image. + the stream will be moved onto the EI + """ + data: bytes = b"" + # Read data up to delimiter ~> + # see §3.3.4 from PDF ref 1.7 + while True: + buf = stream.read(BUFFER_SIZE) + if not buf: + raise PdfReadError("Unexpected end of stream") + loc = buf.find(b"\x80") + if loc >= 0: # found + data = buf[: loc + 1] + stream.seek(-BUFFER_SIZE + loc + 1, 1) + break + data += buf # back by one char in case of in the middle of ~> + + data += buf[:loc] + ei = read_non_whitespace(stream) + ei += stream.read(1) + stream.seek(-2, 1) + if ei != b"EI": + raise PdfReadError("EI stream not found") + return data + + +def extract_inline_DCT(stream: StreamType) -> bytes: + """ + Extract DCT (JPEG) Stream from Inline Image. + the stream will be moved onto the EI + """ + data: bytes = b"" + # Read Blocks of data (ID/Size/data) up to ID=FF/D9 + # see https://www.digicamsoft.com/itu/itu-t81-36.html + while True: + c = stream.read(1) + data += c + if c != b"\xff": + continue + c = stream.read(1) + if c == b"\xff": + stream.seek(-1, 1) + elif c == b"\x00": # stuffing + data += c + elif c == b"\xd9": # end + data += c + break + elif c in ( + b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf" + b"\xda\xdb\xdc\xdd\xde\xdf" + b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe" + ): + data += c + c = stream.read(2) + data += c + sz = ord(c[0]) * 256 + c[1] + data += stream.read(sz - 2) + else: + data += c + + ei = read_non_whitespace(stream) + ei += stream.read(1) + stream.seek(-2, 1) + if ei != b"EI": + raise PdfReadError("EI stream not found") + return data + + +def extract_inline_default(stream: StreamType) -> bytes: + """ + Legacy method + used by default + """ + data = BytesIO() + # Read the inline image, while checking for EI (End Image) operator. + while True: + buf = stream.read(BUFFER_SIZE) + if not buf: + raise PdfReadError("Unexpected end of stream") + loc = buf.find( + b"E" + ) # we can not look straight for "EI" because it may not have been loaded in the buffer + + if loc == -1: + data.write(buf) + else: + # Write out everything before the E. + data.write(buf[0:loc]) + + # Seek back in the stream to read the E next. + stream.seek(loc - len(buf), 1) + saved_pos = stream.tell() + tok = stream.read(1) # E of "EI" + # Check for End Image + tok2 = stream.read(1) # I of "EI" + if tok2 != b"I": + stream.seek(-1, 1) + data.write(tok) + continue + # for further debug : print("!!!!",buf[loc-1:loc+10]) + info = tok + tok2 + tok3 = stream.read( + 1 + ) # possible space after "EI" may not been loaded in buf + if tok3 not in WHITESPACES: + stream.seek(-2, 1) # to step back on I + data.write(tok) + elif buf[loc - 1 : loc] in WHITESPACES: # and tok3 in WHITESPACES: + # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required. + while tok3 in WHITESPACES: + # needed ???? : info += tok3 + tok3 = stream.read(1) + stream.seek(-1, 1) + # we do not insert EI + break + else: # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES: + # Data can contain [!\s]EI[\s], so check for Q or EMC operator is required to have 4 chars. + while tok3 in WHITESPACES: + info += tok3 + tok3 = stream.read(1) + stream.seek(-1, 1) + if tok3 == b"Q": + break + elif tok3 == b"E": + ope = stream.read(3) + stream.seek(-3, 1) + if ope == b"EMC": + break + else: + data.write(info) + stream.seek(saved_pos, 0) + return data.getvalue() diff --git a/tests/test_images.py b/tests/test_images.py index ad694d669..148893abb 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -346,3 +346,15 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr): print(fn) # noqa: T201 img = Image.open(BytesIO(zf.read(fn))) assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99 + + +@pytest.mark.enable_socket() +def test_inline_image_extraction(): + """Cf #2598""" + url = "https://github.com/py-pdf/pypdf/files/14982414/lebo102.pdf" + name = "iss2598.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + # there is no error because images are correctly extracted + reader.pages[1].extract_text() + reader.pages[2].extract_text() + reader.pages[3].extract_text() From 44b41a7a78a0450d49253931f20eb5c7f20bff46 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 4 May 2024 15:04:07 +0200 Subject: [PATCH 02/42] fix --- pypdf/generic/_data_structures.py | 30 +++++++++++++++++++----------- tests/test_workflows.py | 4 +--- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index c70f5421a..9463efe38 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -33,6 +33,7 @@ import re import sys from io import BytesIO +from math import ceil from typing import ( Any, Callable, @@ -1159,29 +1160,34 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: tmp = stream.read(3) assert tmp[:2] == b"ID" filtr = settings.get("/F", "not set") + savpos = stream.tell() # print("inline", stream.tell(),filtr,"*",settings) if isinstance(filtr, list): filtr = filtr[0] # used forencoding - if filtr == "AHx": + if filtr == "AHx" or "ASCIIHexDecode" in filtr: data = extract_inline_AHex(stream) - elif filtr == "A85": + elif filtr == "A85" or "ASCII85Decode" in filtr: data = extract_inline_A85(stream) - elif filtr == "RL": + elif filtr == "RL" or "RunLengthDecode" in filtr: data = extract_inline_RL(stream) - elif filtr == "DCT": + elif filtr == "DCT" or "DCTDecode" in filtr: data = extract_inline_DCT(stream) elif filtr == "not set": - cs = settings["/CS"] - if cs == "/I" or cs == "/G": + cs = settings.get("/CS", "") + if cs == "/I" or cs == "/G" or cs == "/Indexed" or cs == "/DeviceGray": lcs = 1 - elif cs == "/RGB": + elif "RGB" in cs: lcs = 3 - elif cs == "/CMYK": + elif "CMYK" in cs: lcs = 4 else: - raise PdfReadError("Invalid CS value:", cs) + bits = settings.get("/BPC", -1) + if bits > 0: + lcs = bits / 8.0 + else: + raise PdfReadError("Invalid CS value:", cs) data = stream.read( - cast(int, settings["/W"]) * cast(int, settings["/H"]) * lcs + ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) ) ei = read_non_whitespace(stream) ei += stream.read(1) @@ -1190,7 +1196,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: data = extract_inline_default(stream) ei = stream.read(2) - assert ei == b"EI" + if ei != b"EI": + stream.seek(savpos, 0) + data = extract_inline_default(stream) return {"settings": settings, "data": data} # This overrides the parent method: diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 94e380dca..c79a36b51 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -935,9 +935,7 @@ def test_extra_test_iss1541(): stream = BytesIO() cs.write_to_stream(stream) stream.seek(0) - with pytest.raises(PdfReadError) as exc: - ContentStream(read_object(stream, None, None), None, None).operations - assert exc.value.args[0] == "Unexpected end of stream" + ContentStream(read_object(stream, None, None), None, None).operations b = BytesIO(data.getbuffer()) reader = PdfReader( From 0952fee4a34a46f80730bcbe6b8811129dbaf745 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 4 May 2024 16:39:57 +0200 Subject: [PATCH 03/42] complete testing --- pypdf/generic/_data_structures.py | 16 +++++++++------- tests/test_images.py | 6 ++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 9463efe38..941c020fb 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1185,13 +1185,15 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: if bits > 0: lcs = bits / 8.0 else: - raise PdfReadError("Invalid CS value:", cs) - data = stream.read( - ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) - ) - ei = read_non_whitespace(stream) - ei += stream.read(1) - stream.seek(-2, 1) + data = extract_inline_default(stream) + lcs = -1 + if lcs > 0: + data = stream.read( + ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) + ) + ei = read_non_whitespace(stream) + ei += stream.read(1) + stream.seek(-2, 1) else: data = extract_inline_default(stream) diff --git a/tests/test_images.py b/tests/test_images.py index 148893abb..4bfcfd0a3 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -358,3 +358,9 @@ def test_inline_image_extraction(): reader.pages[1].extract_text() reader.pages[2].extract_text() reader.pages[3].extract_text() + + url = "https://github.com/py-pdf/pypdf/files/15210011/Pages.62.73.from.0560-22_WSP.Plan_July.2022_Version.1.pdf" + name = "iss2598a.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].extract_text() + reader.pages[1].extract_text() From 0ba5ae41d3617c7fd4e85911381a76898e632b44 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 5 May 2024 22:17:02 +0200 Subject: [PATCH 04/42] complete test --- pypdf/_page.py | 117 ++++++++++++++++++++++------------- pypdf/_xobj_image_helpers.py | 42 +++++++------ pypdf/filters.py | 2 +- tests/test_images.py | 9 +++ 4 files changed, 107 insertions(+), 63 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 47cbc866b..a0e5b96fc 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -82,6 +82,7 @@ NameObject, NullObject, NumberObject, + PdfObject, RectangleObject, StreamObject, ) @@ -551,6 +552,46 @@ def images(self) -> List[ImageFile]: """ return _VirtualListImages(self._get_ids_image, self._get_image) # type: ignore + def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject: + """Translate values used in inline image""" + try: + v = NameObject( + { + "/G": "/DeviceGray", + "/RGB": "/DeviceRGB", + "/CMYK": "/DeviceCMYK", + "/I": "/Indexed", + "/AHx": "/ASCIIHexDecode", + "/A85": "/ASCII85Decode", + "/LZW": "/LZWDecode", + "/Fl": "/FlateDecode", + "/RL": "/RunLengthDecode", + "/CCF": "/CCITTFaxDecode", + "/DCT": "/DCTDecode", + "/DeviceGray": "/DeviceGray", + "/DeviceRGB": "/DeviceRGB", + "/DeviceCMYK": "/DeviceCMYK", + "/Indexed": "/Indexed", + "/ASCIIHexDecode": "/ASCIIHexDecode", + "/ASCII85Decode": "/ASCII85Decode", + "/LZWDecode": "/LZWDecode", + "/FlateDecode": "/FlateDecode", + "/RunLengthDecode": "/RunLengthDecode", + "/CCITTFaxDecode": "/CCITTFaxDecode", + "/DCTDecode": "/DCTDecode", + }[cast(str, v)] + ) + except (TypeError, KeyError): + if isinstance(v, NameObject): + # it is a custom name : we have to look in resources : + # the only applicable case is for ColorSpace + try: + res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"] + v = cast(DictionaryObject, res)[v] + except KeyError: # for res and v + raise PdfReadError(f"Can not find resource entry {v} for {k}") + return v + def _get_inline_images(self) -> Dict[str, ImageFile]: """ get inline_images @@ -593,51 +634,39 @@ def _get_inline_images(self) -> Dict[str, ImageFile]: "/Length": len(ii["__streamdata__"]), } for k, v in ii["settings"].items(): - try: - v = NameObject( - { - "/G": "/DeviceGray", - "/RGB": "/DeviceRGB", - "/CMYK": "/DeviceCMYK", - "/I": "/Indexed", - "/AHx": "/ASCIIHexDecode", - "/A85": "/ASCII85Decode", - "/LZW": "/LZWDecode", - "/Fl": "/FlateDecode", - "/RL": "/RunLengthDecode", - "/CCF": "/CCITTFaxDecode", - "/DCT": "/DCTDecode", - }[v] + if k in ("/Length", "/L"): # no length is expected + continue + if isinstance(v, list): + v = ArrayObject( + [self._translate_value_inlineimage(k, x) for x in v] ) - except (TypeError, KeyError): - if isinstance(v, NameObject): - # it is a custom name : we have to look in resources : - # the only applicable case is for ColorSpace - try: - res = cast(DictionaryObject, self["/Resources"])[ - "/ColorSpace" - ] - v = cast(DictionaryObject, res)[v] - except KeyError: # for res and v - raise PdfReadError( - f"Can not find resource entry {v} for {k}" - ) - init[ - NameObject( - { - "/BPC": "/BitsPerComponent", - "/CS": "/ColorSpace", - "/D": "/Decode", - "/DP": "/DecodeParms", - "/F": "/Filter", - "/H": "/Height", - "/W": "/Width", - "/I": "/Interpolate", - "/Intent": "/Intent", - "/IM": "/ImageMask", - }[k] - ) - ] = v + else: + v = self._translate_value_inlineimage(k, v) + k = NameObject( + { + "/BPC": "/BitsPerComponent", + "/CS": "/ColorSpace", + "/D": "/Decode", + "/DP": "/DecodeParms", + "/F": "/Filter", + "/H": "/Height", + "/W": "/Width", + "/I": "/Interpolate", + "/Intent": "/Intent", + "/IM": "/ImageMask", + "/BitsPerComponent": "/BitsPerComponent", + "/ColorSpace": "/ColorSpace", + "/Decode": "/Decode", + "/DecodeParms": "/DecodeParms", + "/Filter": "/Filter", + "/Height": "/Height", + "/Width": "/Width", + "/Interpolate": "/Interpolate", + "/ImageMask": "/ImageMask", + }[k] + ) + if k not in init: + init[k] = v ii["object"] = EncodedStreamObject.initialize_from_dictionary(init) extension, byte_stream, img = _xobj_to_image(ii["object"]) files[f"~{num}~"] = ImageFile( diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index cc0123ff2..cd1cdca17 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -123,10 +123,34 @@ def _get_imagemode( return mode, mode == "CMYK" +def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: + mask = (1 << bits) - 1 + nbuff = bytearray(size[0] * size[1]) + by = 0 + bit = 8 - bits + for y in range(size[1]): + if (bit != 0) and (bit != 8 - bits): + by += 1 + bit = 8 - bits + for x in range(size[0]): + nbuff[y * size[0] + x] = (data[by] >> bit) & mask + bit -= bits + if bit < 0: + by += 1 + bit = 8 - bits + return bytes(nbuff) + + def _extended_image_frombytes( mode: str, size: Tuple[int, int], data: bytes ) -> Image.Image: try: + if mode == "2bits": + mode = "P" + data = bits2byte(data, size, 2) + elif mode == "4bits": + mode = "P" + data = bits2byte(data, size, 4) img = Image.frombytes(mode, size, data) except ValueError as exc: nb_pix = size[0] * size[1] @@ -150,24 +174,6 @@ def _handle_flate( Process image encoded in flateEncode Returns img, image_format, extension, color inversion """ - - def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: - mask = (2 << bits) - 1 - nbuff = bytearray(size[0] * size[1]) - by = 0 - bit = 8 - bits - for y in range(size[1]): - if (bit != 0) and (bit != 8 - bits): - by += 1 - bit = 8 - bits - for x in range(size[0]): - nbuff[y * size[0] + x] = (data[by] >> bit) & mask - bit -= bits - if bit < 0: - by += 1 - bit = 8 - bits - return bytes(nbuff) - extension = ".png" # mime_type = "image/png" image_format = "PNG" lookup: Any diff --git a/pypdf/filters.py b/pypdf/filters.py index d62cf7842..26d71229d 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -803,7 +803,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, # I'm not sure if the following logic is correct. # There might not be any relationship between the filters and the # extension - if x_object_obj[SA.FILTER] in [[FT.LZW_DECODE], [FT.CCITT_FAX_DECODE]]: + if lfilters in (FT.LZW_DECODE, FT.CCITT_FAX_DECODE): extension = ".tiff" # mime_type = "image/tiff" image_format = "TIFF" else: diff --git a/tests/test_images.py b/tests/test_images.py index 4bfcfd0a3..90732e8f8 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -364,3 +364,12 @@ def test_inline_image_extraction(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].extract_text() reader.pages[1].extract_text() + + url = "https://github.com/mozilla/pdf.js/raw/master/test/pdfs/issue14256.pdf" + name = "iss2598b.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/assets/4083478/71bc5053-cfc7-44ba-b7be-8e2333e2c749" + name = "iss2598b.png" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + for i in range(8): + assert image_similarity(reader.pages[0].images[i].image, img) == 1 From fdbc0923adee909e4ffc2f476f5d8f77f2c75e21 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 5 May 2024 22:40:57 +0200 Subject: [PATCH 05/42] tests --- tests/test_images.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_images.py b/tests/test_images.py index 90732e8f8..e860aece1 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -373,3 +373,4 @@ def test_inline_image_extraction(): img = Image.open(BytesIO(get_data_from_url(url, name=name))) for i in range(8): assert image_similarity(reader.pages[0].images[i].image, img) == 1 + reader.pages[0].extract_text() From fd57ef7803f971b1cced234aadb3c3e5f6ad6b9c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 6 May 2024 08:51:43 +0200 Subject: [PATCH 06/42] fix --- pypdf/generic/_data_structures.py | 14 +++++++------- pypdf/generic/_image_inline.py | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 941c020fb..f3d6b7868 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1164,13 +1164,13 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # print("inline", stream.tell(),filtr,"*",settings) if isinstance(filtr, list): filtr = filtr[0] # used forencoding - if filtr == "AHx" or "ASCIIHexDecode" in filtr: + if "AHx" in filtr or "ASCIIHexDecode" in filtr: data = extract_inline_AHex(stream) - elif filtr == "A85" or "ASCII85Decode" in filtr: + elif "A85" in filtr or "ASCII85Decode" in filtr: data = extract_inline_A85(stream) - elif filtr == "RL" or "RunLengthDecode" in filtr: + elif "RL" in filtr or "RunLengthDecode" in filtr: data = extract_inline_RL(stream) - elif filtr == "DCT" or "DCTDecode" in filtr: + elif "DCT" in filtr or "DCTDecode" in filtr: data = extract_inline_DCT(stream) elif filtr == "not set": cs = settings.get("/CS", "") @@ -1191,9 +1191,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: data = stream.read( ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) ) - ei = read_non_whitespace(stream) - ei += stream.read(1) - stream.seek(-2, 1) + ei = read_non_whitespace(stream) + ei += stream.read(1) + stream.seek(-2, 1) else: data = extract_inline_default(stream) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 4c1ab1b62..0ca1b5d1c 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -55,11 +55,11 @@ def extract_inline_AHex(stream: StreamType) -> bytes: loc = buf.find(b">") if loc >= 0: # found > data += buf[: (loc + 1)] - stream.seek(-BUFFER_SIZE + loc + 1) + stream.seek(-len(buf) + loc + 1, 1) break loc = buf.find(b"EI") if loc >= 0: # found EI - stream.seek(-BUFFER_SIZE + loc - 1, 1) + stream.seek(-len(buf) + loc - 1, 1) c = stream.read(1) while c in WHITESPACES: stream.seek(-2, 1) @@ -93,7 +93,7 @@ def extract_inline_A85(stream: StreamType) -> bytes: loc = buf.find(b"~>") if loc >= 0: # found! data += buf[: loc + 2] - stream.seek(-BUFFER_SIZE + loc + 2, 1) + stream.seek(-len(buf) + loc + 2, 1) break data += buf[:-1] # back by one char in case of in the middle of ~> stream.seek(-1, 1) @@ -121,7 +121,7 @@ def extract_inline_RL(stream: StreamType) -> bytes: loc = buf.find(b"\x80") if loc >= 0: # found data = buf[: loc + 1] - stream.seek(-BUFFER_SIZE + loc + 1, 1) + stream.seek(-len(buf) + loc + 1, 1) break data += buf # back by one char in case of in the middle of ~> From 70f9c02ffc35db0c29f5f3ca4ce12e341634fa0d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 7 May 2024 12:09:44 +0200 Subject: [PATCH 07/42] fix DCT --- pypdf/generic/_data_structures.py | 14 +++++++------- pypdf/generic/_image_inline.py | 24 +++++++++++++----------- tests/test_images.py | 8 ++++++++ 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 941c020fb..f3d6b7868 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1164,13 +1164,13 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # print("inline", stream.tell(),filtr,"*",settings) if isinstance(filtr, list): filtr = filtr[0] # used forencoding - if filtr == "AHx" or "ASCIIHexDecode" in filtr: + if "AHx" in filtr or "ASCIIHexDecode" in filtr: data = extract_inline_AHex(stream) - elif filtr == "A85" or "ASCII85Decode" in filtr: + elif "A85" in filtr or "ASCII85Decode" in filtr: data = extract_inline_A85(stream) - elif filtr == "RL" or "RunLengthDecode" in filtr: + elif "RL" in filtr or "RunLengthDecode" in filtr: data = extract_inline_RL(stream) - elif filtr == "DCT" or "DCTDecode" in filtr: + elif "DCT" in filtr or "DCTDecode" in filtr: data = extract_inline_DCT(stream) elif filtr == "not set": cs = settings.get("/CS", "") @@ -1191,9 +1191,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: data = stream.read( ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) ) - ei = read_non_whitespace(stream) - ei += stream.read(1) - stream.seek(-2, 1) + ei = read_non_whitespace(stream) + ei += stream.read(1) + stream.seek(-2, 1) else: data = extract_inline_default(stream) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 4c1ab1b62..17f1e9c97 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -55,11 +55,11 @@ def extract_inline_AHex(stream: StreamType) -> bytes: loc = buf.find(b">") if loc >= 0: # found > data += buf[: (loc + 1)] - stream.seek(-BUFFER_SIZE + loc + 1) + stream.seek(-len(buf) + loc + 1, 1) break loc = buf.find(b"EI") if loc >= 0: # found EI - stream.seek(-BUFFER_SIZE + loc - 1, 1) + stream.seek(-len(buf) + loc - 1, 1) c = stream.read(1) while c in WHITESPACES: stream.seek(-2, 1) @@ -93,7 +93,7 @@ def extract_inline_A85(stream: StreamType) -> bytes: loc = buf.find(b"~>") if loc >= 0: # found! data += buf[: loc + 2] - stream.seek(-BUFFER_SIZE + loc + 2, 1) + stream.seek(-len(buf) + loc + 2, 1) break data += buf[:-1] # back by one char in case of in the middle of ~> stream.seek(-1, 1) @@ -121,7 +121,7 @@ def extract_inline_RL(stream: StreamType) -> bytes: loc = buf.find(b"\x80") if loc >= 0: # found data = buf[: loc + 1] - stream.seek(-BUFFER_SIZE + loc + 1, 1) + stream.seek(-len(buf) + loc + 1, 1) break data += buf # back by one char in case of in the middle of ~> @@ -142,31 +142,33 @@ def extract_inline_DCT(stream: StreamType) -> bytes: data: bytes = b"" # Read Blocks of data (ID/Size/data) up to ID=FF/D9 # see https://www.digicamsoft.com/itu/itu-t81-36.html + notfirst = False while True: c = stream.read(1) - data += c + if notfirst or (c == b"\xff"): + data += c if c != b"\xff": continue + else: + notfirst = True c = stream.read(1) + data += c if c == b"\xff": stream.seek(-1, 1) elif c == b"\x00": # stuffing - data += c + pass elif c == b"\xd9": # end - data += c break elif c in ( b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf" b"\xda\xdb\xdc\xdd\xde\xdf" b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe" ): - data += c c = stream.read(2) data += c - sz = ord(c[0]) * 256 + c[1] + sz = c[0] * 256 + c[1] data += stream.read(sz - 2) - else: - data += c + # else: pass ei = read_non_whitespace(stream) ei += stream.read(1) diff --git a/tests/test_images.py b/tests/test_images.py index e860aece1..3674a6870 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -374,3 +374,11 @@ def test_inline_image_extraction(): for i in range(8): assert image_similarity(reader.pages[0].images[i].image, img) == 1 reader.pages[0].extract_text() + + url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" + name = "iss2598c.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/assets/4083478/bfb221be-11bd-46fe-8129-55a58088a4b6" + name = "iss2598c.jpg" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + image_similarity(reader.pages[0].images[0].image, img) >= 0.99 From 8996a739fad674bfab6e94da640e2fdcf3acaa9c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 7 May 2024 12:22:35 +0200 Subject: [PATCH 08/42] Fix A85 --- pypdf/_utils.py | 3 +- pypdf/_xobj_image_helpers.py | 2 +- pypdf/filters.py | 57 ++++++++++++++++++------------- pypdf/generic/_data_structures.py | 3 +- pypdf/generic/_image_inline.py | 4 +-- tests/test_filters.py | 10 +++--- 6 files changed, 45 insertions(+), 34 deletions(-) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 366a24eb4..d98162205 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -391,7 +391,8 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00") -WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]" +WHITESPACES_AS_BYTES = b"".join(WHITESPACES) +WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]" def paeth_predictor(left: int, up: int, up_left: int) -> int: diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index cd1cdca17..ba49f1179 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -29,7 +29,7 @@ try: - from PIL import Image + from PIL import Image, UnidentifiedImageError # noqa: F401 except ImportError: raise ImportError( "pillow is required to do image extraction. " diff --git a/pypdf/filters.py b/pypdf/filters.py index 26d71229d..cc47d051c 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -37,10 +37,12 @@ import math import struct import zlib +from base64 import a85decode, a85encode from io import BytesIO from typing import Any, Dict, List, Optional, Tuple, Union, cast from ._utils import ( + WHITESPACES_AS_BYTES, b_, deprecate_with_replacement, deprecation_no_replacement, @@ -462,7 +464,7 @@ def decode( Decode an LZW encoded data stream. Args: - data: bytes`` or ``str`` text to decode. + data: ``bytes`` or ``str`` text to decode. decode_parms: a dictionary of parameter values. Returns: @@ -482,29 +484,34 @@ def decode( decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, ) -> bytes: - # decode_parms is unused here + """ + Decode an Ascii85 encoded data stream. + + Args: + data: ``bytes`` or ``str`` text to decode. + decode_parms: a dictionary of parameter values. + Returns: + decoded data. + """ if isinstance(data, str): - data = data.encode("ascii") - group_index = b = 0 - out = bytearray() - for char in data: - if ord("!") <= char <= ord("u"): - group_index += 1 - b = b * 85 + (char - 33) - if group_index == 5: - out += struct.pack(b">L", b) - group_index = b = 0 - elif char == ord("z"): - assert group_index == 0 - out += b"\0\0\0\0" - elif char == ord("~"): - if group_index: - for _ in range(5 - group_index): - b = b * 85 + 84 - out += struct.pack(b">L", b)[: group_index - 1] - break - return bytes(out) + data = data.encode() + data = data.strip(WHITESPACES_AS_BYTES) + return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES) + + @staticmethod + def encode(data: bytes, level: int = -1) -> bytes: + """ + Compress the input data using A85 encoding in Adobe format. + + Args: + data: The data to be compressed. + level: See https://docs.python.org/3/library/zlib.html#zlib.compress + + Returns: + The compressed data. + """ + return a85encode(data, adobe=True, wrapcol=32) class DCTDecode: @@ -737,6 +744,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, """ from ._xobj_image_helpers import ( Image, + UnidentifiedImageError, _extended_image_frombytes, _get_imagemode, _handle_flate, @@ -809,7 +817,10 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, else: extension = ".png" # mime_type = "image/png" image_format = "PNG" - img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) + try: + img = Image.open(BytesIO(data), formats=("TIFF", "PNG")) + except UnidentifiedImageError: + img = _extended_image_frombytes(mode, size, data) elif lfilters == FT.DCT_DECODE: img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg" # invert_color kept unchanged diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index f3d6b7868..31594f439 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1159,8 +1159,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # left at beginning of ID tmp = stream.read(3) assert tmp[:2] == b"ID" - filtr = settings.get("/F", "not set") + filtr = settings.get("/F", settings.get("/Filter", "not set")) savpos = stream.tell() + # import pdb;pdb.set_trace() # print("inline", stream.tell(),filtr,"*",settings) if isinstance(filtr, list): filtr = filtr[0] # used forencoding diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 17f1e9c97..0fb75a586 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -49,7 +49,7 @@ def extract_inline_AHex(stream: StreamType) -> bytes: # Read data until delimiter > and EI as backup # ignoring backup. while True: - buf = stream.read(BUFFER_SIZE) + buf = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) if not buf: raise PdfReadError("Unexpected end of stream") loc = buf.find(b">") @@ -87,7 +87,7 @@ def extract_inline_A85(stream: StreamType) -> bytes: # Read data up to delimiter ~> # see §3.3.2 from PDF ref 1.7 while True: - buf = stream.read(BUFFER_SIZE) + buf = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) if not buf: raise PdfReadError("Unexpected end of stream") loc = buf.find(b"~>") diff --git a/tests/test_filters.py b/tests/test_filters.py index d3980be0b..146ce43cb 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -147,11 +147,10 @@ def test_decode_ahx(): _ = list(p.images.keys()) -@pytest.mark.xfail() def test_ascii85decode_with_overflow(): inputs = ( v + "~>" - for v in "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0e\x0f" + for v in "\x01\x02\x03\x04\x05\x06\x07\x08\x0e\x0f" "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a" "\x1b\x1c\x1d\x1e\x1fvwxy{|}~\x7f\x80\x81\x82" "\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d" @@ -161,9 +160,8 @@ def test_ascii85decode_with_overflow(): ) for i in inputs: - with pytest.raises(ValueError) as exc: + with pytest.raises(ValueError): ASCII85Decode.decode(i) - assert exc.value.args[0] == "" def test_ascii85decode_five_zero_bytes(): @@ -183,10 +181,10 @@ def test_ascii85decode_five_zero_bytes(): b"\x00\x00\x00\x00" * 3, ) - assert ASCII85Decode.decode("!!!!!") == ASCII85Decode.decode("z") + assert ASCII85Decode.decode("!!!!!~>") == ASCII85Decode.decode("z~>") for expected, i in zip(exp_outputs, inputs): - assert ASCII85Decode.decode(i) == expected + assert ASCII85Decode.decode(i + "~>") == expected def test_ccitparameters(): From 5b38f344713ceaea4f27139b50f86d5fb2dcb08c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 7 May 2024 13:27:55 +0200 Subject: [PATCH 09/42] blank --- tests/test_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index ff39189e0..837b9b8c2 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1408,7 +1408,7 @@ def test_iss1689(): @pytest.mark.enable_socket() def test_iss1710(): - url = "https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf" + url = "" name = "irbookonlinereading.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.outline From 67d51ea3e8b893988a334ff04475a7d263d40631 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 7 May 2024 13:31:38 +0200 Subject: [PATCH 10/42] with new link --- tests/test_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 837b9b8c2..83b61bc59 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1408,7 +1408,7 @@ def test_iss1689(): @pytest.mark.enable_socket() def test_iss1710(): - url = "" + url = "https://github.com/py-pdf/pypdf/files/15234776/irbookonlinereading.pdf" name = "irbookonlinereading.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.outline From 092e2a5fcef3878dead9caedcce83b097d3c7857 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 7 May 2024 13:36:16 +0200 Subject: [PATCH 11/42] fix test --- tests/test_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index 3674a6870..95ce5f413 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -381,4 +381,4 @@ def test_inline_image_extraction(): url = "https://github.com/py-pdf/pypdf/assets/4083478/bfb221be-11bd-46fe-8129-55a58088a4b6" name = "iss2598c.jpg" img = Image.open(BytesIO(get_data_from_url(url, name=name))) - image_similarity(reader.pages[0].images[0].image, img) >= 0.99 + assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99 From c5d62a344ddf98a345cd51eb4763819e56e5406f Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 8 May 2024 11:18:05 +0200 Subject: [PATCH 12/42] BUG: Incorrect number of inline images closes #2629 --- pypdf/_page.py | 24 ++++++------------------ tests/test_workflows.py | 5 +++++ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 6f44aa522..1cd665ee1 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -28,7 +28,6 @@ # POSSIBILITY OF SUCH DAMAGE. import math -import re import sys from decimal import Decimal from pathlib import Path @@ -58,7 +57,6 @@ mult, ) from ._utils import ( - WHITESPACES_AS_REGEXP, CompressedTransformationMatrix, File, ImageFile, @@ -335,7 +333,6 @@ def __init__( self.pdf = pdf self.inline_images: Optional[Dict[str, ImageFile]] = None # below Union for mypy but actually Optional[List[str]] - self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None self.indirect_reference = indirect_reference def hash_value_data(self) -> bytes: @@ -439,19 +436,8 @@ def _get_ids_image( return [] else: call_stack.append(_i) - if self.inline_images_keys is None: - content = self._get_contents_as_bytes() or b"" - nb_inlines = 0 - for matching in re.finditer( - WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP, - content, - ): - start_of_string = content[: matching.start()] - if len(re.findall(b"[^\\\\]\\(", start_of_string)) == len( - re.findall(b"[^\\\\]\\)", start_of_string) - ): - nb_inlines += 1 - self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)] + if self.inline_images is None: + self.inline_images = self._get_inline_images() if obj is None: obj = self if ancest is None: @@ -460,7 +446,7 @@ def _get_ids_image( if PG.RESOURCES not in obj or RES.XOBJECT not in cast( DictionaryObject, obj[PG.RESOURCES] ): - return self.inline_images_keys + return [] if self.inline_images is None else list(self.inline_images.keys()) x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore for o in x_object: @@ -470,7 +456,9 @@ def _get_ids_image( lst.append(o if len(ancest) == 0 else ancest + [o]) else: # is a form with possible images inside lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) - return lst + self.inline_images_keys + if self.inline_images is not None: + lst.extend(list(self.inline_images.keys())) + return lst def _get_image( self, diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 94e380dca..8e9c1b219 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1025,6 +1025,11 @@ def test_inline_images(): with pytest.raises(KeyError) as exc: reader.pages[2]._get_image(("test",)) + url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" + name = "iss2598c.pdf" # test coming from another test in test_image.py + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert len(reader.pages[0].images) == 3 + @pytest.mark.enable_socket() def test_iss(): From 51bea2cfa4458b21489de86e67ab04d6b47f84a6 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 11 May 2024 15:26:34 +0200 Subject: [PATCH 13/42] add test for RL + fix --- pypdf/generic/_image_inline.py | 1 - tests/test_images.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 0fb75a586..8836d1991 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -125,7 +125,6 @@ def extract_inline_RL(stream: StreamType) -> bytes: break data += buf # back by one char in case of in the middle of ~> - data += buf[:loc] ei = read_non_whitespace(stream) ei += stream.read(1) stream.seek(-2, 1) diff --git a/tests/test_images.py b/tests/test_images.py index 95ce5f413..56ceb418a 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -382,3 +382,11 @@ def test_inline_image_extraction(): name = "iss2598c.jpg" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/files/15282904/tt.pdf" + name = "iss2598d.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/assets/4083478/1a770e1b-9ad2-4125-89ae-6069992dda23" + name = "iss2598d.png" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(reader.pages[0].images[0].image, img) == 1 From bd8449600075e9bfd9a7809d08f0f87ef3ce8975 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 11 May 2024 16:25:42 +0200 Subject: [PATCH 14/42] remove encode as not used for the moment --- pypdf/filters.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pypdf/filters.py b/pypdf/filters.py index cc47d051c..896f9dd76 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -37,7 +37,7 @@ import math import struct import zlib -from base64 import a85decode, a85encode +from base64 import a85decode from io import BytesIO from typing import Any, Dict, List, Optional, Tuple, Union, cast @@ -499,9 +499,9 @@ def decode( data = data.strip(WHITESPACES_AS_BYTES) return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES) - @staticmethod + """@staticmethod def encode(data: bytes, level: int = -1) -> bytes: - """ + ''' Compress the input data using A85 encoding in Adobe format. Args: @@ -510,8 +510,9 @@ def encode(data: bytes, level: int = -1) -> bytes: Returns: The compressed data. - """ + ''' return a85encode(data, adobe=True, wrapcol=32) + """ class DCTDecode: From 770aabaf02b64c088219131fca89ea6d79e4fb8d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 11 May 2024 16:27:14 +0200 Subject: [PATCH 15/42] Fix + Test --- pypdf/generic/_image_inline.py | 10 +++++++--- tests/test_generic.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 8836d1991..4486f091c 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -66,9 +66,13 @@ def extract_inline_AHex(stream: StreamType) -> bytes: c = stream.read(1) loc -= 1 data += buf[:loc] + break + elif len(buf) == 2: + data += buf + break else: # > nor EI found - data += buf[:-1] - stream.seek(-1, 1) + data += buf[:-2] + stream.seek(-2, 1) ei = read_non_whitespace(stream) ei += stream.read(1) @@ -153,7 +157,7 @@ def extract_inline_DCT(stream: StreamType) -> bytes: c = stream.read(1) data += c if c == b"\xff": - stream.seek(-1, 1) + stream.seek(-1, 1) # pragma: no cover elif c == b"\x00": # stuffing pass elif c == b"\xd9": # end diff --git a/tests/test_generic.py b/tests/test_generic.py index 24da063a2..e42aae5b7 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -35,6 +35,11 @@ read_object, read_string_from_stream, ) +from pypdf.generic._image_inline import ( + extract_inline_A85, + extract_inline_AHex, + extract_inline_RL, +) from . import ReaderDummy, get_data_from_url @@ -883,7 +888,7 @@ def test_annotation_builder_highlight(pdf_file_path): FloatObject(705.4493), ] ), - printing=False + printing=False, ) writer.add_annotation(0, highlight_annotation) for annot in writer.pages[0]["/Annots"]: @@ -910,7 +915,7 @@ def test_annotation_builder_highlight(pdf_file_path): FloatObject(705.4493), ] ), - printing=True + printing=True, ) writer.add_annotation(1, highlight_annotation) for annot in writer.pages[1]["/Annots"]: @@ -1350,3 +1355,22 @@ def test_array_operators(): la = len(a) a -= 300 assert len(a) == la + + +def test_unitary_extract_inline_buffer_empty(): + with pytest.raises(PdfReadError): + extract_inline_AHex(BytesIO()) + with pytest.raises(PdfReadError): + extract_inline_A85(BytesIO()) + with pytest.raises(PdfReadError): + extract_inline_RL(BytesIO()) + + +def test_unitary_extract_inline_ahx(): + b = 16000 * b"00" + b += b" EI" + assert len(extract_inline_AHex(BytesIO(b))) == 16000 * 2 + b = 16000 * b"00" + b += b">" + with pytest.raises(PdfReadError): + extract_inline_AHex(BytesIO(b)) From a37b73f9acb0d461b054d3e15fc05cf432f7b538 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 11 May 2024 18:41:41 +0200 Subject: [PATCH 16/42] test+fix --- pypdf/generic/_image_inline.py | 3 +++ tests/test_generic.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 4486f091c..314c71c7f 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -99,6 +99,9 @@ def extract_inline_A85(stream: StreamType) -> bytes: data += buf[: loc + 2] stream.seek(-len(buf) + loc + 2, 1) break + elif len(buf) == 2: # end of buffer + data += buf + break data += buf[:-1] # back by one char in case of in the middle of ~> stream.seek(-1, 1) diff --git a/tests/test_generic.py b/tests/test_generic.py index e42aae5b7..6cdf2d352 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1,5 +1,6 @@ """Test the pypdf.generic module.""" +from base64 import a85encode from copy import deepcopy from io import BytesIO from pathlib import Path @@ -1360,8 +1361,12 @@ def test_array_operators(): def test_unitary_extract_inline_buffer_empty(): with pytest.raises(PdfReadError): extract_inline_AHex(BytesIO()) + with pytest.raises(PdfReadError): + extract_inline_AHex(BytesIO(4095 * b"00" + b" ")) with pytest.raises(PdfReadError): extract_inline_A85(BytesIO()) + with pytest.raises(PdfReadError): + extract_inline_A85(BytesIO(a85encode(b"1"))) with pytest.raises(PdfReadError): extract_inline_RL(BytesIO()) From 184e141b697160f6ccae392a0489b9113b13daa6 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 11 May 2024 18:52:25 +0200 Subject: [PATCH 17/42] test --- pypdf/generic/_image_inline.py | 2 +- tests/test_generic.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 314c71c7f..ac448c0a6 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -130,7 +130,7 @@ def extract_inline_RL(stream: StreamType) -> bytes: data = buf[: loc + 1] stream.seek(-len(buf) + loc + 1, 1) break - data += buf # back by one char in case of in the middle of ~> + data += buf ei = read_non_whitespace(stream) ei += stream.read(1) diff --git a/tests/test_generic.py b/tests/test_generic.py index 6cdf2d352..3ab68a569 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -39,6 +39,7 @@ from pypdf.generic._image_inline import ( extract_inline_A85, extract_inline_AHex, + extract_inline_DCT, extract_inline_RL, ) @@ -1369,6 +1370,10 @@ def test_unitary_extract_inline_buffer_empty(): extract_inline_A85(BytesIO(a85encode(b"1"))) with pytest.raises(PdfReadError): extract_inline_RL(BytesIO()) + with pytest.raises(PdfReadError): + extract_inline_RL(BytesIO(b"\x01\x01\x80")) + with pytest.raises(PdfReadError): + extract_inline_DCT(BytesIO(b"\xFF\xD9")) def test_unitary_extract_inline_ahx(): From 85e08bb66bf9504d19c0e8f6f5aed3bce9a2a460 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 11 May 2024 19:16:15 +0200 Subject: [PATCH 18/42] test + fix --- pypdf/generic/_image_inline.py | 4 ++-- tests/test_generic.py | 6 +++++- tests/test_images.py | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index ac448c0a6..d3df959d3 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -102,8 +102,8 @@ def extract_inline_A85(stream: StreamType) -> bytes: elif len(buf) == 2: # end of buffer data += buf break - data += buf[:-1] # back by one char in case of in the middle of ~> - stream.seek(-1, 1) + data += buf[:-2] # back by one char in case of in the middle of ~> + stream.seek(-2, 1) ei = read_non_whitespace(stream) ei += stream.read(1) diff --git a/tests/test_generic.py b/tests/test_generic.py index 3ab68a569..4d12bbadd 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1359,15 +1359,19 @@ def test_array_operators(): assert len(a) == la -def test_unitary_extract_inline_buffer_empty(): +def test_unitary_extract_inline_buffer_invalid(): with pytest.raises(PdfReadError): extract_inline_AHex(BytesIO()) with pytest.raises(PdfReadError): extract_inline_AHex(BytesIO(4095 * b"00" + b" ")) + with pytest.raises(PdfReadError): + extract_inline_AHex(BytesIO(b"00")) with pytest.raises(PdfReadError): extract_inline_A85(BytesIO()) with pytest.raises(PdfReadError): extract_inline_A85(BytesIO(a85encode(b"1"))) + with pytest.raises(PdfReadError): + extract_inline_A85(BytesIO(a85encode(b"1234578" * 990))) with pytest.raises(PdfReadError): extract_inline_RL(BytesIO()) with pytest.raises(PdfReadError): diff --git a/tests/test_images.py b/tests/test_images.py index 56ceb418a..6f8a35e12 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -373,6 +373,7 @@ def test_inline_image_extraction(): img = Image.open(BytesIO(get_data_from_url(url, name=name))) for i in range(8): assert image_similarity(reader.pages[0].images[i].image, img) == 1 + reader.pages[0].images[i].image # to test acceleration of second call reader.pages[0].extract_text() url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" From a7ce07cbdc453628be0044281532ced4c793dc87 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 11 May 2024 22:52:26 +0200 Subject: [PATCH 19/42] test + fix +refactor --- pypdf/generic/_image_inline.py | 58 +++++++++++----------------------- tests/test_generic.py | 13 ++++---- 2 files changed, 26 insertions(+), 45 deletions(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index d3df959d3..ae701cec4 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -69,7 +69,7 @@ def extract_inline_AHex(stream: StreamType) -> bytes: break elif len(buf) == 2: data += buf - break + raise PdfReadError("Unexpected end of stream") else: # > nor EI found data += buf[:-2] stream.seek(-2, 1) @@ -101,7 +101,7 @@ def extract_inline_A85(stream: StreamType) -> bytes: break elif len(buf) == 2: # end of buffer data += buf - break + raise PdfReadError("Unexpected end of stream") data += buf[:-2] # back by one char in case of in the middle of ~> stream.seek(-2, 1) @@ -127,7 +127,7 @@ def extract_inline_RL(stream: StreamType) -> bytes: raise PdfReadError("Unexpected end of stream") loc = buf.find(b"\x80") if loc >= 0: # found - data = buf[: loc + 1] + data += buf[: loc + 1] stream.seek(-len(buf) + loc + 1, 1) break data += buf @@ -203,48 +203,28 @@ def extract_inline_default(stream: StreamType) -> bytes: data.write(buf) else: # Write out everything before the E. - data.write(buf[0:loc]) + data.write(buf[0 : (loc + 1)]) # Seek back in the stream to read the E next. - stream.seek(loc - len(buf), 1) + stream.seek(loc + 1 - len(buf), 1) saved_pos = stream.tell() - tok = stream.read(1) # E of "EI" # Check for End Image tok2 = stream.read(1) # I of "EI" if tok2 != b"I": - stream.seek(-1, 1) - data.write(tok) + stream.seek(saved_pos, 0) continue - # for further debug : print("!!!!",buf[loc-1:loc+10]) - info = tok + tok2 - tok3 = stream.read( - 1 - ) # possible space after "EI" may not been loaded in buf + tok3 = stream.read(1) # possible space after "EI" if tok3 not in WHITESPACES: - stream.seek(-2, 1) # to step back on I - data.write(tok) - elif buf[loc - 1 : loc] in WHITESPACES: # and tok3 in WHITESPACES: - # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required. - while tok3 in WHITESPACES: - # needed ???? : info += tok3 - tok3 = stream.read(1) - stream.seek(-1, 1) - # we do not insert EI - break - else: # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES: - # Data can contain [!\s]EI[\s], so check for Q or EMC operator is required to have 4 chars. - while tok3 in WHITESPACES: - info += tok3 - tok3 = stream.read(1) - stream.seek(-1, 1) - if tok3 == b"Q": - break - elif tok3 == b"E": - ope = stream.read(3) - stream.seek(-3, 1) - if ope == b"EMC": - break - else: - data.write(info) - stream.seek(saved_pos, 0) + stream.seek(saved_pos, 0) + continue + while tok3 in WHITESPACES: + tok3 = stream.read(1) + if buf[loc - 1 : loc] not in WHITESPACES and tok3 not in ( + b"Q", + b"E", + ): # for Q ou EMC + stream.seek(saved_pos, 0) + continue + # Data contains [\s]EI[\s](Q|EMC): 4 chars sufficient, checking Q operator not required. + break return data.getvalue() diff --git a/tests/test_generic.py b/tests/test_generic.py index 4d12bbadd..5133e162d 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1380,11 +1380,12 @@ def test_unitary_extract_inline_buffer_invalid(): extract_inline_DCT(BytesIO(b"\xFF\xD9")) -def test_unitary_extract_inline_ahx(): +def test_unitary_extract_inline(): + # AHx b = 16000 * b"00" - b += b" EI" - assert len(extract_inline_AHex(BytesIO(b))) == 16000 * 2 - b = 16000 * b"00" - b += b">" + assert len(extract_inline_AHex(BytesIO(b + b" EI"))) == len(b) with pytest.raises(PdfReadError): - extract_inline_AHex(BytesIO(b)) + extract_inline_AHex(BytesIO(b + b"> ")) + # RL + b = 8200 * b"\x00\xAB" + b"\x80" + assert len(extract_inline_RL(BytesIO(b + b" EI"))) == len(b) From d17d1927872b50012f2d23b98f36487dc03dac14 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 12 May 2024 10:39:51 +0200 Subject: [PATCH 20/42] fix regeneration of inline images --- pypdf/_page.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pypdf/_page.py b/pypdf/_page.py index b91288ffe..6173b3bb7 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -951,6 +951,8 @@ def replace_contents( # as a backup solution, we put content as an object although not in accordance with pdf ref # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content + # forces recalculation of inline_images + self.inline_images = None def merge_page( self, page2: "PageObject", expand: bool = False, over: bool = True From 6807f3c6d43c804df1519b7cc085c5d5c34758ad Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 12 May 2024 10:43:08 +0200 Subject: [PATCH 21/42] coverage --- tests/test_generic.py | 2 ++ tests/test_images.py | 54 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/tests/test_generic.py b/tests/test_generic.py index 5133e162d..23d6289b0 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1370,6 +1370,8 @@ def test_unitary_extract_inline_buffer_invalid(): extract_inline_A85(BytesIO()) with pytest.raises(PdfReadError): extract_inline_A85(BytesIO(a85encode(b"1"))) + with pytest.raises(PdfReadError): + extract_inline_A85(BytesIO(a85encode(b"1") + b"~> Q")) with pytest.raises(PdfReadError): extract_inline_A85(BytesIO(a85encode(b"1234578" * 990))) with pytest.raises(PdfReadError): diff --git a/tests/test_images.py b/tests/test_images.py index 6f8a35e12..c97d4bbb1 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -13,7 +13,7 @@ import pytest from PIL import Image, ImageChops, ImageDraw -from pypdf import PageObject, PdfReader +from pypdf import PageObject, PdfReader, PdfWriter from pypdf.generic import NameObject, NullObject from . import get_data_from_url @@ -367,14 +367,58 @@ def test_inline_image_extraction(): url = "https://github.com/mozilla/pdf.js/raw/master/test/pdfs/issue14256.pdf" name = "iss2598b.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/assets/4083478/71bc5053-cfc7-44ba-b7be-8e2333e2c749" name = "iss2598b.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) for i in range(8): - assert image_similarity(reader.pages[0].images[i].image, img) == 1 - reader.pages[0].images[i].image # to test acceleration of second call - reader.pages[0].extract_text() + assert image_similarity(writer.pages[0].images[i].image, img) == 1 + writer.pages[0].extract_text() + # check recalculation of inline images + assert writer.pages[0].inline_images is not None + writer.pages[0].merge_scaled_page(writer.pages[0], 0.25) + assert writer.pages[0].inline_images is None + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + writer.pages[0].merge_page(reader.pages[0]) + assert list(writer.pages[0].images.keys()) == [ + "/Im0", + "~0~", + "~1~", + "~2~", + "~3~", + "~4~", + "~5~", + "~6~", + "~7~", + "~8~", + "~9~", + "~10~", + "~11~", + "~12~", + "~13~", + "~14~", + "~15~", + ] + # 2nd call for acceleration test + assert list(writer.pages[0].images.keys()) == [ + "/Im0", + "~0~", + "~1~", + "~2~", + "~3~", + "~4~", + "~5~", + "~6~", + "~7~", + "~8~", + "~9~", + "~10~", + "~11~", + "~12~", + "~13~", + "~14~", + "~15~", + ] url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" name = "iss2598c.pdf" From 5d713fcedd94479761a14cc5b5d73ca3ffba6471 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 12 May 2024 11:58:08 +0200 Subject: [PATCH 22/42] coverage --- pypdf/_xobj_image_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index ba49f1179..183b28ed5 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -30,7 +30,7 @@ try: from PIL import Image, UnidentifiedImageError # noqa: F401 -except ImportError: +except ImportError: # deprecated raise ImportError( "pillow is required to do image extraction. " "It can be installed via 'pip install pypdf[image]'" From 623b7153a48cf0d12f2743492c97999c2349bcaf Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 12 May 2024 12:14:29 +0200 Subject: [PATCH 23/42] check for space after EI --- pypdf/generic/_data_structures.py | 6 ++++-- pypdf/generic/_image_inline.py | 24 ++++++++++++------------ tests/test_generic.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 31594f439..9a2671016 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -50,6 +50,7 @@ from .._protocols import PdfReaderProtocol, PdfWriterProtocol, XmpInformationProtocol from .._utils import ( + WHITESPACES, StreamType, b_, deprecate_no_replacement, @@ -1198,8 +1199,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: else: data = extract_inline_default(stream) - ei = stream.read(2) - if ei != b"EI": + ei = stream.read(3) + stream.seek(-1, 1) + if ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES: stream.seek(savpos, 0) data = extract_inline_default(stream) return {"settings": settings, "data": data} diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index ae701cec4..5804f8389 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -75,9 +75,9 @@ def extract_inline_AHex(stream: StreamType) -> bytes: stream.seek(-2, 1) ei = read_non_whitespace(stream) - ei += stream.read(1) - stream.seek(-2, 1) - if ei != b"EI": + ei += stream.read(2) + stream.seek(-3, 1) + if ei[0:2] != b"EI" or not (ei[2:3] == b"" or ei[2:3] in WHITESPACES): raise PdfReadError("EI stream not found") return data @@ -106,9 +106,9 @@ def extract_inline_A85(stream: StreamType) -> bytes: stream.seek(-2, 1) ei = read_non_whitespace(stream) - ei += stream.read(1) - stream.seek(-2, 1) - if ei != b"EI": + ei += stream.read(2) + stream.seek(-3, 1) + if ei[0:2] != b"EI" or not (ei[2:3] == b"" or ei[2:3] in WHITESPACES): raise PdfReadError("EI stream not found") return data @@ -133,9 +133,9 @@ def extract_inline_RL(stream: StreamType) -> bytes: data += buf ei = read_non_whitespace(stream) - ei += stream.read(1) - stream.seek(-2, 1) - if ei != b"EI": + ei += stream.read(2) + stream.seek(-3, 1) + if ei[0:2] != b"EI" or not (ei[2:3] == b"" or ei[2:3] in WHITESPACES): raise PdfReadError("EI stream not found") return data @@ -177,9 +177,9 @@ def extract_inline_DCT(stream: StreamType) -> bytes: # else: pass ei = read_non_whitespace(stream) - ei += stream.read(1) - stream.seek(-2, 1) - if ei != b"EI": + ei += stream.read(2) + stream.seek(-3, 1) + if ei[0:2] != b"EI" or not (ei[2:3] == b"" or ei[2:3] in WHITESPACES): raise PdfReadError("EI stream not found") return data diff --git a/tests/test_generic.py b/tests/test_generic.py index 23d6289b0..a72d2f4fb 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -16,6 +16,8 @@ ArrayObject, BooleanObject, ByteStringObject, + ContentStream, + DecodedStreamObject, Destination, DictionaryObject, Fit, @@ -1391,3 +1393,29 @@ def test_unitary_extract_inline(): # RL b = 8200 * b"\x00\xAB" + b"\x80" assert len(extract_inline_RL(BytesIO(b + b" EI"))) == len(b) + + # default + # EIDD instead of EI; using A85 + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F [/A85 /Fl]\nID +Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~> +EIDD +Q\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + with pytest.raises(PdfReadError) as exc: + co.operations + assert "EI stream not found" in exc.value.args[0] + # EIDD instead of EI; using /Fl (default extraction) + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F /Fl \nID +Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~> +EIDD +Q\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + with pytest.raises(PdfReadError) as exc: + co.operations + assert "Unexpected end of stream" in exc.value.args[0] From 0da933e3748eb4ed1aa7ca2ba4724e3cf2e4cc91 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 12 May 2024 12:44:33 +0200 Subject: [PATCH 24/42] coverage --- pypdf/_xobj_image_helpers.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 183b28ed5..d797feda0 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -31,7 +31,7 @@ try: from PIL import Image, UnidentifiedImageError # noqa: F401 except ImportError: # deprecated - raise ImportError( + raise ImportError( # deprecated "pillow is required to do image extraction. " "It can be installed via 'pip install pypdf[image]'" ) @@ -145,12 +145,6 @@ def _extended_image_frombytes( mode: str, size: Tuple[int, int], data: bytes ) -> Image.Image: try: - if mode == "2bits": - mode = "P" - data = bits2byte(data, size, 2) - elif mode == "4bits": - mode = "P" - data = bits2byte(data, size, 4) img = Image.frombytes(mode, size, data) except ValueError as exc: nb_pix = size[0] * size[1] From 422eb180f1503568c4c5a88753eea4dc50b86149 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 12 May 2024 13:04:49 +0200 Subject: [PATCH 25/42] coverage --- pypdf/_page.py | 4 ++-- tests/test_generic.py | 11 +++++++++++ tests/test_images.py | 20 -------------------- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 6173b3bb7..dcaf8adbc 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -457,8 +457,8 @@ def _get_ids_image( lst.append(o if len(ancest) == 0 else ancest + [o]) else: # is a form with possible images inside lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) - if self.inline_images is not None: - lst.extend(list(self.inline_images.keys())) + assert self.inline_images is not None + lst.extend(list(self.inline_images.keys())) return lst def _get_image( diff --git a/tests/test_generic.py b/tests/test_generic.py index a72d2f4fb..3ae434e1e 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1419,3 +1419,14 @@ def test_unitary_extract_inline(): with pytest.raises(PdfReadError) as exc: co.operations assert "Unexpected end of stream" in exc.value.args[0] + + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 16 /H 16 /BPC 8 /CS /RGB /F /Fl \nID +Gar8O(o6*is8QV#;;JAuTq2lQ8J;%6#\'d5b"Q[+ZD?\'\\+CGj9~>EI +BT\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + with pytest.raises(PdfReadError) as exc: + co.operations + assert "Unexpected end of stream" in exc.value.args[0] diff --git a/tests/test_images.py b/tests/test_images.py index c97d4bbb1..dbd3f9109 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -399,26 +399,6 @@ def test_inline_image_extraction(): "~14~", "~15~", ] - # 2nd call for acceleration test - assert list(writer.pages[0].images.keys()) == [ - "/Im0", - "~0~", - "~1~", - "~2~", - "~3~", - "~4~", - "~5~", - "~6~", - "~7~", - "~8~", - "~9~", - "~10~", - "~11~", - "~12~", - "~13~", - "~14~", - "~15~", - ] url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" name = "iss2598c.pdf" From b79164ed75ab2e085fef3eceeaa2d4d9d61e20a1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 12 May 2024 14:50:01 +0200 Subject: [PATCH 26/42] test / fix /refactoring --- pypdf/generic/_data_structures.py | 17 +++++++++++------ pypdf/generic/_image_inline.py | 2 +- tests/test_generic.py | 18 ++++++++++++++++++ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 9a2671016..89f972c3e 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1176,14 +1176,20 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: data = extract_inline_DCT(stream) elif filtr == "not set": cs = settings.get("/CS", "") - if cs == "/I" or cs == "/G" or cs == "/Indexed" or cs == "/DeviceGray": - lcs = 1 - elif "RGB" in cs: + if "RGB" in cs: lcs = 3 elif "CMYK" in cs: lcs = 4 else: - bits = settings.get("/BPC", -1) + bits = settings.get( + "/BPC", + 8 + if cs == "/I" + or cs == "/G" + or cs == "/Indexed" + or cs == "/DeviceGray" + else -1, + ) if bits > 0: lcs = bits / 8.0 else: @@ -1194,8 +1200,7 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) ) ei = read_non_whitespace(stream) - ei += stream.read(1) - stream.seek(-2, 1) + stream.seek(-1, 1) else: data = extract_inline_default(stream) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 5804f8389..56f5de72a 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -203,7 +203,7 @@ def extract_inline_default(stream: StreamType) -> bytes: data.write(buf) else: # Write out everything before the E. - data.write(buf[0 : (loc + 1)]) + data.write(buf[0:loc]) # Seek back in the stream to read the E next. stream.seek(loc + 1 - len(buf), 1) diff --git a/tests/test_generic.py b/tests/test_generic.py index 3ae434e1e..12d64c9f5 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1430,3 +1430,21 @@ def test_unitary_extract_inline(): with pytest.raises(PdfReadError) as exc: co.operations assert "Unexpected end of stream" in exc.value.args[0] + + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 4 /H 4 /CS /G \nID +abcdefghijklmnopEI +Q\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + assert co.operations[7][0]["data"] == b"abcdefghijklmnop" + + b = b"""1 0 0 1 0 0 cm BT /F1 12 Tf 14.4 TL ET\nq 100 0 0 100 100 100 cm +BI\n/W 4 /H 4 \nID +abcdefghijklmnopEI +Q\nQ\nBT 1 0 0 1 200 100 Tm (Test) Tj T* ET\n \n""" + ec = DecodedStreamObject() + ec.set_data(b) + co = ContentStream(ec, None) + assert co.operations[7][0]["data"] == b"abcdefghijklmnop" From 66f858cc5256f03501f5ef82d12f6e8db85ebe4a Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 14 May 2024 22:13:17 +0200 Subject: [PATCH 27/42] fix --- pypdf/generic/_image_inline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 56f5de72a..1c1bde079 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -203,7 +203,7 @@ def extract_inline_default(stream: StreamType) -> bytes: data.write(buf) else: # Write out everything before the E. - data.write(buf[0:loc]) + data.write(buf[0 : loc + 1]) # Seek back in the stream to read the E next. stream.seek(loc + 1 - len(buf), 1) @@ -227,4 +227,5 @@ def extract_inline_default(stream: StreamType) -> bytes: continue # Data contains [\s]EI[\s](Q|EMC): 4 chars sufficient, checking Q operator not required. break + return data.getvalue() From ee637c02e611d71fe18aaf15fbe9c14a02c126e8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 14 May 2024 22:36:44 +0200 Subject: [PATCH 28/42] fix2 --- pypdf/generic/_image_inline.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 1c1bde079..579514692 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -202,9 +202,9 @@ def extract_inline_default(stream: StreamType) -> bytes: if loc == -1: data.write(buf) else: - # Write out everything before the E. + # Write out everything including E (the one from EI to be removed). data.write(buf[0 : loc + 1]) - + dataposE = data.tell() - 1 # Seek back in the stream to read the E next. stream.seek(loc + 1 - len(buf), 1) saved_pos = stream.tell() @@ -225,7 +225,9 @@ def extract_inline_default(stream: StreamType) -> bytes: ): # for Q ou EMC stream.seek(saved_pos, 0) continue - # Data contains [\s]EI[\s](Q|EMC): 4 chars sufficient, checking Q operator not required. + # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficients + # remove E(I) wrongly inserted earlier + data.truncate(dataposE) break return data.getvalue() From 2874e567e98a40e0c95009025b0d54ff64f94cf8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:36:04 +0200 Subject: [PATCH 29/42] Update pypdf/_page.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index cdd541502..1c1976b3e 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -571,7 +571,7 @@ def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject: ) except (TypeError, KeyError): if isinstance(v, NameObject): - # it is a custom name : we have to look in resources : + # It is a custom name, thus we have to look in resources. # the only applicable case is for ColorSpace try: res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"] From 81e1f30837f7fe7525a9f7573f03dbd931ed03f9 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:36:17 +0200 Subject: [PATCH 30/42] Update pypdf/_page.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 1c1976b3e..ddd47b017 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -572,7 +572,7 @@ def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject: except (TypeError, KeyError): if isinstance(v, NameObject): # It is a custom name, thus we have to look in resources. - # the only applicable case is for ColorSpace + # The only applicable case is for ColorSpace. try: res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"] v = cast(DictionaryObject, res)[v] From 90fe4598787cf35fd972a9380135504d4d19f8cc Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:36:26 +0200 Subject: [PATCH 31/42] Update pypdf/_page.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index ddd47b017..a897434e0 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -577,7 +577,7 @@ def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject: res = cast(DictionaryObject, self["/Resources"])["/ColorSpace"] v = cast(DictionaryObject, res)[v] except KeyError: # for res and v - raise PdfReadError(f"Can not find resource entry {v} for {k}") + raise PdfReadError(f"Cannot find resource entry {v} for {k}") return v def _get_inline_images(self) -> Dict[str, ImageFile]: From 54e4c1d7c30bd387fc51a0121e3eb3a2e6dcd798 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:37:05 +0200 Subject: [PATCH 32/42] Update pypdf/_page.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index a897434e0..50d030250 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -622,7 +622,7 @@ def _get_inline_images(self) -> Dict[str, ImageFile]: "/Length": len(ii["__streamdata__"]), } for k, v in ii["settings"].items(): - if k in ("/Length", "/L"): # no length is expected + if k in {"/Length", "/L"}: # no length is expected continue if isinstance(v, list): v = ArrayObject( From d9841ddc40b1d1ab438ae474abec9d9351de1119 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:39:07 +0200 Subject: [PATCH 33/42] Update pypdf/generic/_data_structures.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/generic/_data_structures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 4e5b0ad0f..4365c4929 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1162,7 +1162,6 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: assert tmp[:2] == b"ID" filtr = settings.get("/F", settings.get("/Filter", "not set")) savpos = stream.tell() - # import pdb;pdb.set_trace() # print("inline", stream.tell(),filtr,"*",settings) if isinstance(filtr, list): filtr = filtr[0] # used forencoding From ecdba022214ef425cde8357adf10009365b17500 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:39:23 +0200 Subject: [PATCH 34/42] Update pypdf/generic/_data_structures.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/generic/_data_structures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 4365c4929..e3de2259b 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1162,7 +1162,6 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: assert tmp[:2] == b"ID" filtr = settings.get("/F", settings.get("/Filter", "not set")) savpos = stream.tell() - # print("inline", stream.tell(),filtr,"*",settings) if isinstance(filtr, list): filtr = filtr[0] # used forencoding if "AHx" in filtr or "ASCIIHexDecode" in filtr: From ae9fdfc2105c2421ebe639c0e6804c1591da5780 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 10:58:37 +0200 Subject: [PATCH 35/42] update from comments --- pypdf/_xobj_image_helpers.py | 4 +- pypdf/filters.py | 15 ---- pypdf/generic/_data_structures.py | 7 +- pypdf/generic/_image_inline.py | 136 +++++++++++++++--------------- 4 files changed, 72 insertions(+), 90 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index d797feda0..33905d850 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -30,8 +30,8 @@ try: from PIL import Image, UnidentifiedImageError # noqa: F401 -except ImportError: # deprecated - raise ImportError( # deprecated +except ImportError: # pragma: no cover + raise ImportError( # pragma: no cover "pillow is required to do image extraction. " "It can be installed via 'pip install pypdf[image]'" ) diff --git a/pypdf/filters.py b/pypdf/filters.py index 39a7f8e3d..069a3d023 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -504,21 +504,6 @@ def decode( data = data.strip(WHITESPACES_AS_BYTES) return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES) - """@staticmethod - def encode(data: bytes, level: int = -1) -> bytes: - ''' - Compress the input data using A85 encoding in Adobe format. - - Args: - data: The data to be compressed. - level: See https://docs.python.org/3/library/zlib.html#zlib.compress - - Returns: - The compressed data. - ''' - return a85encode(data, adobe=True, wrapcol=32) - """ - class DCTDecode: @staticmethod diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index e3de2259b..f983fc625 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1181,12 +1181,7 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: else: bits = settings.get( "/BPC", - 8 - if cs == "/I" - or cs == "/G" - or cs == "/Indexed" - or cs == "/DeviceGray" - else -1, + 8 if cs in {"/I", "/G", "/Indexed", "/DeviceGray"} else -1, ) if bits > 0: lcs = bits / 8.0 diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 579514692..14c302902 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -45,41 +45,41 @@ def extract_inline_AHex(stream: StreamType) -> bytes: Extract HexEncoded Stream from Inline Image. the stream will be moved onto the EI """ - data: bytes = b"" + data_out: bytes = b"" # Read data until delimiter > and EI as backup # ignoring backup. while True: - buf = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) - if not buf: + data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) + if not data_buffered: raise PdfReadError("Unexpected end of stream") - loc = buf.find(b">") - if loc >= 0: # found > - data += buf[: (loc + 1)] - stream.seek(-len(buf) + loc + 1, 1) + pos_tok = data_buffered.find(b">") + if pos_tok >= 0: # found > + data_out += data_buffered[: (pos_tok + 1)] + stream.seek(-len(data_buffered) + pos_tok + 1, 1) break - loc = buf.find(b"EI") - if loc >= 0: # found EI - stream.seek(-len(buf) + loc - 1, 1) + pos_ei = data_buffered.find(b"EI") + if pos_ei >= 0: # found EI + stream.seek(-len(data_buffered) + pos_ei - 1, 1) c = stream.read(1) while c in WHITESPACES: stream.seek(-2, 1) c = stream.read(1) - loc -= 1 - data += buf[:loc] + pos_ei -= 1 + data_out += data_buffered[:pos_ei] break - elif len(buf) == 2: - data += buf + elif len(data_buffered) == 2: + data_out += data_buffered raise PdfReadError("Unexpected end of stream") else: # > nor EI found - data += buf[:-2] + data_out += data_buffered[:-2] stream.seek(-2, 1) - ei = read_non_whitespace(stream) - ei += stream.read(2) + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) stream.seek(-3, 1) - if ei[0:2] != b"EI" or not (ei[2:3] == b"" or ei[2:3] in WHITESPACES): + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): raise PdfReadError("EI stream not found") - return data + return data_out def extract_inline_A85(stream: StreamType) -> bytes: @@ -87,30 +87,32 @@ def extract_inline_A85(stream: StreamType) -> bytes: Extract A85 Stream from Inline Image. the stream will be moved onto the EI """ - data: bytes = b"" + data_out: bytes = b"" # Read data up to delimiter ~> # see §3.3.2 from PDF ref 1.7 while True: - buf = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) - if not buf: + data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) + if not data_buffered: raise PdfReadError("Unexpected end of stream") - loc = buf.find(b"~>") - if loc >= 0: # found! - data += buf[: loc + 2] - stream.seek(-len(buf) + loc + 2, 1) + pos_tok = data_buffered.find(b"~>") + if pos_tok >= 0: # found! + data_out += data_buffered[: pos_tok + 2] + stream.seek(-len(data_buffered) + pos_tok + 2, 1) break - elif len(buf) == 2: # end of buffer - data += buf + elif len(data_buffered) == 2: # end of buffer + data_out += data_buffered raise PdfReadError("Unexpected end of stream") - data += buf[:-2] # back by one char in case of in the middle of ~> + data_out += data_buffered[ + :-2 + ] # back by one char in case of in the middle of ~> stream.seek(-2, 1) - ei = read_non_whitespace(stream) - ei += stream.read(2) + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) stream.seek(-3, 1) - if ei[0:2] != b"EI" or not (ei[2:3] == b"" or ei[2:3] in WHITESPACES): + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): raise PdfReadError("EI stream not found") - return data + return data_out def extract_inline_RL(stream: StreamType) -> bytes: @@ -118,26 +120,26 @@ def extract_inline_RL(stream: StreamType) -> bytes: Extract RL Stream from Inline Image. the stream will be moved onto the EI """ - data: bytes = b"" + data_out: bytes = b"" # Read data up to delimiter ~> # see §3.3.4 from PDF ref 1.7 while True: - buf = stream.read(BUFFER_SIZE) - if not buf: + data_buffered = stream.read(BUFFER_SIZE) + if not data_buffered: raise PdfReadError("Unexpected end of stream") - loc = buf.find(b"\x80") - if loc >= 0: # found - data += buf[: loc + 1] - stream.seek(-len(buf) + loc + 1, 1) + pos_tok = data_buffered.find(b"\x80") + if pos_tok >= 0: # found + data_out += data_buffered[: pos_tok + 1] + stream.seek(-len(data_buffered) + pos_tok + 1, 1) break - data += buf + data_out += data_buffered - ei = read_non_whitespace(stream) - ei += stream.read(2) + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) stream.seek(-3, 1) - if ei[0:2] != b"EI" or not (ei[2:3] == b"" or ei[2:3] in WHITESPACES): + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): raise PdfReadError("EI stream not found") - return data + return data_out def extract_inline_DCT(stream: StreamType) -> bytes: @@ -145,20 +147,20 @@ def extract_inline_DCT(stream: StreamType) -> bytes: Extract DCT (JPEG) Stream from Inline Image. the stream will be moved onto the EI """ - data: bytes = b"" + data_out: bytes = b"" # Read Blocks of data (ID/Size/data) up to ID=FF/D9 # see https://www.digicamsoft.com/itu/itu-t81-36.html notfirst = False while True: c = stream.read(1) if notfirst or (c == b"\xff"): - data += c + data_out += c if c != b"\xff": continue else: notfirst = True c = stream.read(1) - data += c + data_out += c if c == b"\xff": stream.seek(-1, 1) # pragma: no cover elif c == b"\x00": # stuffing @@ -171,17 +173,17 @@ def extract_inline_DCT(stream: StreamType) -> bytes: b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe" ): c = stream.read(2) - data += c + data_out += c sz = c[0] * 256 + c[1] - data += stream.read(sz - 2) + data_out += stream.read(sz - 2) # else: pass - ei = read_non_whitespace(stream) - ei += stream.read(2) + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) stream.seek(-3, 1) - if ei[0:2] != b"EI" or not (ei[2:3] == b"" or ei[2:3] in WHITESPACES): + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): raise PdfReadError("EI stream not found") - return data + return data_out def extract_inline_default(stream: StreamType) -> bytes: @@ -189,24 +191,24 @@ def extract_inline_default(stream: StreamType) -> bytes: Legacy method used by default """ - data = BytesIO() + stream_out = BytesIO() # Read the inline image, while checking for EI (End Image) operator. while True: - buf = stream.read(BUFFER_SIZE) - if not buf: + data_buffered = stream.read(BUFFER_SIZE) + if not data_buffered: raise PdfReadError("Unexpected end of stream") - loc = buf.find( + pos_ei = data_buffered.find( b"E" ) # we can not look straight for "EI" because it may not have been loaded in the buffer - if loc == -1: - data.write(buf) + if pos_ei == -1: + stream_out.write(data_buffered) else: # Write out everything including E (the one from EI to be removed). - data.write(buf[0 : loc + 1]) - dataposE = data.tell() - 1 + stream_out.write(data_buffered[0 : pos_ei + 1]) + sav_pos_ei = stream_out.tell() - 1 # Seek back in the stream to read the E next. - stream.seek(loc + 1 - len(buf), 1) + stream.seek(pos_ei + 1 - len(data_buffered), 1) saved_pos = stream.tell() # Check for End Image tok2 = stream.read(1) # I of "EI" @@ -219,15 +221,15 @@ def extract_inline_default(stream: StreamType) -> bytes: continue while tok3 in WHITESPACES: tok3 = stream.read(1) - if buf[loc - 1 : loc] not in WHITESPACES and tok3 not in ( + if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in { b"Q", b"E", - ): # for Q ou EMC + }: # for Q ou EMC stream.seek(saved_pos, 0) continue # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficients # remove E(I) wrongly inserted earlier - data.truncate(dataposE) + stream_out.truncate(sav_pos_ei) break - return data.getvalue() + return stream_out.getvalue() From bcabdc8fc70c0076e0a7871ac7d91f14dca6eeff Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 18:10:50 +0200 Subject: [PATCH 36/42] Update _data_structures.py --- pypdf/generic/_data_structures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index f983fc625..1688d5d5c 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -84,7 +84,7 @@ from ._fit import Fit from ._image_inline import ( extract_inline_A85, - extract_inline_AHex, + extract_inline_AHx, extract_inline_DCT, extract_inline_default, extract_inline_RL, @@ -1165,7 +1165,7 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: if isinstance(filtr, list): filtr = filtr[0] # used forencoding if "AHx" in filtr or "ASCIIHexDecode" in filtr: - data = extract_inline_AHex(stream) + data = extract_inline_AHx(stream) elif "A85" in filtr or "ASCII85Decode" in filtr: data = extract_inline_A85(stream) elif "RL" in filtr or "RunLengthDecode" in filtr: From dc045b6fda3bf0a5e79990e032a7e80935293835 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 18:11:26 +0200 Subject: [PATCH 37/42] Update _image_inline.py --- pypdf/generic/_image_inline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 14c302902..776f69660 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -40,7 +40,7 @@ BUFFER_SIZE = 8192 -def extract_inline_AHex(stream: StreamType) -> bytes: +def extract_inline_AHx(stream: StreamType) -> bytes: """ Extract HexEncoded Stream from Inline Image. the stream will be moved onto the EI From 9c03aa77855b04d32eca5af19827c8763448051d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 20 May 2024 18:29:14 +0200 Subject: [PATCH 38/42] Update test_generic.py --- tests/test_generic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_generic.py b/tests/test_generic.py index 12d64c9f5..f59c559e0 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -40,7 +40,7 @@ ) from pypdf.generic._image_inline import ( extract_inline_A85, - extract_inline_AHex, + extract_inline_AHx, extract_inline_DCT, extract_inline_RL, ) @@ -1363,11 +1363,11 @@ def test_array_operators(): def test_unitary_extract_inline_buffer_invalid(): with pytest.raises(PdfReadError): - extract_inline_AHex(BytesIO()) + extract_inline_AHx(BytesIO()) with pytest.raises(PdfReadError): - extract_inline_AHex(BytesIO(4095 * b"00" + b" ")) + extract_inline_AHx(BytesIO(4095 * b"00" + b" ")) with pytest.raises(PdfReadError): - extract_inline_AHex(BytesIO(b"00")) + extract_inline_AHx(BytesIO(b"00")) with pytest.raises(PdfReadError): extract_inline_A85(BytesIO()) with pytest.raises(PdfReadError): @@ -1387,9 +1387,9 @@ def test_unitary_extract_inline_buffer_invalid(): def test_unitary_extract_inline(): # AHx b = 16000 * b"00" - assert len(extract_inline_AHex(BytesIO(b + b" EI"))) == len(b) + assert len(extract_inline_AHx(BytesIO(b + b" EI"))) == len(b) with pytest.raises(PdfReadError): - extract_inline_AHex(BytesIO(b + b"> ")) + extract_inline_AHx(BytesIO(b + b"> ")) # RL b = 8200 * b"\x00\xAB" + b"\x80" assert len(extract_inline_RL(BytesIO(b + b" EI"))) == len(b) From a56959899c01870a0a86f962e5b3d0a9b6e49cf0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 26 May 2024 23:13:48 +0200 Subject: [PATCH 39/42] Update test_workflows.py --- tests/test_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 8e3fd6e42..93bc0c9e5 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1024,7 +1024,7 @@ def test_inline_images(): reader.pages[2]._get_image(("test",)) url = "https://github.com/py-pdf/pypdf/files/15233597/bug1065245.pdf" - name = "iss2598c.pdf" # test coming from another test in test_image.py + name = "iss2598c.pdf" # test data also used in test_images.py/test_inline_image_extraction() reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert len(reader.pages[0].images) == 3 From a52541e23b230dc5b77e8b53fb618e6b6ecd540c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 26 May 2024 23:14:12 +0200 Subject: [PATCH 40/42] Update _image_inline.py --- pypdf/generic/_image_inline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 776f69660..f6c48e883 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, PubPub-ZZ +# Copyright (c) 2024, pypdf contributors # All rights reserved. # # Redistribution and use in source and binary forms, with or without From cfe61a91ad19bba4d2a4fc06997cd1480f8904f4 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 26 May 2024 23:59:46 +0200 Subject: [PATCH 41/42] Update _image_inline.py --- pypdf/generic/_image_inline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index f6c48e883..41826ac31 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, pypdf contributors +# Copyright (c) 2024, pypdf contributors # All rights reserved. # # Redistribution and use in source and binary forms, with or without From 7be1fd6c7f3b328cf23da32ac0059a584e368279 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 27 May 2024 08:40:17 +0200 Subject: [PATCH 42/42] remove coverage ignore on PIL import --- pypdf/_xobj_image_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 33905d850..45b0c145b 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -30,8 +30,8 @@ try: from PIL import Image, UnidentifiedImageError # noqa: F401 -except ImportError: # pragma: no cover - raise ImportError( # pragma: no cover +except ImportError: + raise ImportError( "pillow is required to do image extraction. " "It can be installed via 'pip install pypdf[image]'" )