Skip to content

Commit

Permalink
BUG: Reading large compressed images takes huge time to process (#2644)
Browse files Browse the repository at this point in the history
Added buffered reading for zlib decompression
  • Loading branch information
snanda85 authored May 14, 2024
1 parent 6226d66 commit c227b0c
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 8 deletions.
21 changes: 13 additions & 8 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,19 @@ def decompress(data: bytes) -> bytes:
def decompress(data: bytes) -> bytes:
    """
    Decompress zlib-compressed data, tolerating damaged streams.

    Three strategies are tried in order, each one only if the previous
    raised ``zlib.error``:

    1. A plain one-shot ``zlib.decompress``.
    2. A decompression object fed the whole input at once. This is fast
       even for large inputs and is expected to return the decodable part
       of a stream that the one-shot call rejects (e.g. a truncated one).
    3. A decompression object with automatic zlib/gzip header detection,
       fed one byte at a time while ignoring errors, so that any output
       produced before a corrupt byte is still salvaged.

    Args:
        data: The compressed input.

    Returns:
        The decompressed data. For damaged input this may be only a
        prefix of the original data, or empty if nothing could be decoded.
    """
    try:
        return zlib.decompress(data)
    except zlib.error:
        pass

    try:
        # For larger files, use a decompression object to enable buffered
        # reading before resorting to the slow byte-wise fallback.
        return zlib.decompressobj().decompress(data)
    except zlib.error:
        pass

    # Last resort. ``MAX_WBITS | 32`` does not enlarge the window; it makes
    # zlib auto-detect a zlib or gzip header.
    decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
    # Collect the pieces and join once: repeated ``bytes +=`` is quadratic.
    chunks = []
    for i in range(len(data)):
        try:
            chunks.append(decompressor.decompress(data[i : i + 1]))
        except zlib.error:
            # Deliberate best effort: keep what was decoded so far and
            # continue with the next byte.
            pass
    return b"".join(chunks)


class FlateDecode:
Expand Down
12 changes: 12 additions & 0 deletions tests/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,3 +227,15 @@ def test_image_new_property_performance(benchmark):
data = BytesIO(get_data_from_url(url, name=name))

benchmark(image_new_property, data)


def image_extraction(data):
    """Open *data* with ``PdfReader`` and iterate over all images of the first page."""
    first_page = PdfReader(data).pages[0]
    # Build a list so that every entry of ``images`` is actually visited.
    list(first_page.images)


@pytest.mark.enable_socket()
def test_large_compressed_image_performance(benchmark):
    """Benchmark ``image_extraction`` on a PDF that holds a large compressed image."""
    name = "file_with_large_compressed_image.pdf"
    url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf"
    pdf_bytes = get_data_from_url(url, name=name)
    benchmark(image_extraction, BytesIO(pdf_bytes))
8 changes: 8 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,3 +346,11 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr):
print(fn) # noqa: T201
img = Image.open(BytesIO(zf.read(fn)))
assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99


@pytest.mark.enable_socket()
@pytest.mark.timeout(30)
def test_large_compressed_image():
    """
    Iterate over the first-page images of a PDF with a large compressed image.

    There is no explicit assertion: the test passes when the iteration
    completes without raising inside the 30 second timeout set above.
    """
    name = "file_with_large_compressed_image.pdf"
    url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf"
    pdf_stream = BytesIO(get_data_from_url(url, name=name))
    first_page = PdfReader(pdf_stream).pages[0]
    list(first_page.images)

0 comments on commit c227b0c

Please sign in to comment.