From 0b589cff93dd29c1c7b8d3803c4deb6a8683a463 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 10:35:08 +0200 Subject: [PATCH 1/3] ROB : Multi-line entries in bfrange(cmap) Fixes #1285 --- PyPDF2/_cmap.py | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index afce26088..2863c03d3 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -180,10 +180,13 @@ def parse_to_unicode( return {}, space_code, [] process_rg: bool = False process_char: bool = False + multiline_rg: Union[ + None, Tuple[int, int] + ] = None # tuple = (current_char, remaining size) ; cf #1285 for example of file cm = prepare_cm(ft) for l in cm.split(b"\n"): - process_rg, process_char = process_cm_line( - l.strip(b" "), process_rg, process_char, map_dict, int_entry + process_rg, process_char, multiline_rg = process_cm_line( + l.strip(b" "), process_rg, process_char, multiline_rg, map_dict, int_entry ) for a, value in map_dict.items(): @@ -228,11 +231,12 @@ def process_cm_line( l: bytes, process_rg: bool, process_char: bool, + multiline_rg: Union[None, Tuple[int, int]], map_dict: Dict[Any, Any], int_entry: List[int], -) -> Tuple[bool, bool]: +) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]: if l in (b"", b" ") or l[0] == 37: # 37 = % - return process_rg, process_char + return process_rg, process_char, multiline_rg if b"beginbfrange" in l: process_rg = True elif b"endbfrange" in l: @@ -242,22 +246,44 @@ def process_cm_line( elif b"endbfchar" in l: process_char = False elif process_rg: - parse_bfrange(l, map_dict, int_entry) + multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg) elif process_char: parse_bfchar(l, map_dict, int_entry) - return process_rg, process_char + return process_rg, process_char, multiline_rg -def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None: +def parse_bfrange( + l: bytes, + map_dict: Dict[Any, Any], + int_entry: List[int], + multiline_rg: Union[None, Tuple[int, int]], +) -> Union[None, Tuple[int, int]]: lst = [x for x in l.split(b" ") if x] a = int(lst[0], 16) b = int(lst[1], 16) nbi = len(lst[0]) + closure_found = False map_dict[-1] = nbi // 2 fmt = b"%%0%dX" % nbi - if lst[2] == b"[": + if multiline_rg is not None: + a = multiline_rg[0] # a, b not in the current line + b = multiline_rg[1] + for sq in lst[1:]: + if sq == b"]": + closure_found = True + break + map_dict[ + unhexlify(fmt % a).decode( + "charmap" if map_dict[-1] == 1 else "utf-16-be", + "surrogatepass", + ) + ] = unhexlify(sq).decode("utf-16-be", "surrogatepass") + int_entry.append(a) + a += 1 + elif lst[2] == b"[": for sq in lst[3:]: if sq == b"]": + closure_found = True break map_dict[ unhexlify(fmt % a).decode( @@ -267,9 +293,10 @@ def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> N ] = unhexlify(sq).decode("utf-16-be", "surrogatepass") int_entry.append(a) a += 1 - else: + else: # case without list c = int(lst[2], 16) fmt2 = b"%%0%dX" % max(4, len(lst[2])) + closure_found = True while a <= b: map_dict[ unhexlify(fmt % a).decode( @@ -280,6 +307,7 @@ def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> N int_entry.append(a) a += 1 c += 1 + return None if closure_found else (a, b) def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None: From 91fc95e38ec08ad4763bdfe355c075d58c8d13dd Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 10:42:20 +0200 Subject: [PATCH 2/3] add test --- tests/test_cmap.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 8aa151436..3d991c115 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -46,3 +46,12 @@ def test_get_font_width_from_default(): # L40 reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) for page in reader.pages: page.extract_text() + + +def test_multiline_bfrange(): + # non regression test for iss_1285 + url = "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf" + name = "tika-908104.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + for page in reader.pages: + page.extract_text() From f3aa9031ef66d066f79b8cae21de6e6d36fb08d4 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 29 Aug 2022 20:46:40 +0200 Subject: [PATCH 3/3] ROB : ending list with only one item on the line fixes #1274 --- PyPDF2/_cmap.py | 61 +++++++++++++++++++++++----------------------- tests/test_cmap.py | 5 ++++ 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index 2863c03d3..13dc9a906 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -259,10 +259,8 @@ def parse_bfrange( multiline_rg: Union[None, Tuple[int, int]], ) -> Union[None, Tuple[int, int]]: lst = [x for x in l.split(b" ") if x] - a = int(lst[0], 16) - b = int(lst[1], 16) - nbi = len(lst[0]) closure_found = False + nbi = len(lst[0]) map_dict[-1] = nbi // 2 fmt = b"%%0%dX" % nbi if multiline_rg is not None: @@ -280,33 +278,36 @@ def parse_bfrange( ] = unhexlify(sq).decode("utf-16-be", "surrogatepass") int_entry.append(a) a += 1 - elif lst[2] == b"[": - for sq in lst[3:]: - if sq == b"]": - closure_found = True - break - map_dict[ - unhexlify(fmt % a).decode( - "charmap" if map_dict[-1] == 1 else "utf-16-be", - "surrogatepass", - ) - ] = unhexlify(sq).decode("utf-16-be", "surrogatepass") - int_entry.append(a) - a += 1 - else: # case without list - c = int(lst[2], 16) - fmt2 = b"%%0%dX" % max(4, len(lst[2])) - closure_found = True - while a <= b: - map_dict[ - unhexlify(fmt % a).decode( - "charmap" if map_dict[-1] == 1 else "utf-16-be", - "surrogatepass", - ) - ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass") - int_entry.append(a) - a += 1 - c += 1 + else: + a = int(lst[0], 16) + b = int(lst[1], 16) + if lst[2] == b"[": + for sq in lst[3:]: + if sq == b"]": + closure_found = True + break + map_dict[ + unhexlify(fmt % a).decode( + "charmap" if map_dict[-1] == 1 else "utf-16-be", + "surrogatepass", + ) + ] = unhexlify(sq).decode("utf-16-be", "surrogatepass") + int_entry.append(a) + a += 1 + else: # case without list + c = int(lst[2], 16) + fmt2 = b"%%0%dX" % max(4, len(lst[2])) + closure_found = True + while a <= b: + map_dict[ + unhexlify(fmt % a).decode( + "charmap" if map_dict[-1] == 1 else "utf-16-be", + "surrogatepass", + ) + ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass") + int_entry.append(a) + a += 1 + c += 1 return None if closure_found else (a, b) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 3d991c115..4a8053669 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -55,3 +55,8 @@ def test_multiline_bfrange(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) for page in reader.pages: page.extract_text() + url = "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf" + name = "Giacalone.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + for page in reader.pages: + page.extract_text()