Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 52 additions & 23 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,13 @@ def parse_to_unicode(
return {}, space_code, []
process_rg: bool = False
process_char: bool = False
multiline_rg: Union[
None, Tuple[int, int]
] = None  # tuple = (current char code, last char code of the range); cf. #1285 for an example file
cm = prepare_cm(ft)
for l in cm.split(b"\n"):
process_rg, process_char = process_cm_line(
l.strip(b" "), process_rg, process_char, map_dict, int_entry
process_rg, process_char, multiline_rg = process_cm_line(
l.strip(b" "), process_rg, process_char, multiline_rg, map_dict, int_entry
)

for a, value in map_dict.items():
Expand Down Expand Up @@ -228,11 +231,12 @@ def process_cm_line(
l: bytes,
process_rg: bool,
process_char: bool,
multiline_rg: Union[None, Tuple[int, int]],
map_dict: Dict[Any, Any],
int_entry: List[int],
) -> Tuple[bool, bool]:
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
if l in (b"", b" ") or l[0] == 37: # 37 = %
return process_rg, process_char
return process_rg, process_char, multiline_rg
if b"beginbfrange" in l:
process_rg = True
elif b"endbfrange" in l:
Expand All @@ -242,22 +246,29 @@ def process_cm_line(
elif b"endbfchar" in l:
process_char = False
elif process_rg:
parse_bfrange(l, map_dict, int_entry)
multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg)
elif process_char:
parse_bfchar(l, map_dict, int_entry)
return process_rg, process_char
return process_rg, process_char, multiline_rg


def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
def parse_bfrange(
l: bytes,
map_dict: Dict[Any, Any],
int_entry: List[int],
multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
lst = [x for x in l.split(b" ") if x]
a = int(lst[0], 16)
b = int(lst[1], 16)
closure_found = False
nbi = len(lst[0])
map_dict[-1] = nbi // 2
fmt = b"%%0%dX" % nbi
if lst[2] == b"[":
for sq in lst[3:]:
if multiline_rg is not None:
a = multiline_rg[0] # a, b not in the current line
b = multiline_rg[1]
for sq in lst[1:]:
if sq == b"]":
closure_found = True
break
map_dict[
unhexlify(fmt % a).decode(
Expand All @@ -268,18 +279,36 @@ def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> N
int_entry.append(a)
a += 1
else:
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
c += 1
a = int(lst[0], 16)
b = int(lst[1], 16)
if lst[2] == b"[":
for sq in lst[3:]:
if sq == b"]":
closure_found = True
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else: # case without list
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
closure_found = True
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
c += 1
return None if closure_found else (a, b)


def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
Expand Down
14 changes: 14 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,20 @@ def test_get_font_width_from_default(): # L40
page.extract_text()


def test_multiline_bfrange():
    """Non-regression test for iss_1285 (multiline bfrange)."""
    # (download URL, name handed to get_pdf_from_url) for each sample document
    samples = (
        (
            "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf",
            "tika-908104.pdf",
        ),
        (
            "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf",
            "Giacalone.pdf",
        ),
    )
    for pdf_url, file_name in samples:
        pdf_bytes = get_pdf_from_url(pdf_url, name=file_name)
        pdf_reader = PdfReader(BytesIO(pdf_bytes))
        # No assertion on the text itself: the test passes as long as
        # extraction of every page completes without raising.
        for pdf_page in pdf_reader.pages:
            pdf_page.extract_text()


def test_bfchar_on_2_chars():
# iss #1293
url = "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf"
Expand Down