From 54fbcd7f89f97ac091493b3211e091d696fffe13 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 21 Jun 2024 21:04:44 +0200 Subject: [PATCH 1/2] ENH: accepts ETen-B5 and UniCNS-UTF16 encodings closes #2356 --- pypdf/_cmap.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index c77a849bc..9ff3c62d3 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -121,10 +121,15 @@ def build_char_map_from_dict( "/GBK-EUC-V": "gbk", # TBC "/GBK2K-H": "gb18030", "/GBK2K-V": "gb18030", + "/ETen-B5-H": "cp950", + "/ETen-B5-V": "cp950", + "/ETenms-B5-H": "cp950", + "/ETenms-B5-V": "cp950", + "/UniCNS-UTF16-H": "utf-16-be", # TBC + "/UniCNS-UTF16-V": "utf-16-be", # TBC # UCS2 in code } - # manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz _default_fonts_space_width: Dict[str, int] = { "/Courier": 600, From fdbf37c57d9cd2be0ad48ab9ff0bdd12163c2a7d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 22 Jun 2024 12:09:53 +0200 Subject: [PATCH 2/2] from comments --- pypdf/_cmap.py | 16 ++++++++-------- tests/test_cmap.py | 7 +++++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 9ff3c62d3..9a2d10a61 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -113,20 +113,20 @@ def build_char_map_from_dict( _predefined_cmap: Dict[str, str] = { "/Identity-H": "utf-16-be", "/Identity-V": "utf-16-be", - "/GB-EUC-H": "gbk", # TBC - "/GB-EUC-V": "gbk", # TBC - "/GBpc-EUC-H": "gb2312", # TBC - "/GBpc-EUC-V": "gb2312", # TBC - "/GBK-EUC-H": "gbk", # TBC - "/GBK-EUC-V": "gbk", # TBC + "/GB-EUC-H": "gbk", + "/GB-EUC-V": "gbk", + "/GBpc-EUC-H": "gb2312", + "/GBpc-EUC-V": "gb2312", + "/GBK-EUC-H": "gbk", + "/GBK-EUC-V": "gbk", "/GBK2K-H": "gb18030", "/GBK2K-V": "gb18030", "/ETen-B5-H": "cp950", "/ETen-B5-V": "cp950", "/ETenms-B5-H": "cp950", "/ETenms-B5-V": "cp950", - "/UniCNS-UTF16-H": "utf-16-be", # TBC - "/UniCNS-UTF16-V": "utf-16-be", # TBC + "/UniCNS-UTF16-H": "utf-16-be", + "/UniCNS-UTF16-V": "utf-16-be", # UCS2 in code } diff --git a/tests/test_cmap.py b/tests/test_cmap.py index b00ba6abb..9dcfb252d 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -199,3 +199,10 @@ def test_ignoring_non_put_entries(): """Issue #2290""" reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf"))) reader.pages[0].extract_text() + + +@pytest.mark.enable_socket() +def test_eten_b5(): + """Issue #2356""" + reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf"))) + reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險")