Skip to content

Commit

Permalink
ENH: Accept ETen-B5 and UniCNS-UTF16 encodings (#2721)
Browse files Browse the repository at this point in the history
Related to #2356.
  • Loading branch information
pubpub-zz authored Jun 23, 2024
1 parent e0a92e2 commit 81f35f9
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 7 deletions.
19 changes: 12 additions & 7 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,18 +113,23 @@ def build_char_map_from_dict(
_predefined_cmap: Dict[str, str] = {
"/Identity-H": "utf-16-be",
"/Identity-V": "utf-16-be",
"/GB-EUC-H": "gbk", # TBC
"/GB-EUC-V": "gbk", # TBC
"/GBpc-EUC-H": "gb2312", # TBC
"/GBpc-EUC-V": "gb2312", # TBC
"/GBK-EUC-H": "gbk", # TBC
"/GBK-EUC-V": "gbk", # TBC
"/GB-EUC-H": "gbk",
"/GB-EUC-V": "gbk",
"/GBpc-EUC-H": "gb2312",
"/GBpc-EUC-V": "gb2312",
"/GBK-EUC-H": "gbk",
"/GBK-EUC-V": "gbk",
"/GBK2K-H": "gb18030",
"/GBK2K-V": "gb18030",
"/ETen-B5-H": "cp950",
"/ETen-B5-V": "cp950",
"/ETenms-B5-H": "cp950",
"/ETenms-B5-V": "cp950",
"/UniCNS-UTF16-H": "utf-16-be",
"/UniCNS-UTF16-V": "utf-16-be",
# UCS2 in code
}


# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
"/Courier": 600,
Expand Down
7 changes: 7 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,10 @@ def test_ignoring_non_put_entries():
"""Issue #2290"""
reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
reader.pages[0].extract_text()


@pytest.mark.enable_socket()
def test_eten_b5():
    """Issue #2356: text of a page is extracted as the expected Chinese string."""
    # NOTE(review): the fixture name says "iss2290" although this test is for
    # issue #2356 - confirm this is the intended sample file.
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
    text = reader.pages[0].extract_text()
    # The comparison must be asserted; a bare ``startswith`` call only
    # computes a boolean and throws it away, so the test could never fail.
    assert text.startswith("1/7 \n富邦新終身壽險")

0 comments on commit 81f35f9

Please sign in to comment.