Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from pdftext.schema import Bbox, Char, Chars, Spans, Span


MAX_UNICODE_INT = 1114111 # 0x10ffff


def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
chars: Chars = []

Expand All @@ -15,7 +18,7 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
page_height = math.ceil(abs(y_end - y_start))

for i in range(textpage.count_chars()):
text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))
text = utf8_int_to_string(pdfium_c.FPDFText_GetUnicode(textpage, i))

rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
loosebox = (rotation == 0) and (text != "'" or quote_loosebox)
Expand Down Expand Up @@ -117,3 +120,35 @@ def word_break():
deduped.append(word)

return [char for word in deduped for char in word['chars']]


def utf8_int_to_string(utf8_int: int) -> str:
    """Decode an integer returned by ``FPDFText_GetUnicode`` into a string.

    PDFium's ``FPDFText_GetUnicode`` returns an unsigned 32-bit integer.
    Integers <= 1114111 (0x10FFFF) are valid Unicode code points and are
    converted with the built-in ``chr``. Larger integers are treated as UTF-8
    bytes packed big-endian into the integer and are decoded as such.

    Parameters
    ----------
    utf8_int
        Unsigned 32-bit integer value from ``FPDFText_GetUnicode`` that may be
        either a valid Unicode code point, or UTF-8 bytes packed as an integer.

    Returns
    -------
    The decoded character or string. If the value is above the Unicode range
    and its bytes are not valid UTF-8, the Unicode replacement character
    (U+FFFD) is returned instead of raising, so that a single malformed glyph
    does not abort extraction of the surrounding text.

    Examples
    --------
    >>> utf8_int_to_string(65)  # Valid Unicode
    'A'
    >>> utf8_int_to_string(15112101)  # UTF-8 bytes for '日'
    '日'
    """
    if utf8_int <= MAX_UNICODE_INT:
        return chr(utf8_int)

    # Smallest number of whole bytes that can hold the value (ceil(bits / 8)).
    byte_length = (utf8_int.bit_length() + 7) // 8
    packed = utf8_int.to_bytes(byte_length, "big")
    try:
        return packed.decode("utf-8")
    except UnicodeDecodeError:
        # The value is neither a code point nor well-formed UTF-8; emit a
        # single placeholder so callers still get exactly one "character".
        return "\ufffd"
4 changes: 4 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ def pdf_path():
def pdf_path2():
    """Return the relative path of the ``communication.pdf`` test document."""
    sample_pdf = "tests/data/communication.pdf"
    return sample_pdf

@pytest.fixture(scope="session")
def pdf_with_non_unicode_chars():
    """Return the relative path of the ``non_unicode_chars.pdf`` test document."""
    sample_pdf = "tests/data/non_unicode_chars.pdf"
    return sample_pdf

@pytest.fixture()
def pdf_doc(pdf_path):
doc = pdfium.PdfDocument(pdf_path)
Expand Down
Binary file added tests/data/non_unicode_chars.pdf
Binary file not shown.
28 changes: 28 additions & 0 deletions tests/pdf/test_chars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pytest

from pdftext.pdf.chars import utf8_int_to_string

@pytest.mark.unit
@pytest.mark.parametrize(
    "utf8_int, expected_str",
    [
        # Values inside the Unicode range (plain code points)
        (65, "A"),  # uppercase letter
        (97, "a"),  # lowercase letter
        (51, "3"),  # digit
        (64, "@"),  # special character
        # 3-byte UTF-8 sequences packed into an integer
        (15112101, "日"),  # Japanese character for "day"
        (14989485, "中"),  # Chinese character for "middle"
        (15111815, "文"),  # Chinese character for "text/writing"
        (15113388, "本"),  # Chinese character for "book/origin"
        (15570332, "한"),  # Korean Hangul syllable
        (14990232, "付"),  # Japanese character for "attach/give" (part of the date word 日付)
        # 4-byte UTF-8 sequences packed into an integer
        (4036991104, "😀"),  # grinning face emoji
        (4036991616, "🚀"),  # rocket emoji
        (4036859279, "𝕏"),  # mathematical double-struck capital X
        (4037057678, "𠜎"),  # CJK ideograph outside the Basic Multilingual Plane
    ],
)
def test_utf8_int_to_string(utf8_int: int, expected_str: str) -> None:
    """Check that each integer input decodes to the expected string."""
    decoded = utf8_int_to_string(utf8_int)
    assert decoded == expected_str
7 changes: 7 additions & 0 deletions tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,10 @@ def test_line_joining(pdf_path2):
text = plain_text_output(pdf_path2, page_range=pages).lower()
assert "the axis media control viewer toolbar" in text
assert "axismediacontrolviewertoolbar" not in text


def test_parsing_non_unicode_chars(pdf_with_non_unicode_chars):
    """The plain-text output of the sample PDF must contain both 日 and 付."""
    extracted = plain_text_output(pdf_with_non_unicode_chars).lower()
    # Checked in the same order as before so the first missing character fails first.
    for expected_char in ("日", "付"):
        assert expected_char in extracted