datalab-to · stancld · Oct 7, 2025
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -7,6 +7,9 @@
 from pdftext.schema import Bbox, Char, Chars, Spans, Span
 
 
+MAX_UNICODE_INT = 1114111  # 0x10FFFF, the highest code point accepted by chr()
+
+
 def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
     chars: Chars = []
 
@@ -15,7 +18,7 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
     page_height = math.ceil(abs(y_end - y_start))
 
     for i in range(textpage.count_chars()):
-        text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))
+        text = utf8_int_to_string(pdfium_c.FPDFText_GetUnicode(textpage, i))
 
         rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
         loosebox = (rotation == 0) and (text != "'" or quote_loosebox)
@@ -117,3 +120,35 @@ def word_break():
             deduped.append(word)
 
     return [char for word in deduped for char in word['chars']]
+
+
+def utf8_int_to_string(utf8_int: int) -> str:
+    """Convert a value returned by ``FPDFText_GetUnicode`` to a string.
+
+    Values up to ``MAX_UNICODE_INT`` are treated as Unicode code points and converted with
+    ``chr``. Larger values are treated as UTF-8 bytes packed big-endian into the integer and
+    are decoded; undecodable bytes become U+FFFD instead of raising ``UnicodeDecodeError``.
+
+    Parameters
+    ----------
+    utf8_int
+        Unsigned 32-bit integer value from FPDFText_GetUnicode that may be either a valid Unicode
+        code point, or UTF-8 bytes packed as an integer.
+
+    Returns
+    -------
+    The decoded character or string.
+
+    Examples
+    --------
+    >>> utf8_int_to_string(65)  # Valid Unicode
+    'A'
+    >>> utf8_int_to_string(15112101)  # UTF-8 bytes for '日'
+    '日'
+    """
+    if utf8_int <= MAX_UNICODE_INT:
+        return chr(utf8_int)
+    # Round the bit length up to whole bytes so no leading byte is dropped.
+    byte_length = (utf8_int.bit_length() + 7) // 8
+    bytes_obj = utf8_int.to_bytes(byte_length, "big")
+    return bytes_obj.decode("utf-8", errors="replace")
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -9,6 +9,10 @@ def pdf_path():
 def pdf_path2():
     return "tests/data/communication.pdf"
 
+@pytest.fixture(scope="session")
+def pdf_with_non_unicode_chars():
+    return "tests/data/non_unicode_chars.pdf"  # PDF requested by test_parsing_non_unicode_chars
+
 @pytest.fixture()
 def pdf_doc(pdf_path):
     doc = pdfium.PdfDocument(pdf_path)

diff --git a/tests/data/non_unicode_chars.pdf b/tests/data/non_unicode_chars.pdf
diff --git a/tests/pdf/test_chars.py b/tests/pdf/test_chars.py
@@ -0,0 +1,28 @@
+import pytest
+
+from pdftext.pdf.chars import utf8_int_to_string
+
+@pytest.mark.unit
+@pytest.mark.parametrize(
+    "utf8_int, expected_str",
+    [
+        (65, "A"),  # uppercase letter
+        (97, "a"),  # lowercase letter
+        (51, "3"),  # digit
+        (64, "@"),  # special character
+        # 3-byte UTF-8 packed into an integer
+        (15112101, "日"),  # 0xE697A5, Japanese character for "day"
+        (14989485, "中"),  # 0xE4B8AD, Chinese character for "middle"
+        (15111815, "文"),  # 0xE69687, Chinese character for "text/writing"
+        (15113388, "本"),  # 0xE69CAC, Chinese character for "book/origin"
+        (15570332, "한"),  # 0xED959C, Korean Hangul syllable
+        (14990232, "付"),  # 0xE4BB98, Japanese character for "attach/give" (part of date 日付)
+        # 4-byte UTF-8 packed into an integer
+        (4036991104, "😀"),  # 0xF09F9880, grinning face emoji
+        (4036991616, "🚀"),  # 0xF09F9A80, rocket emoji
+        (4036859279, "𝕏"),  # 0xF09D958F, mathematical double-struck capital X
+        (4037057678, "𠜎"),  # 0xF0A09C8E, CJK ideograph
+    ],
+)
+def test_utf8_int_to_string(utf8_int: int, expected_str: str) -> None:
+    assert utf8_int_to_string(utf8_int) == expected_str
diff --git a/tests/test_extraction.py b/tests/test_extraction.py
@@ -42,3 +42,10 @@ def test_line_joining(pdf_path2):
     text = plain_text_output(pdf_path2, page_range=pages).lower()
     assert "the axis media control viewer toolbar" in text
     assert "axismediacontrolviewertoolbar" not in text
+
+
+def test_parsing_non_unicode_chars(pdf_with_non_unicode_chars):
+    text = plain_text_output(pdf_with_non_unicode_chars).lower()  # lower() leaves CJK text as-is
+    assert "日" in text  # same character as the 15112101 case in tests/pdf/test_chars.py
+    assert "付" in text  # same character as the 14990232 case in tests/pdf/test_chars.py
+