Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add support for /Kids in page labels #2562

Merged
merged 13 commits into from
Apr 3, 2024
112 changes: 68 additions & 44 deletions pypdf/_page_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@
aa to zz for the next 26, and so on)
"""

from typing import Iterator, Optional, Tuple, cast
from typing import Iterator, List, Optional, Tuple, cast

from ._protocols import PdfCommonDocProtocol
from ._utils import logger_warning
from .generic import ArrayObject, DictionaryObject, NumberObject
from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject


def number2uppercase_roman_numeral(num: int) -> str:
Expand Down Expand Up @@ -116,6 +116,42 @@ def number2lowercase_letter(number: int) -> str:
return number2uppercase_letter(number).lower()


def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    """
    Return the label of page ``index`` from the ``/Nums`` array of a number tree node.

    ``/Nums`` shall be an array of the form
    ``[ key_1 value_1 key_2 value_2 ... key_n value_n ]``
    where each key_i is an integer and the corresponding value_i shall be
    the object associated with that key. The keys shall be sorted in
    numerical order, analogously to the arrangement of keys in a name tree
    as described in 7.9.6, "Name Trees."

    Args:
        dictionary_object: Number tree node holding the ``/Nums`` array.
        index: Zero-based page index.

    Returns:
        The page label. ``str(index + 1)`` is returned as a fallback when
        ``/Nums`` is empty, when the selected value is not a dictionary, or
        when the numbering style in ``/S`` is not one of the known styles.
    """
    nums = cast(ArrayObject, dictionary_object["/Nums"])
    value = None
    start_index = 0
    # Step over complete (key, value) pairs only. A dangling key without a
    # value at the end of a malformed array is ignored instead of raising
    # an IndexError.
    for i in range(0, len(nums) - 1, 2):
        # The first pair is always taken; later pairs only as long as their
        # key does not lie beyond the requested index.
        if i > 0 and nums[i] > index:
            break
        start_index = nums[i]
        value = nums[i + 1].get_object()
    m = {
        None: lambda n: "",
        "/D": lambda n: str(n),
        "/R": number2uppercase_roman_numeral,
        "/r": number2lowercase_roman_numeral,
        "/A": number2uppercase_letter,
        "/a": number2lowercase_letter,
    }
    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return str(index + 1)  # Fallback
    style = value.get("/S")
    if style not in m:
        # An unknown numbering style is off-spec; use the same fallback as
        # above instead of failing with a KeyError.
        logger_warning(
            f"Unknown page label style {style!r} for page index {index}.",
            __name__,
        )
        return str(index + 1)  # Fallback
    start = value.get("/St", 1)
    prefix = value.get("/P", "")
    return prefix + m[style](index - start_index + start)


def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
"""
See 7.9.7 "Number Trees".
Expand All @@ -132,49 +168,37 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
return str(index + 1) # Fallback
number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
if "/Nums" in number_tree:
# [Nums] shall be an array of the form
# [ key 1 value 1 key 2 value 2 ... key n value n ]
# where each key_i is an integer and the corresponding
# value_i shall be the object associated with that key.
# The keys shall be sorted in numerical order,
# analogously to the arrangement of keys in a name tree
# as described in 7.9.6, "Name Trees."
nums = cast(ArrayObject, number_tree["/Nums"])
i = 0
value = None
start_index = 0
while i < len(nums):
start_index = nums[i]
value = nums[i + 1].get_object()
if i + 2 == len(nums):
return get_label_from_nums(number_tree, index)
if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
# number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
# Limit maximum depth.
level = 0
while level < 100:
kids = cast(List[DictionaryObject], number_tree["/Kids"])
for kid in kids:
# kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
limits = cast(List[int], kid["/Limits"])
if limits[0] <= index <= limits[1]:
if kid.get("/Kids", None) is not None:
# Recursive definition.
level += 1
if level == 100: # pragma: no cover
raise NotImplementedError("Too deep nesting is not supported.")
number_tree = kid
# Exit the inner `for` loop and continue at the next level with the
# next iteration of the `while` loop.
break
return get_label_from_nums(kid, index)
else:
# When there are no kids, make sure to exit the `while` loop directly
# and continue with the fallback.
break
if nums[i + 2] > index:
break
i += 2
m = {
None: lambda n: "",
"/D": lambda n: str(n),
"/R": number2uppercase_roman_numeral,
"/r": number2lowercase_roman_numeral,
"/A": number2uppercase_letter,
"/a": number2lowercase_letter,
}
# if /Nums array is not following the specification or if /Nums is empty
if not isinstance(value, dict):
return str(index + 1) # Fallback
start = value.get("/St", 1)
prefix = value.get("/P", "")
return prefix + m[value.get("/S")](index - start_index + start)
if "/Kids" in number_tree or "/Limits" in number_tree:
logger_warning(
(
"/Kids or /Limits found in PageLabels. "
"This is not yet supported."
),
__name__,
)
# TODO: Implement /Kids and /Limits for number tree
return str(index + 1) # Fallback if /Nums is not in the number_tree

logger_warning(
f"Could not reliably determine page label for {index}.",
__name__
)
return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree


def nums_insert(
Expand Down
51 changes: 51 additions & 0 deletions tests/test_page_labels.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Test the pypdf._page_labels module."""
from io import BytesIO
from pathlib import Path

import pytest

from pypdf import PdfReader
from pypdf._page_labels import (
get_label_from_nums,
index2label,
number2lowercase_letter,
number2lowercase_roman_numeral,
Expand All @@ -15,6 +17,7 @@
nums_next,
)
from pypdf.generic import (
ArrayObject,
DictionaryObject,
NameObject,
NullObject,
Expand All @@ -23,6 +26,10 @@

from . import get_data_from_url

# Directory containing this test module, its parent directory, and the
# "resources" directory below that parent.
TESTS_ROOT = Path(__file__).parent.resolve()
PROJECT_ROOT = TESTS_ROOT.parent
RESOURCE_ROOT = PROJECT_ROOT / "resources"


@pytest.mark.parametrize(
("number", "expected"),
Expand Down Expand Up @@ -103,3 +110,47 @@ def test_index2label(caplog):
r.trailer["/Root"]["/PageLabels"][NameObject("/Kids")] = NullObject()
assert index2label(r, 1) == "2"
assert caplog.text != ""


@pytest.mark.enable_socket()
def test_index2label_kids():
    """Page labels are resolved through a single level of /Kids nodes."""
    url = "https://www.bk.admin.ch/dam/bk/de/dokumente/terminologie/publikation_25_jahre_rtd.pdf.download.pdf/Terminologie_Epochen,%20Schwerpunkte,%20Umsetzungen.pdf"  # noqa: E501
    pdf_bytes = get_data_from_url(url=url, name="index2label_kids.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))

    # Some page labels are unused. Filtering them out is still easier than
    # spelling out the whole list here.
    unused_labels = {
        "20", "44", "58", "82", "94", "116", "154", "166", "192", "224", "250",
    }
    front_matter = [
        "C1",
        "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
        "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII",
    ]
    numbered = [
        label
        for label in (str(number) for number in range(1, 284))
        if label not in unused_labels
    ]

    assert reader.page_labels == front_matter + numbered


@pytest.mark.enable_socket()
def test_index2label_kids__recursive(caplog):
    """Page labels are resolved through nested /Kids nodes; something gets logged."""
    url = "https://github.com/py-pdf/pypdf/files/14842446/tt1.pdf"
    pdf_bytes = get_data_from_url(url=url, name="index2label_kids_recursive.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))

    # Sixteen letter labels "A" to "P", followed by three decimal labels.
    letter_labels = list("ABCDEFGHIJKLMNOP")
    decimal_labels = ["17", "18", "19"]

    assert reader.page_labels == letter_labels + decimal_labels
    assert caplog.text != ""


def test_get_label_from_nums__empty_nums_list():
    """An empty /Nums array yields the one-based page number as the label."""
    node = DictionaryObject({NameObject("/Nums"): ArrayObject()})
    label = get_label_from_nums(node, 13)
    assert label == "14"


def test_index2label__empty_kids_list():
    """An empty /Kids array yields the one-based page number as the label."""
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    # Install a /PageLabels number tree whose only entry is an empty /Kids array.
    page_labels = DictionaryObject({NameObject("/Kids"): ArrayObject()})
    reader.root_object[NameObject("/PageLabels")] = page_labels

    label = index2label(reader, 42)
    assert label == "43"
Loading