290 lines
8.3 KiB
Python
290 lines
8.3 KiB
Python
|
|
"""
|
||
|
|
Page labels are shown by PDF viewers as "the page number".
|
||
|
|
|
||
|
|
A page has a numeric index, starting at 0. Additionally, the page
|
||
|
|
has a label. In the simplest case:
|
||
|
|
|
||
|
|
label = index + 1
|
||
|
|
|
||
|
|
However, the title page and the table of contents might have Roman numerals as
|
||
|
|
page labels. This makes things more complicated.
|
||
|
|
|
||
|
|
Example 1
|
||
|
|
---------
|
||
|
|
|
||
|
|
>>> reader.root_object["/PageLabels"]["/Nums"]
|
||
|
|
[0, IndirectObject(18, 0, 139929798197504),
|
||
|
|
8, IndirectObject(19, 0, 139929798197504)]
|
||
|
|
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
|
||
|
|
{'/S': '/r'}
|
||
|
|
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
|
||
|
|
{'/S': '/D'}
|
||
|
|
|
||
|
|
Example 2
|
||
|
|
---------
|
||
|
|
The following is a document with pages labeled
|
||
|
|
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
|
||
|
|
|
||
|
|
1 0 obj
|
||
|
|
<< /Type /Catalog
|
||
|
|
/PageLabels << /Nums [
|
||
|
|
0 << /S /r >>
|
||
|
|
4 << /S /D >>
|
||
|
|
7 << /S /D
|
||
|
|
/P ( A- )
|
||
|
|
/St 8
|
||
|
|
>>
|
||
|
|
% A number tree containing
|
||
|
|
% three page label dictionaries
|
||
|
|
]
|
||
|
|
>>
|
||
|
|
...
|
||
|
|
>>
|
||
|
|
endobj
|
||
|
|
|
||
|
|
|
||
|
|
§12.4.2 PDF Specification 1.7 and 2.0
|
||
|
|
=====================================
|
||
|
|
|
||
|
|
Entries in a page label dictionary
|
||
|
|
----------------------------------
|
||
|
|
The /S key:
|
||
|
|
D Decimal Arabic numerals
|
||
|
|
R Uppercase Roman numerals
|
||
|
|
r Lowercase Roman numerals
|
||
|
|
A Uppercase letters (A to Z for the first 26 pages,
|
||
|
|
AA to ZZ for the next 26, and so on)
|
||
|
|
a Lowercase letters (a to z for the first 26 pages,
|
||
|
|
aa to zz for the next 26, and so on)
|
||
|
|
"""
|
||
|
|
|
||
|
|
from collections.abc import Iterator
|
||
|
|
from typing import Optional, cast
|
||
|
|
|
||
|
|
from ._protocols import PdfCommonDocProtocol
|
||
|
|
from ._utils import logger_warning
|
||
|
|
from .generic import (
|
||
|
|
ArrayObject,
|
||
|
|
DictionaryObject,
|
||
|
|
NullObject,
|
||
|
|
NumberObject,
|
||
|
|
is_null_or_none,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def number2uppercase_roman_numeral(num: int) -> str:
    """
    Convert a positive integer to an uppercase Roman numeral.

    Subtractive notation is used (4 -> "IV", 900 -> "CM"). There is no
    symbol above "M", so values of 4000 and more simply repeat "M".

    Args:
        num: The number to convert.

    Returns:
        The Roman numeral, e.g. "XIV" for 14. An empty string is returned
        for zero and for negative numbers, which have no Roman representation.

    """
    # Guard against non-positive input: floor division of a negative number
    # would otherwise "borrow" from the next symbol and yield a bogus numeral.
    if num <= 0:
        return ""

    numerals = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )

    parts = []
    for decimal, symbol in numerals:
        # Greedily take as many of the current symbol as fit.
        count, num = divmod(num, decimal)
        parts.append(symbol * count)
        if num == 0:
            break
    return "".join(parts)
|
||
|
|
|
||
|
|
|
||
|
|
def number2lowercase_roman_numeral(number: int) -> str:
    """
    Convert a number to a lowercase Roman numeral.

    Args:
        number: The number to convert.

    Returns:
        The result of :func:`number2uppercase_roman_numeral` in lowercase,
        e.g. "iv" for 4.

    """
    uppercase = number2uppercase_roman_numeral(number)
    return uppercase.lower()
|
||
|
|
|
||
|
|
|
||
|
|
def number2uppercase_letter(number: int) -> str:
    """
    Convert a positive integer to an uppercase letter page label.

    Follows the "/A" numbering style of the PDF specification: A to Z for
    the first 26 pages, AA to ZZ for the next 26, AAA to ZZZ after that,
    and so on. The letter is repeated; it is *not* a base-26 number, so
    28 is "BB" rather than "AB".

    Args:
        number: The 1-based number to convert.

    Returns:
        The letter label, e.g. "C" for 3 or "BB" for 28.

    Raises:
        ValueError: If ``number`` is not positive.

    """
    if number <= 0:
        raise ValueError("Expecting a positive number")
    # ``cycle`` counts how many complete A..Z runs precede this number,
    # ``offset`` is the position of the letter inside the current run.
    cycle, offset = divmod(number - 1, 26)
    return chr(ord("A") + offset) * (cycle + 1)
|
||
|
|
|
||
|
|
|
||
|
|
def number2lowercase_letter(number: int) -> str:
    """
    Convert a positive integer to a lowercase letter page label.

    Args:
        number: The 1-based number to convert.

    Returns:
        The result of :func:`number2uppercase_letter` in lowercase.

    Raises:
        ValueError: Propagated from :func:`number2uppercase_letter` when
            ``number`` is not positive.

    """
    uppercase = number2uppercase_letter(number)
    return uppercase.lower()
|
||
|
|
|
||
|
|
|
||
|
|
def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    """
    Determine the label of a page from the /Nums array of a number tree node.

    [Nums] shall be an array of the form
    [ key_1 value_1 key_2 value_2 ... key_n value_n ]
    where each key_i is an integer and the corresponding
    value_i shall be the object associated with that key.
    The keys shall be sorted in numerical order,
    analogously to the arrangement of keys in a name tree
    as described in 7.9.6, "Name Trees."

    Args:
        dictionary_object: A number tree node containing a "/Nums" entry.
        index: The 0-based index of the page.

    Returns:
        The label of the page. If the array does not follow the
        specification (empty, value is not a dictionary, unknown numbering
        style), ``str(index + 1)`` is returned as a fallback.

    """
    nums = cast(ArrayObject, dictionary_object["/Nums"])
    start_index = 0
    value = None
    i = 0
    # Only look at complete (key, value) pairs: a dangling key at the end of
    # a malformed array is ignored instead of raising an IndexError.
    while i + 1 < len(nums):
        start_index = nums[i]
        value = nums[i + 1].get_object()
        # Stop at the last pair, or as soon as the next range starts
        # behind the requested page.
        if i + 2 == len(nums) or nums[i + 2] > index:
            break
        i += 2

    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return str(index + 1)  # Fallback

    converters = {
        None: lambda _: "",  # no /S entry: the label consists of the prefix only
        "/D": str,
        "/R": number2uppercase_roman_numeral,
        "/r": number2lowercase_roman_numeral,
        "/A": number2uppercase_letter,
        "/a": number2lowercase_letter,
    }
    converter = converters.get(value.get("/S"))
    if converter is None:
        # Unknown numbering style: treat it like any other violation of the
        # specification instead of failing with a KeyError.
        return str(index + 1)  # Fallback

    start = value.get("/St", 1)
    prefix = value.get("/P", "")
    # The numeric portion counts up from /St at the first page of the range.
    return prefix + converter(index - start_index + start)
|
||
|
|
|
||
|
|
|
||
|
|
def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
    """
    Look up the page label for a page index.

    The /PageLabels entry of the document catalog is a number tree, see
    7.9.7 "Number Trees". Either the node carries the /Nums array itself, or
    it refers to /Kids whose /Limits tell which kid covers the page index.

    Args:
        reader: The PdfReader
        index: The index of the page

    Returns:
        The label of the page, e.g. "iv" or "4". If the document has no
        page labels, or none can be determined, ``str(index + 1)`` is
        returned.

    Raises:
        NotImplementedError: If the number tree is nested 100 levels deep.

    """
    catalog = cast(DictionaryObject, reader.root_object)
    if "/PageLabels" not in catalog:
        return str(index + 1)  # Fallback

    node = cast(DictionaryObject, catalog["/PageLabels"].get_object())
    if "/Nums" in node:
        return get_label_from_nums(node, index)

    has_kids = "/Kids" in node and not isinstance(node["/Kids"], NullObject)
    # node = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
    # Limit maximum depth.
    depth = 0
    while has_kids and depth < 100:
        # Find the first kid whose limits enclose the page index.
        # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
        matching = None
        for kid in cast(list[DictionaryObject], node["/Kids"]):
            limits = cast(list[int], kid["/Limits"])
            if limits[0] <= index <= limits[1]:
                matching = kid
                break

        if matching is None:
            # No kid covers this page: continue with the fallback.
            break

        if is_null_or_none(matching.get("/Kids", None)):
            # Leaf node: it holds the /Nums array for this page.
            return get_label_from_nums(matching, index)

        # Recursive definition: descend one level and search again.
        depth += 1
        if depth == 100:  # pragma: no cover
            raise NotImplementedError("Too deep nesting is not supported.")
        node = matching

    logger_warning(f"Could not reliably determine page label for {index}.", __name__)
    return str(index + 1)  # Fallback if neither /Nums nor /Kids is in the number_tree
|
||
|
|
|
||
|
|
|
||
|
|
def nums_insert(
    key: NumberObject,
    value: DictionaryObject,
    nums: ArrayObject,
) -> None:
    """
    Put a (key, value) pair into a Nums array, keeping the keys sorted.

    If the key is already present, its value is replaced.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        value: value of the entry
        nums: Nums array to modify

    Raises:
        ValueError: If ``nums`` has an odd number of elements.

    """
    size = len(nums)
    if size % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")

    # Walk backwards over the keys until one smaller than ``key`` is found.
    position = size
    while position != 0 and key <= nums[position - 2]:
        position -= 2

    if position < size and key == nums[position]:
        # Existing entry: overwrite its value.
        nums[position + 1] = value
        return

    nums.insert(position, key)
    nums.insert(position + 1, value)
|
||
|
|
|
||
|
|
|
||
|
|
def nums_clear_range(
    key: NumberObject,
    page_index_to: int,
    nums: ArrayObject,
) -> None:
    """
    Remove all entries in a number tree in a range after an entry.

    The entry for ``key`` itself is kept. The entries directly following it
    are removed as long as their key is not greater than ``page_index_to``.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry before the range
        page_index_to: The page index of the upper limit of the range
        nums: Nums array to modify

    Raises:
        ValueError: If ``nums`` has an odd number of elements, if
            ``page_index_to`` is smaller than ``key``, or if ``key`` is not
            a key of ``nums``.

    """
    if len(nums) % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")
    if page_index_to < key:
        raise ValueError("page_index_to must be greater or equal than key")

    # Keys live at the even positions only. Searching the whole array could
    # match a *value* that happens to compare equal to the key.
    for position in range(0, len(nums), 2):
        if nums[position] == key:
            break
    else:
        raise ValueError(f"{key!r} is not a key of the nums array")

    first = position + 2
    last = first
    while last < len(nums) and nums[last] <= page_index_to:
        last += 2
    # Drop all affected (key, value) pairs in one go.
    del nums[first:last]
|
||
|
|
|
||
|
|
|
||
|
|
def nums_next(
    key: NumberObject,
    nums: ArrayObject,
) -> tuple[Optional[NumberObject], Optional[DictionaryObject]]:
    """
    Return the (key, value) pair of the entry after the given one.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        nums: Nums array

    Returns:
        The (key, value) pair following the entry of ``key``, or
        ``(None, None)`` if that entry is the last one.

    Raises:
        ValueError: If ``nums`` has an odd number of elements or if ``key``
            is not a key of ``nums``.

    """
    if len(nums) % 2 != 0:
        raise ValueError("A nums like array must have an even number of elements")

    # Keys live at the even positions only. Searching the whole array could
    # match a *value* that happens to compare equal to the key.
    for position in range(0, len(nums), 2):
        if nums[position] == key:
            following = position + 2
            if following < len(nums):
                return (nums[following], nums[following + 1])
            return (None, None)
    raise ValueError(f"{key!r} is not a key of the nums array")
|