194 lines
8.3 KiB
Python
194 lines
8.3 KiB
Python
|
|
from collections.abc import Sequence
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from typing import Any, Optional, Union, cast
|
||
|
|
|
||
|
|
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
|
||
|
|
|
||
|
|
from .errors import ParseError
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True)
class FontDescriptor:
    """
    Represents the FontDescriptor dictionary as defined in the PDF specification.
    This contains both descriptive and metric information.

    The defaults are derived from the mean values of the 14 core fonts, rounded
    to 100.

    ``character_widths`` maps a character to its width. If it contains the key
    ``"default"``, :meth:`text_width` uses that value for every character that
    has no entry of its own.
    """

    name: str = "Unknown"
    family: str = "Unknown"
    weight: str = "Unknown"

    ascent: float = 700.0
    descent: float = -200.0
    cap_height: float = 600.0
    x_height: float = 500.0
    italic_angle: float = 0.0  # Non-italic
    flags: int = 32  # Non-serif, non-symbolic, not fixed width
    bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

    character_widths: dict[str, int] = field(default_factory=dict)

    @staticmethod
    def _parse_font_descriptor(
        font_kwargs: dict[str, Any], font_descriptor_obj: "DictionaryObject"
    ) -> dict[str, Any]:
        """
        Copy the known entries of a font descriptor into ``font_kwargs``.

        Args:
            font_kwargs: Constructor keyword arguments collected so far.
                Updated in place and also returned.
            font_descriptor_obj: The ``/FontDescriptor`` dictionary, or an
                indirect reference to it.

        Returns:
            ``font_kwargs`` with every descriptor entry that is present stored
            under its attribute name, and ``bbox`` (if present) converted to a
            tuple of floats.

        Raises:
            ParseError: If ``/FontBBox`` is present but does not contain
                exactly four elements.
        """
        font_descriptor_dict: DictionaryObject = (
            font_descriptor_obj.get_object()
            if isinstance(font_descriptor_obj, IndirectObject)
            else font_descriptor_obj
        )
        for source_key, target_key in (
            ("/FontName", "name"),
            ("/FontFamily", "family"),
            ("/FontWeight", "weight"),
            ("/Ascent", "ascent"),
            ("/Descent", "descent"),
            ("/CapHeight", "cap_height"),
            ("/XHeight", "x_height"),
            ("/ItalicAngle", "italic_angle"),
            ("/Flags", "flags"),
            ("/FontBBox", "bbox"),
        ):
            if source_key in font_descriptor_dict:
                font_kwargs[target_key] = font_descriptor_dict[source_key]
        # from_font_resource() falls back to an empty descriptor when a font has
        # none, so /FontBBox may legitimately be absent here. Only normalise it
        # when it was found; otherwise the dataclass default is used.
        if "bbox" in font_kwargs:
            bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
            # Explicit check instead of `assert`, which is removed under `-O`.
            if len(bbox_tuple) != 4:
                raise ParseError(f"Invalid /FontBBox, expected 4 numbers: {bbox_tuple}")
            font_kwargs["bbox"] = bbox_tuple
        return font_kwargs

    @staticmethod
    def _collect_cid_character_widths(
        d_font: "DictionaryObject", char_map: dict[Any, Any], current_widths: dict[str, int]
    ) -> None:
        """
        Parse the /W array from a DescendantFont dictionary and update character widths.

        Args:
            d_font: The descendant font dictionary; its ``/W`` entry is read
                (a missing entry is treated as an empty array).
            char_map: Character map. Only entries whose key is a string are
                used; the key's code point is matched against the character
                indices found in ``/W`` and the mapped value becomes the key
                in ``current_widths``.
            current_widths: Width mapping that is updated in place.

        Raises:
            ParseError: If a start index is followed by neither a width list
                nor a ``stop width`` pair of numbers.
            IndexError: If the array ends while more elements are expected.
        """
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in char_map.items()
            if isinstance(_target, str)
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        skip_count = 0
        _w = d_font.get("/W", [])
        for idx, w_entry in enumerate(_w):
            w_entry = w_entry.get_object()
            if skip_count:
                # This element was already consumed as part of the previous definition.
                skip_count -= 1
                continue
            if not isinstance(w_entry, (int, float)):  # pragma: no cover
                # Not expected to be reachable: non-numeric elements are consumed
                # through skip_count above. Silently ignored for now.
                continue
            # check for format (1): `int [int int int int ...]`
            w_next_entry = _w[idx + 1].get_object()
            if isinstance(w_next_entry, Sequence):
                start_idx, width_list = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in zip(
                            range(
                                cast(int, start_idx),
                                cast(int, start_idx) + len(width_list),
                                1,
                            ),
                            width_list,
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 1
            # check for format (2): `int int int`
            # The third element is resolved once here and reused below.
            elif isinstance(w_next_entry, (int, float)) and isinstance(
                const_width := _w[idx + 2].get_object(), (int, float)
            ):
                start_idx, stop_idx = w_entry, w_next_entry
                current_widths.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(
                            cast(int, start_idx), cast(int, stop_idx + 1), 1
                        )
                        if _cidx in ord_map
                    }
                )
                skip_count = 2
            else:
                # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
                # while expecting more elements). This raises an IndexError which is sufficient.
                raise ParseError(
                    f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
                )  # pragma: no cover

    @classmethod
    def from_font_resource(
        cls,
        pdf_font_dict: "DictionaryObject",
        encoding: Optional[Union[str, dict[int, str]]] = None,
        char_map: Optional[dict[Any, Any]] = None
    ) -> "FontDescriptor":
        """
        Build a font descriptor from a PDF font resource dictionary.

        The sources are tried in this order:

        1. ``/Type1``, ``/MMType1``, ``/TrueType`` and ``/Type3`` fonts that
           have a ``/FontDescriptor``: widths come from ``/Widths`` (only when
           the encoding is a dictionary), the remaining values from the
           descriptor.
        2. Fonts whose ``/BaseFont`` name (without the leading ``/``) is a key
           of ``CORE_FONT_METRICS``: that entry is returned as is.
        3. Fonts with ``/DescendantFonts``: widths come from each descendant's
           ``/W`` array, the remaining values from each descendant's
           ``/FontDescriptor`` if it has one.

        Anything that is not found keeps the dataclass default.

        Args:
            pdf_font_dict: The font resource dictionary.
            encoding: Encoding of the font. Looked up with ``get_encoding``
                unless both ``encoding`` and ``char_map`` are given and truthy.
            char_map: Character map of the font; see ``encoding``.

        Returns:
            The font descriptor for the font.

        Raises:
            ParseError: If a ``/FontBBox`` or ``/W`` entry is malformed.

        Note:
            The entries of ``/DescendantFonts`` are replaced in place by their
            resolved objects.
        """
        from pypdf._cmap import get_encoding  # noqa: PLC0415
        from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS  # noqa: PLC0415
        # Prioritize information from the PDF font dictionary
        font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/")
        font_kwargs: dict[str, Any] = {"character_widths": {}}

        # Deal with fonts by type; Type1, TrueType and certain Type3
        if pdf_font_dict.get("/Subtype") in ("/Type1", "/MMType1", "/TrueType", "/Type3"):
            if "/FontDescriptor" in pdf_font_dict:
                # Collect character widths - TrueType and Type1 fonts
                # have a /Widths array mapping character codes to widths
                if not (encoding and char_map):
                    encoding, char_map = get_encoding(pdf_font_dict)
                if isinstance(encoding, dict) and "/Widths" in pdf_font_dict:
                    first_char = pdf_font_dict.get("/FirstChar", 0)
                    font_kwargs["character_widths"] = {
                        encoding.get(idx + first_char, chr(idx + first_char)): width
                        for idx, width in enumerate(cast(ArrayObject, pdf_font_dict["/Widths"]))
                    }
                # Collect font descriptor
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, pdf_font_dict.get("/FontDescriptor", DictionaryObject())
                )
                return cls(**font_kwargs)

        if font_name in CORE_FONT_METRICS:
            return CORE_FONT_METRICS[font_name]

        # Composite font or CID font
        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in pdf_font_dict:
            if not (encoding and char_map):
                encoding, char_map = get_encoding(pdf_font_dict)
            d_font: DictionaryObject
            for d_font_idx, d_font in enumerate(
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])
            ):
                d_font = cast(DictionaryObject, d_font.get_object())
                # Store the resolved object back into the array.
                cast(ArrayObject, pdf_font_dict["/DescendantFonts"])[d_font_idx] = d_font
                # Collect character widths
                cls._collect_cid_character_widths(
                    d_font, char_map, font_kwargs["character_widths"]
                )
                # Collect font descriptor; a descendant without one contributes
                # nothing and leaves the defaults in place.
                font_kwargs = cls._parse_font_descriptor(
                    font_kwargs, d_font.get("/FontDescriptor", DictionaryObject())
                )

        return cls(**font_kwargs)

    def text_width(self, text: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied text.

        Args:
            text: The text to measure.

        Returns:
            The summed width as a float. Characters without an entry in
            ``character_widths`` count with the ``"default"`` entry, or 0 if
            there is none.
        """
        # Look the fallback up once instead of once per character.
        default_width = self.character_widths.get("default", 0)
        return sum(
            (self.character_widths.get(char, default_width) for char in text), 0.0
        )
|