import binascii
from binascii import Error as BinasciiError
from binascii import unhexlify
from math import ceil
from typing import Any, Union, cast
from ._codecs import adobe_glyphs, charset_encoding
from ._codecs.core_fontmetrics import CORE_FONT_METRICS
from ._utils import logger_error, logger_warning
from .generic import (
ArrayObject,
DecodedStreamObject,
DictionaryObject,
NullObject,
StreamObject,
is_null_or_none,
)
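
# This module builds the character maps and glyph-width maps that pypdf's text
# extraction relies on: it resolves a font's /Encoding (including predefined
# CMaps and /Differences), parses its /ToUnicode CMap, and derives
# per-character widths from /W, /Widths or the core font metrics.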
# code freely inspired by @twiggy; see #711
def build_char_map(
font_name: str, space_width: float, obj: DictionaryObject
) -> tuple[str, float, Union[str, dict[int, str]], dict[Any, Any], DictionaryObject]:
"""
Determine information about a font.
Args:
font_name: font name as a string
space_width: default space width if no data is found.
obj: XObject or Page where you can find a /Resource dictionary
Returns:
Font sub-type, space_width criteria (50% of width), encoding, map character-map, font-dictionary.
The font-dictionary itself is suitable for the curious.
"""
ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore
font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
space_width, ft
)
return font_subtype, font_halfspace, font_encoding, font_map, ft
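
# A minimal usage sketch (illustrative values only, not taken from any
# particular document): given a page or XObject with a /Resources dictionary,
# a call such as
#     subtype, half_space, encoding, cmap, font = build_char_map("/F1", 200.0, page)
# returns everything needed to decode strings drawn with that font; "/F1" and
# 200.0 are placeholder arguments.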
def build_char_map_from_dict(
space_width: float, ft: DictionaryObject
) -> tuple[str, float, Union[str, dict[int, str]], dict[Any, Any]]:
"""
Determine information about a font.
Args:
space_width: default space with if no data found
(normally half the width of a character).
ft: Font Dictionary
Returns:
Font sub-type, space_width criteria(50% of width), encoding, map character-map.
The font-dictionary itself is suitable for the curious.
"""
font_type = cast(str, ft["/Subtype"].get_object())
encoding, map_dict = get_encoding(ft)
space_key_char = get_actual_str_key(" ", encoding, map_dict)
font_width_map = build_font_width_map(ft, space_width * 2.0)
half_space_width = compute_space_width(font_width_map, space_key_char) / 2.0
return (
font_type,
half_space_width,
encoding,
# https://github.com/python/mypy/issues/4374
map_dict
)
# used when data is missing, e.g. when the font definition is absent
unknown_char_map: tuple[str, float, Union[str, dict[int, str]], dict[Any, Any]] = (
"Unknown",
9999,
    dict.fromkeys(range(256), "�"),
{},
)
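
# Predefined CMap names (possible values of /Encoding for composite fonts)
# mapped to the Python codecs used to approximate them.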
_predefined_cmap: dict[str, str] = {
"/Identity-H": "utf-16-be",
"/Identity-V": "utf-16-be",
"/GB-EUC-H": "gbk",
"/GB-EUC-V": "gbk",
"/GBpc-EUC-H": "gb2312",
"/GBpc-EUC-V": "gb2312",
"/GBK-EUC-H": "gbk",
"/GBK-EUC-V": "gbk",
"/GBK2K-H": "gb18030",
"/GBK2K-V": "gb18030",
"/ETen-B5-H": "cp950",
"/ETen-B5-V": "cp950",
"/ETenms-B5-H": "cp950",
"/ETenms-B5-V": "cp950",
"/UniCNS-UTF16-H": "utf-16-be",
"/UniCNS-UTF16-V": "utf-16-be",
"/UniGB-UTF16-H": "gb18030",
"/UniGB-UTF16-V": "gb18030",
    # CMaps with "-UCS2-" in their name are handled directly in _parse_encoding
}
def get_encoding(
ft: DictionaryObject
) -> tuple[Union[str, dict[int, str]], dict[Any, Any]]:
encoding = _parse_encoding(ft)
map_dict, int_entry = _parse_to_unicode(ft)
    # Apply the rule from PDF ref 1.7 §5.9.1, first bullet:
    # if the cmap is not empty, the encoding should be discarded
    # (here it is turned into the identity for those characters).
    # If encoding is a string, it is expected to be an identity translation.
if isinstance(encoding, dict):
for x in int_entry:
if x <= 255:
encoding[x] = chr(x)
return encoding, map_dict
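
# Resolve the /Encoding entry of a font dictionary into either a codec name
# (str) or a 256-entry code-to-character dict, applying /BaseEncoding and
# /Differences when present; fonts without /Encoding fall back to their
# /BaseFont charset or to "charmap".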
def _parse_encoding(
ft: DictionaryObject
) -> Union[str, dict[int, str]]:
encoding: Union[str, list[str], dict[int, str]] = []
if "/Encoding" not in ft:
if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
encoding = dict(
zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
)
else:
encoding = "charmap"
return encoding
enc: Union[str, DictionaryObject, NullObject] = cast(
Union[str, DictionaryObject, NullObject], ft["/Encoding"].get_object()
)
if isinstance(enc, str):
try:
# already done : enc = NameObject.unnumber(enc.encode()).decode()
# for #xx decoding
if enc in charset_encoding:
encoding = charset_encoding[enc].copy()
elif enc in _predefined_cmap:
encoding = _predefined_cmap[enc]
elif "-UCS2-" in enc:
encoding = "utf-16-be"
else:
raise Exception("not found")
except Exception:
logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
encoding = enc
elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
try:
encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
except Exception:
logger_error(
f"Advanced encoding {encoding} not implemented yet",
__name__,
)
encoding = charset_encoding["/StandardEncoding"].copy()
else:
encoding = charset_encoding["/StandardEncoding"].copy()
if isinstance(enc, DictionaryObject) and "/Differences" in enc:
x: int = 0
o: Union[int, str]
for o in cast(DictionaryObject, enc["/Differences"]):
if isinstance(o, int):
x = o
else: # isinstance(o, str):
try:
if x < len(encoding):
encoding[x] = adobe_glyphs[o] # type: ignore
except Exception:
encoding[x] = o # type: ignore
x += 1
if isinstance(encoding, list):
encoding = dict(zip(range(256), encoding))
return encoding
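
# Parse the font's /ToUnicode CMap (if any) into map_dict, which maps source
# codes to Unicode strings and stores the code length in bytes under key -1,
# and int_entry, the list of source codes as ints. Type1 fonts without
# /ToUnicode fall back to the embedded font program (_type1_alternative).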
def _parse_to_unicode(
ft: DictionaryObject
) -> tuple[dict[Any, Any], list[int]]:
    # map_dict stores all code-to-Unicode translations;
    # map_dict[-1] holds the number of bytes per source code
    map_dict: dict[Any, Any] = {}
    # int_entry lists the cmap source codes as ints, used to correct the encoding
int_entry: list[int] = []
if "/ToUnicode" not in ft:
if ft.get("/Subtype", "") == "/Type1":
return _type1_alternative(ft, map_dict, int_entry)
return {}, []
process_rg: bool = False
process_char: bool = False
multiline_rg: Union[
None, tuple[int, int]
    ] = None  # tuple = (current_char, last_char); see #1285 for an example file
cm = prepare_cm(ft)
for line in cm.split(b"\n"):
process_rg, process_char, multiline_rg = process_cm_line(
line.strip(b" \t"),
process_rg,
process_char,
multiline_rg,
map_dict,
int_entry,
)
return map_dict, int_entry
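
# Find the source code that the encoding or the ToUnicode map translates to
# value_char, e.g. the code actually used for a space in this font.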
def get_actual_str_key(
value_char: str, encoding: Union[str, dict[int, str]], map_dict: dict[Any, Any]
) -> str:
key_dict = {}
if isinstance(encoding, dict):
key_dict = {value: chr(key) for key, value in encoding.items() if value == value_char}
else:
key_dict = {value: key for key, value in map_dict.items() if value == value_char}
return key_dict.get(value_char, value_char)
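
# Normalize the raw /ToUnicode CMap bytes: put every bfchar/bfrange operator on
# its own line and turn every <...> hex token into a space-delimited word, so
# the stream can be parsed line by line below.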
def prepare_cm(ft: DictionaryObject) -> bytes:
tu = ft["/ToUnicode"]
cm: bytes
if isinstance(tu, StreamObject):
cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
else: # if (tu is None) or cast(str, tu).startswith("/Identity"):
# the full range 0000-FFFF will be processed
cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
if isinstance(cm, str):
cm = cm.encode()
    # Normalize cm first: PDFs printed to PDF from Word may be missing
    # line breaks between operators.
cm = (
cm.strip()
.replace(b"beginbfchar", b"\nbeginbfchar\n")
.replace(b"endbfchar", b"\nendbfchar\n")
.replace(b"beginbfrange", b"\nbeginbfrange\n")
.replace(b"endbfrange", b"\nendbfrange\n")
.replace(b"<<", b"\n{\n") # text between << and >> not used but
.replace(b">>", b"\n}\n") # some solution to find it back
)
ll = cm.split(b"<")
for i in range(len(ll)):
j = ll[i].find(b">")
if j >= 0:
if j == 0:
# string is empty: stash a placeholder here (see below)
# see https://github.com/py-pdf/pypdf/issues/1111
content = b"."
else:
content = ll[i][:j].replace(b" ", b"")
ll[i] = content + b" " + ll[i][j + 1 :]
cm = (
(b" ".join(ll))
.replace(b"[", b" [ ")
.replace(b"]", b" ]\n ")
.replace(b"\r", b"\n")
)
return cm
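
# Line-by-line state machine over the normalized CMap: begin/endbfrange and
# begin/endbfchar toggle the two flags, and data lines are dispatched to
# parse_bfrange or parse_bfchar accordingly.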
def process_cm_line(
line: bytes,
process_rg: bool,
process_char: bool,
multiline_rg: Union[None, tuple[int, int]],
map_dict: dict[Any, Any],
int_entry: list[int],
) -> tuple[bool, bool, Union[None, tuple[int, int]]]:
if line == b"" or line[0] == 37: # 37 = %
return process_rg, process_char, multiline_rg
line = line.replace(b"\t", b" ")
if b"beginbfrange" in line:
process_rg = True
elif b"endbfrange" in line:
process_rg = False
elif b"beginbfchar" in line:
process_char = True
elif b"endbfchar" in line:
process_char = False
elif process_rg:
try:
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
except binascii.Error as error:
logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
elif process_char:
parse_bfchar(line, map_dict, int_entry)
return process_rg, process_char, multiline_rg
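
# A bfrange line is either "<srcLo> <srcHi> <dstLo>" (consecutive destinations)
# or "<srcLo> <srcHi> [<dst1> ... <dstN>]" (one destination per code).
# multiline_rg carries (current_char, last_char) across lines when the
# destination array spans several lines (see #1285).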
def parse_bfrange(
line: bytes,
map_dict: dict[Any, Any],
int_entry: list[int],
multiline_rg: Union[None, tuple[int, int]],
) -> Union[None, tuple[int, int]]:
lst = [x for x in line.split(b" ") if x]
closure_found = False
if multiline_rg is not None:
fmt = b"%%0%dX" % (map_dict[-1] * 2)
a = multiline_rg[0] # a, b not in the current line
b = multiline_rg[1]
for sq in lst:
if sq == b"]":
closure_found = True
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else:
a = int(lst[0], 16)
b = int(lst[1], 16)
nbi = max(len(lst[0]), len(lst[1]))
map_dict[-1] = ceil(nbi / 2)
fmt = b"%%0%dX" % (map_dict[-1] * 2)
if lst[2] == b"[":
for sq in lst[3:]:
if sq == b"]":
closure_found = True
break
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
else: # case without list
c = int(lst[2], 16)
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
closure_found = True
while a <= b:
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
int_entry.append(a)
a += 1
c += 1
return None if closure_found else (a, b)
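
# A bfchar line holds "<srcCode> <dstString>" pairs; the "." placeholder
# inserted by prepare_cm stands for an empty hex string and maps to "".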
def parse_bfchar(line: bytes, map_dict: dict[Any, Any], int_entry: list[int]) -> None:
lst = [x for x in line.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 1:
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
try:
map_to = unhexlify(lst[1]).decode(
"charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
                )  # destinations shorter than 4 hex digits are single-byte; decode with charmap
except BinasciiError as exception:
logger_warning(f"Got invalid hex string: {exception!s} ({lst[1]!r})", __name__)
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = map_to
int_entry.append(int(lst[0], 16))
lst = lst[2:]
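
# Build a map from character to glyph width: CID fonts use /DW and /W from the
# descendant font, simple fonts use /Widths together with /FirstChar and
# /LastChar, and known core fonts fall back to CORE_FONT_METRICS. The "default"
# key holds the fallback width for characters without an explicit entry.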
def build_font_width_map(
ft: DictionaryObject, default_font_width: float
) -> dict[Any, float]:
font_width_map: dict[Any, float] = {}
st: int = 0
en: int = 0
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
# §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
# Widths for a CIDFont are defined using the DW and W entries.
# DW2 and W2 are for vertical use. Vertical type is not implemented.
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
if "/DW" in ft1:
font_width_map["default"] = cast(float, ft1["/DW"].get_object())
else:
font_name = str(ft["/BaseFont"]).removeprefix("/")
if font_name in CORE_FONT_METRICS:
# This applies to test_tounicode_is_identity, which has a CID CourierNew font that
# apparently does not specify the width of a space.
font_width_map["default"] = CORE_FONT_METRICS[font_name].character_widths[" "] * 2
else:
font_width_map["default"] = default_font_width
if "/W" in ft1:
w = ft1["/W"].get_object()
else:
w = []
while len(w) > 0:
st = w[0] if isinstance(w[0], int) else w[0].get_object()
second = w[1].get_object()
if isinstance(second, int):
# C_first C_last same_W
en = second
width = w[2].get_object()
if not isinstance(width, (int, float)):
logger_warning(f"Expected numeric value for width, got {width}. Ignoring it.", __name__)
w = w[3:]
continue
for c_code in range(st, en + 1):
font_width_map[chr(c_code)] = width
w = w[3:]
elif isinstance(second, list):
# Starting_C [W1 W2 ... Wn]
c_code = st
for ww in second:
width = ww.get_object()
font_width_map[chr(c_code)] = width
c_code += 1
w = w[2:]
else:
logger_warning(
"unknown widths : \n" + (ft1["/W"]).__repr__(),
__name__,
)
break
elif "/Widths" in ft:
w = cast(ArrayObject, ft["/Widths"].get_object())
if "/FontDescriptor" in ft and "/MissingWidth" in cast(
DictionaryObject, ft["/FontDescriptor"]
):
font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
else:
            # use the average of the known widths as the default character width
m = 0
cpt = 0
for xx in w:
xx = xx.get_object()
if xx > 0:
m += xx
cpt += 1
font_width_map["default"] = m / max(1, cpt)
st = cast(int, ft["/FirstChar"])
en = cast(int, ft["/LastChar"])
for c_code in range(st, en + 1):
try:
width = w[c_code - st].get_object()
font_width_map[chr(c_code)] = width
except (IndexError, KeyError):
# The PDF structure is invalid. The array is too small
# for the specified font width.
pass
else:
font_name = str(ft["/BaseFont"]).removeprefix("/")
if font_name in CORE_FONT_METRICS:
font_width_map = cast(dict[str, float], CORE_FONT_METRICS[font_name].character_widths)
font_width_map["default"] = font_width_map[" "] * 2
if is_null_or_none(font_width_map.get("default")):
font_width_map["default"] = 0
return font_width_map
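
# Width of the space character, used as the spacing criterion by callers;
# falls back to half of the default width when the space has no usable width.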
def compute_space_width(
font_width_map: dict[Any, float], space_char: str
) -> float:
try:
sp_width = font_width_map[space_char]
if sp_width == 0:
raise ValueError("Zero width")
except (KeyError, ValueError):
sp_width = (
font_width_map["default"] / 2.0
        )  # when falling back to the default, assume the space is only half that width
return sp_width
def compute_font_width(
font_width_map: dict[Any, float],
char: str
) -> float:
char_width: float = 0.0
try:
char_width = font_width_map[char]
except KeyError:
        char_width = font_width_map["default"]
return char_width
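
# Fallback for Type1 fonts without /ToUnicode: read the clear-text portion of
# the embedded font program (before "eexec") and scan its /Encoding section for
# "dup <code> /<glyphname> put" entries, mapping glyph names through
# adobe_glyphs (or /uniXXXX names) to characters.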
def _type1_alternative(
ft: DictionaryObject,
map_dict: dict[Any, Any],
int_entry: list[int],
) -> tuple[dict[Any, Any], list[int]]:
if "/FontDescriptor" not in ft:
return map_dict, int_entry
ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
if is_null_or_none(ft_desc):
return map_dict, int_entry
assert ft_desc is not None, "mypy"
txt = ft_desc.get_object().get_data()
txt = txt.split(b"eexec\n")[0] # only clear part
txt = txt.split(b"/Encoding")[1] # to get the encoding part
lines = txt.replace(b"\r", b"\n").split(b"\n")
for li in lines:
if li.startswith(b"dup"):
words = [_w for _w in li.split(b" ") if _w != b""]
if len(words) > 3 and words[3] != b"put":
continue
try:
i = int(words[1])
except ValueError: # pragma: no cover
continue
try:
v = adobe_glyphs[words[2].decode()]
except KeyError:
if words[2].startswith(b"/uni"):
try:
v = chr(int(words[2][4:], 16))
except ValueError: # pragma: no cover
continue
else:
continue
map_dict[chr(i)] = v
int_entry.append(i)
return map_dict, int_entry