578 lines
21 KiB
Python
578 lines
21 KiB
Python
"""Functions to convert an image XObject to an image"""
|
|
|
|
import sys
|
|
from io import BytesIO
|
|
from typing import Any, Literal, Optional, Union, cast
|
|
|
|
from ._utils import check_if_whitespace_only, logger_warning
|
|
from .constants import ColorSpaces, StreamAttributes
|
|
from .constants import FilterTypes as FT
|
|
from .constants import ImageAttributes as IA
|
|
from .errors import EmptyImageDataError, PdfReadError
|
|
from .generic import (
|
|
ArrayObject,
|
|
DecodedStreamObject,
|
|
EncodedStreamObject,
|
|
NullObject,
|
|
TextStringObject,
|
|
is_null_or_none,
|
|
)
|
|
|
|
if sys.version_info[:2] >= (3, 10):
|
|
from typing import TypeAlias
|
|
else:
|
|
from typing_extensions import TypeAlias
|
|
|
|
|
|
try:
|
|
from PIL import Image, UnidentifiedImageError
|
|
except ImportError:
|
|
raise ImportError(
|
|
"pillow is required to do image extraction. "
|
|
"It can be installed via 'pip install pypdf[image]'"
|
|
)
|
|
|
|
# Mode identifiers handled by this module. "2bits" and "4bits" are internal
# markers: _handle_flate() expands such data with bits2byte() and then treats
# it as mode "P". The empty string means that no mode could be determined.
mode_str_type: TypeAlias = Literal[
    "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK"
]

# Upper bound for the recursion depth of _get_image_mode(); exceeding it
# raises PdfReadError.
MAX_IMAGE_MODE_NESTING_DEPTH: int = 10
|
|
|
|
|
|
def _get_image_mode(
    color_space: Union[str, list[Any], Any],
    color_components: int,
    prev_mode: mode_str_type,
    depth: int = 0,
) -> tuple[mode_str_type, bool]:
    """
    Determine the image mode matching a PDF color space.

    Array color spaces (/Indexed, /Separation, /DeviceN) are resolved by
    recursing into their base/alternate color space. /CalRGB and /CalGray are
    handled like the corresponding /Device color space, and /ICCBased uses
    the /N and /Alternate entries of the ICC profile.

    Args:
        color_space: Color space name, or array describing the color space.
        color_components: Number of color components; selects a mode by
            position when the color space name is not recognised.
        prev_mode: Mode returned when neither the color space name nor the
            component count identifies a mode.
        depth: Current recursion depth.

    Returns:
        Image mode, not taking into account mask (transparency).
        ColorInversion is required (like for some DeviceCMYK).

    Raises:
        PdfReadError: If color spaces are nested deeper than
            MAX_IMAGE_MODE_NESTING_DEPTH, or if the color space is neither a
            string nor a list.

    """
    if depth > MAX_IMAGE_MODE_NESTING_DEPTH:
        raise PdfReadError(
            "Color spaces nested too deeply. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH."
        )
    if is_null_or_none(color_space):
        return "", False
    color_space_str: str = ""
    if isinstance(color_space, str):
        color_space_str = color_space
    elif not isinstance(color_space, list):
        raise PdfReadError(
            "Cannot interpret color space", color_space
        )  # pragma: no cover
    elif not color_space:
        return "", False
    elif color_space[0].startswith("/Cal"):  # /CalRGB or /CalGray
        color_space_str = "/Device" + color_space[0][4:]
    elif color_space[0] == "/ICCBased":
        icc_profile = color_space[1].get_object()
        color_components = cast(int, icc_profile["/N"])
        color_space_str = icc_profile.get("/Alternate", "")
    elif color_space[0] == "/Indexed":
        color_space_str = color_space[1].get_object()
        mode, invert_color = _get_image_mode(
            color_space_str, color_components, prev_mode, depth + 1
        )
        # Indexed images over a multi-component base are palette images.
        if mode in ("RGB", "CMYK"):
            mode = "P"
        return mode, invert_color
    elif color_space[0] == "/Separation":
        color_space_str = color_space[2].get_object()
        mode, invert_color = _get_image_mode(
            color_space_str, color_components, prev_mode, depth + 1
        )
        # Separation images always request color inversion.
        return mode, True
    elif color_space[0] == "/DeviceN":
        original_color_space = color_space
        color_components = len(color_space[1])
        color_space_str = color_space[2].get_object()
        if color_space_str == "/DeviceCMYK" and color_components == 1:
            if original_color_space[1][0] != "/Black":
                logger_warning(
                    f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team",
                    __name__,
                )
            return "L", True
        mode, invert_color = _get_image_mode(
            color_space_str, color_components, prev_mode, depth + 1
        )
        return mode, invert_color

    mode_map: dict[str, mode_str_type] = {
        "1bit": "1",  # must be zeroth position: color_components may index the values
        "/DeviceGray": "L",  # must be first position: color_components may index the values
        "palette": "P",  # must be second position: color_components may index the values
        "/DeviceRGB": "RGB",  # must be third position: color_components may index the values
        "/DeviceCMYK": "CMYK",  # must be fourth position: color_components may index the values
        "2bit": "2bits",
        "4bit": "4bits",
    }

    mode: Optional[mode_str_type] = mode_map.get(color_space_str)
    if not mode:
        # All values of mode_map are non-empty, so an unguarded positional
        # lookup either succeeds or raises IndexError and never reaches the
        # prev_mode fallback. Check the bounds so that unsupported component
        # counts fall back to prev_mode as intended.
        modes_by_position = list(mode_map.values())
        if 0 <= color_components < len(modes_by_position):
            mode = modes_by_position[color_components]
        else:
            mode = prev_mode

    return mode, mode == "CMYK"
|
|
|
|
|
|
def bits2byte(data: bytes, size: tuple[int, int], bits: int) -> bytes:
    """
    Expand packed samples of ``bits`` bits each into one byte per sample.

    Samples are read from the most significant bits of each source byte
    first. Every row starts on a fresh source byte: unused bits at the end
    of a row are skipped.

    Args:
        data: Packed sample data.
        size: Image size as (width, height).
        bits: Number of bits per sample.

    Returns:
        ``width * height`` bytes, one sample value per byte.

    """
    width = size[0]
    height = size[1]
    first_shift = 8 - bits
    sample_mask = (1 << bits) - 1
    unpacked = bytearray(width * height)

    source_pos = 0
    shift = first_shift
    target_pos = 0
    for _row in range(height):
        # A partially consumed byte at the end of the previous row is dropped.
        if shift != first_shift:
            source_pos += 1
            shift = first_shift
        for _column in range(width):
            unpacked[target_pos] = (data[source_pos] >> shift) & sample_mask
            target_pos += 1
            shift -= bits
            if shift < 0:
                source_pos += 1
                shift = first_shift
    return bytes(unpacked)
|
|
|
|
|
|
def _extended_image_from_bytes(
    mode: str, size: tuple[int, int], data: bytes
) -> Image.Image:
    """
    Build an image from raw bytes, repeating bytes when Pillow rejects them.

    If ``Image.frombytes`` raises ValueError and the data length is a
    multiple of the pixel count, every byte is repeated so that the data
    matches ``len(mode)`` bytes per pixel, and the conversion is retried.

    Args:
        mode: Pillow image mode.
        size: Image size as (width, height).
        data: Raw image data.

    Returns:
        The image created by Pillow.

    Raises:
        EmptyImageDataError: If Pillow rejects the data and the data is empty.
        ValueError: If Pillow rejects the data and it cannot be expanded.

    """
    try:
        return Image.frombytes(mode, size, data)
    except ValueError as exc:
        pixel_count = size[0] * size[1]
        byte_count = len(data)
        if byte_count == 0:
            raise EmptyImageDataError(
                "Data is 0 bytes, cannot process an image from empty data."
            ) from exc
        if byte_count % pixel_count != 0:
            raise exc
        # Number of times each source byte has to be repeated.
        repeat = int(pixel_count * len(mode) / byte_count)
        expanded = b"".join(bytes((value,) * repeat) for value in data)
        return Image.frombytes(mode, size, expanded)
|
|
|
|
|
|
def __handle_flate__indexed(color_space: ArrayObject) -> tuple[Any, Any, Any, Any]:
    """
    Split an indexed color space array into its components.

    Two layouts are accepted: the regular array with four entries, and an
    array with three entries whose second entry holds ``base`` and ``hival``
    joined by a NUL character.

    Args:
        color_space: The color space array to split.

    Returns:
        Tuple of (color space name, base, hival, lookup).

    Raises:
        PdfReadError: If the array matches neither of the accepted layouts.

    """
    count = len(color_space)
    if count == 4:
        name, base, hival, lookup = (value.get_object() for value in color_space)
        return name, base, hival, lookup

    # Deal with strange AutoDesk files where `base` and `hival` look like this:
    # /DeviceRGB\x00255
    # The length is validated before indexing and the entry is checked to be
    # a string, so that malformed arrays raise PdfReadError below instead of
    # IndexError or TypeError.
    if count == 3:
        element1 = color_space[1]
        element1 = element1 if isinstance(element1, str) else element1.get_object()
        if isinstance(element1, str) and "\x00" in element1:
            name, lookup = color_space[0].get_object(), color_space[2].get_object()
            base, hival = element1.split("\x00")
            hival = int(hival)
            return name, base, hival, lookup
    raise PdfReadError(f"Expected color space with 4 values, got {count}: {color_space}")
|
|
|
|
|
|
def _handle_flate(
    size: tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
    obj_as_text: str,
) -> tuple[Image.Image, str, str, bool]:
    """
    Process image encoded in flateEncode

    The raw data is loaded with the given mode. For indexed color spaces the
    lookup table is applied afterwards; for /ICCBased color spaces the data
    is reloaded if the ICC profile leads to a different mode.

    Args:
        size: Image size as (width, height).
        data: Decoded image data.
        mode: Image mode; "2bits" and "4bits" data is expanded to one byte
            per sample and handled as "P".
        color_space: Color space name or color space array.
        colors: Number of color components, used for /ICCBased color spaces.
        obj_as_text: Description of the image object for warnings.

    Returns img, image_format, extension, color inversion
    (the color inversion flag returned by this function is always False)
    """
    extension = ".png"  # mime_type: "image/png"
    image_format = "PNG"
    # Only assigned for indexed color space arrays.
    lookup: Any
    base: Any
    hival: Any
    if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed":
        # From here on, color_space is the first entry of the array.
        color_space, base, hival, lookup = __handle_flate__indexed(color_space)
    if mode == "2bits":
        mode = "P"
        data = bits2byte(data, size, 2)
    elif mode == "4bits":
        mode = "P"
        data = bits2byte(data, size, 4)
    img = _extended_image_from_bytes(mode, size, data)
    if color_space == "/Indexed":
        # Normalize the lookup table to bytes.
        if isinstance(lookup, (EncodedStreamObject, DecodedStreamObject)):
            lookup = lookup.get_data()
        if isinstance(lookup, TextStringObject):
            lookup = lookup.original_bytes
        if isinstance(lookup, str):
            lookup = lookup.encode()
        try:
            # nb: bytes per lookup entry, conv: mode the image is converted
            # to before applying the palette, mode: raw mode of the lookup.
            nb, conv, mode = {  # type: ignore
                "1": (0, "", ""),
                "L": (1, "P", "L"),
                "P": (0, "", ""),
                "RGB": (3, "P", "RGB"),
                "CMYK": (4, "P", "CMYK"),
            }[_get_image_mode(base, 0, "")[0]]
        except KeyError:  # pragma: no cover
            logger_warning(
                f"Base {base} not coded please share the pdf file with pypdf dev team",
                __name__,
            )
            lookup = None
        else:
            if img.mode == "1":
                # Two values ("high" and "low").
                expected_count = 2 * nb
                actual_count = len(lookup)
                if actual_count != expected_count:
                    if actual_count < expected_count:
                        logger_warning(
                            f"Not enough lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__
                        )
                        # Pad the missing values with zero bytes.
                        lookup += bytes([0] * (expected_count - actual_count))
                    elif not check_if_whitespace_only(lookup[expected_count:]):
                        logger_warning(
                            f"Too many lookup values: Expected {expected_count}, got {actual_count}.",
                            __name__
                        )
                    lookup = lookup[:expected_count]
                colors_arr = [lookup[:nb], lookup[nb:]]
                # Replace every pixel by the first or second lookup entry.
                arr = b"".join(
                    b"".join(
                        colors_arr[1 if img.getpixel((x, y)) > 127 else 0]
                        for x in range(img.size[0])
                    )
                    for y in range(img.size[1])
                )
                img = Image.frombytes(mode, img.size, arr)
            else:
                img = img.convert(conv)
                if len(lookup) != (hival + 1) * nb:
                    logger_warning(f"Invalid Lookup Table in {obj_as_text}", __name__)
                    lookup = None
                elif mode == "L":
                    # gray lookup does not work: it is converted to a similar RGB lookup
                    lookup = b"".join([bytes([b, b, b]) for b in lookup])
                    mode = "RGB"
                # TODO: https://github.com/py-pdf/pypdf/pull/2039
                # this is a work around until PIL is able to process CMYK images
                elif mode == "CMYK":
                    _rgb = []
                    for _c, _m, _y, _k in (
                        lookup[n : n + 4] for n in range(0, 4 * (len(lookup) // 4), 4)
                    ):
                        _r = int(255 * (1 - _c / 255) * (1 - _k / 255))
                        _g = int(255 * (1 - _m / 255) * (1 - _k / 255))
                        _b = int(255 * (1 - _y / 255) * (1 - _k / 255))
                        _rgb.append(bytes((_r, _g, _b)))
                    lookup = b"".join(_rgb)
                    mode = "RGB"
                if lookup is not None:
                    img.putpalette(lookup, rawmode=mode)
            img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
    elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
        # Table 65 - Additional Entries Specific to an ICC Profile Stream Dictionary
        mode2 = _get_image_mode(color_space, colors, mode)[0]
        if mode != mode2:
            img = Image.frombytes(mode2, size, data)  # reloaded as mode may have changed
    if mode == "CMYK":
        extension = ".tif"
        image_format = "TIFF"
    return img, image_format, extension, False
|
|
|
|
|
|
def _handle_jpx(
    size: tuple[int, int],
    data: bytes,
    mode: mode_str_type,
    color_space: str,
    colors: int,
) -> tuple[Image.Image, str, str, bool]:
    """
    Process image encoded as JPEG 2000 (JPXDecode)

    The data is opened with Pillow's JPEG2000 reader and brought to the mode
    derived from the color space. If the color space yields no mode, the
    mode of the decoded image is used.

    Returns img, image_format, extension, inversion
    """
    decoded = Image.open(BytesIO(data), formats=("JPEG2000",))
    mode, invert_color = _get_image_mode(color_space, colors, mode)
    if mode == "":
        mode = cast(mode_str_type, decoded.mode)
        invert_color = mode in ("CMYK",)
    if decoded.mode == "RGBA" and mode == "RGB":
        mode = "RGBA"

    # we need to convert to the good mode
    mode_pair = {decoded.mode, mode}  # unordered comparison of both modes
    if decoded.mode == mode or mode_pair == {"L", "P"}:
        # L and P are indexed modes which should not be changed.
        img = decoded
    elif mode_pair == {"RGBA", "CMYK"}:
        # RGBA / CMYK are 4bytes encoding where
        # the encoding should be corrected
        img = Image.frombytes(mode, decoded.size, decoded.tobytes())
    else:  # pragma: no cover
        img = decoded.convert(mode)

    # CMYK conversion
    # https://stackoverflow.com/questions/38855022/conversion-from-cmyk-to-rgb-with-pillow-is-different-from-that-of-photoshop
    # not implemented for the moment as I need to get properly the ICC
    if img.mode == "CMYK":
        img = img.convert("RGB")

    # mime_type: "image/x-jp2"
    return img, "JPEG2000", ".jp2", invert_color
|
|
|
|
|
|
def _apply_decode(
    img: Image.Image,
    x_object_obj: dict[str, Any],
    lfilters: FT,
    color_space: Union[str, list[Any], Any],
    invert_color: bool,
) -> Image.Image:
    """
    Apply the decode array of an image XObject to the image.

    Args:
        img: The image to process.
        x_object_obj: The image XObject; its IA.DECODE entry is used if present.
        lfilters: The filter of the image data.
        color_space: The color space of the image.
        invert_color: Whether color inversion has been requested.

    Returns:
        The image itself if no remapping is needed, otherwise a new image
        with the lookup table derived from the decode array applied.

    """
    # CMYK image and other color spaces without decode
    # requires reverting scale (cf p243,2§ last sentence)
    inverted_by_default = (
        img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)
    ) or (invert_color and img.mode == "L")
    default_decode = (
        [1.0, 0.0] * len(img.getbands()) if inverted_by_default else None
    )
    decode = x_object_obj.get(IA.DECODE, default_decode)

    if isinstance(color_space, ArrayObject):
        family = color_space[0].get_object()
        if family == "/Indexed":
            decode = None  # decode is meaningless if Indexed
        elif family == "/Separation":
            decode = [1.0, 0.0] * len(img.getbands())

    if decode is None:
        return img
    # The identity decode array (0, 1, 0, 1, ...) needs no processing.
    if all(decode[position] == position % 2 for position in range(len(decode))):
        return img

    # One table of 256 entries per (dmin, dmax) pair.
    lut: list[int] = []
    for position in range(0, len(decode), 2):
        dmin = decode[position]
        dmax = decode[position + 1]
        for level in range(256):
            lut.append(round(255.0 * (level / 255.0 * (dmax - dmin) + dmin)))
    return img.point(lut)
|
|
|
|
|
|
def _get_mode_and_invert_color(
    x_object_obj: dict[str, Any], colors: int, color_space: Union[str, list[Any], Any]
) -> tuple[mode_str_type, bool]:
    """
    Determine the image mode and color inversion flag of an image XObject.

    Images with fewer than 8 bits per component get their mode from the bit
    depth; all other images get it from the color space.

    Args:
        x_object_obj: The image XObject.
        colors: Number of color components.
        color_space: The color space of the image.

    Returns:
        Tuple of (image mode, color inversion required).

    """
    if (
        IA.COLOR_SPACE in x_object_obj
        and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB
    ):
        # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
        # NOTE(review): this value is always replaced by one of the returns
        # below - confirm whether the check is still needed.
        mode: mode_str_type = "RGB"

    bits_per_component = x_object_obj.get("/BitsPerComponent", 8)
    if bits_per_component < 8:
        mode, invert_color = _get_image_mode(f"{bits_per_component}bit", 0, "")
        return mode, invert_color

    # A single component in a color space not mentioning "Gray" is passed on
    # as two components.
    single_non_gray_component = (
        colors == 1
        and not is_null_or_none(color_space)
        and "Gray" not in color_space
    )
    component_count = 2 if single_non_gray_component else colors
    mode, invert_color = _get_image_mode(color_space, component_count, "")
    return mode, invert_color
|
|
|
|
|
|
def _xobj_to_image(
    x_object: dict[str, Any],
    pillow_parameters: Union[dict[str, Any], None] = None
) -> tuple[Optional[str], bytes, Any]:
    """
    Convert an image XObject to encoded image data and a Pillow image.

    Users need to have the pillow package installed.

    It's unclear if pypdf will keep this function here, hence it's private.
    It might get removed at any point.

    Args:
        x_object: The image XObject. Its size, color space, filter and soft
            mask entries are read, and the image data is obtained from its
            ``get_data()`` method.
        pillow_parameters: parameters provided to Pillow Image.save() method,
            cf. <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save>
            The passed dictionary is not modified.

    Returns:
        Tuple[file extension, bytes, PIL.Image.Image]
        The image is None if the saved data cannot be loaded again.

    Raises:
        PdfReadError: If no filter applies and no image mode could be
            determined.

    """
    def _apply_alpha(
        img: Image.Image,
        x_object: dict[str, Any],
        obj_as_text: str,
        image_format: str,
        extension: str,
    ) -> tuple[Image.Image, str, str]:
        # Adds the soft mask of the XObject (if any) as alpha channel and
        # switches to an output format supporting transparency.
        # Returns image, extension, image format (in this order).
        alpha = None
        if IA.S_MASK in x_object:  # add alpha channel
            alpha = _xobj_to_image(x_object[IA.S_MASK])[2]
            if img.size != alpha.size:
                logger_warning(
                    f"image and mask size not matching: {obj_as_text}", __name__
                )
            else:
                # TODO: implement mask
                if alpha.mode != "L":
                    alpha = alpha.convert("L")
                if img.mode == "P":
                    img = img.convert("RGB")
                elif img.mode == "1":
                    img = img.convert("L")
                img.putalpha(alpha)
            if "JPEG" in image_format:
                image_format = "JPEG2000"
                extension = ".jp2"
            else:
                image_format = "PNG"
                extension = ".png"
        return img, extension, image_format

    # For error reporting
    # NOTE(review): the first branch is only selected when x_object is None,
    # where the attribute access cannot succeed - confirm the intended check.
    obj_as_text = (
        x_object.indirect_reference.__repr__()
        if x_object is None  # pragma: no cover
        else x_object.__repr__()
    )

    # Get size and data
    size = (cast(int, x_object[IA.WIDTH]), cast(int, x_object[IA.HEIGHT]))
    data = x_object.get_data()  # type: ignore
    if isinstance(data, str):  # pragma: no cover
        data = data.encode()
    # Drop a single trailing newline which is not part of the pixel data.
    # A declared width or height of 0 would make the modulo raise
    # ZeroDivisionError, although several filters below do not need the size
    # at all; the check is skipped in this case.
    pixel_count = size[0] * size[1]
    if pixel_count and len(data) % pixel_count == 1 and data[-1] == 0x0A:  # ie. '\n'
        data = data[:-1]

    # Get color properties
    colors = x_object.get("/Colors", 1)
    color_space: Any = x_object.get("/ColorSpace", NullObject()).get_object()
    if isinstance(color_space, list) and len(color_space) == 1:
        color_space = color_space[0].get_object()

    mode, invert_color = _get_mode_and_invert_color(x_object, colors, color_space)

    # Get filters
    filters = x_object.get(StreamAttributes.FILTER, NullObject()).get_object()
    # Only the last filter decides how the data is interpreted.
    lfilters = filters[-1] if isinstance(filters, list) else filters
    decode_parms = x_object.get(StreamAttributes.DECODE_PARMS, None)
    if decode_parms and isinstance(decode_parms, (tuple, list)):
        decode_parms = decode_parms[0]
    else:
        decode_parms = {}
    if not isinstance(decode_parms, dict):
        decode_parms = {}

    extension = None
    if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE):
        img, image_format, extension, _ = _handle_flate(
            size,
            data,
            mode,
            color_space,
            colors,
            obj_as_text,
        )
    elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE):
        # I'm not sure if the following logic is correct.
        # There might not be any relationship between the filters and the
        # extension
        if lfilters == FT.LZW_DECODE:
            image_format = "TIFF"
            extension = ".tiff"  # mime_type = "image/tiff"
        else:
            image_format = "PNG"
            extension = ".png"  # mime_type = "image/png"
        try:
            img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
        except UnidentifiedImageError:
            img = _extended_image_from_bytes(mode, size, data)
    elif lfilters == FT.DCT_DECODE:
        img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
        # invert_color kept unchanged
    elif lfilters == FT.JPX_DECODE:
        img, image_format, extension, invert_color = _handle_jpx(
            size, data, mode, color_space, colors
        )
    elif lfilters == FT.CCITT_FAX_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("TIFF",)),
            "TIFF",
            ".tiff",
            False,
        )
    elif lfilters == FT.JBIG2_DECODE:
        img, image_format, extension, invert_color = (
            Image.open(BytesIO(data), formats=("PNG", "PPM")),
            "PNG",
            ".png",
            False,
        )
    elif mode == "CMYK":
        img, image_format, extension, invert_color = (
            _extended_image_from_bytes(mode, size, data),
            "TIFF",
            ".tif",
            False,
        )
    elif mode == "":
        raise PdfReadError(f"ColorSpace field not found in {x_object}")
    else:
        img, image_format, extension, invert_color = (
            _extended_image_from_bytes(mode, size, data),
            "PNG",
            ".png",
            False,
        )

    img = _apply_decode(img, x_object, lfilters, color_space, invert_color)
    img, extension, image_format = _apply_alpha(
        img, x_object, obj_as_text, image_format, extension
    )

    # Work on a copy: the "quality" default added below must not leak into
    # the dictionary owned by the caller.
    pillow_parameters = {} if pillow_parameters is None else dict(pillow_parameters)
    # Preserve JPEG image quality - see issue #3515.
    if image_format == "JPEG":
        # This prevents: Cannot use 'keep' when original image is not a JPEG:
        # "JPEG" is the value of PIL.JpegImagePlugin.JpegImageFile.format
        img.format = "JPEG"  # type: ignore[misc]
        if "quality" not in pillow_parameters:
            pillow_parameters["quality"] = "keep"

    # Save image to bytes
    img_byte_arr = BytesIO()
    try:
        img.save(img_byte_arr, format=image_format, **pillow_parameters)
    except OSError:  # pragma: no cover  # covered with pillow 10.3
        # in case of we convert to RGBA and then to PNG
        img1 = img.convert("RGBA")
        image_format = "PNG"
        extension = ".png"
        img_byte_arr = BytesIO()
        img1.save(img_byte_arr, format=image_format)
    data = img_byte_arr.getvalue()

    try:  # temporary try/except until other fixes of images
        img = Image.open(BytesIO(data))
    except Exception as exception:
        logger_warning(f"Failed loading image: {exception}", __name__)
        img = None  # type: ignore
    return extension, data, img
|