315 lines
12 KiB
Python
315 lines
12 KiB
Python
# Copyright (c) 2024, pypdf contributors
|
|
# All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are
|
|
# met:
|
|
#
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
# this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
# and/or other materials provided with the distribution.
|
|
# * The name of the author may not be used to endorse or promote products
|
|
# derived from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
import logging
|
|
from io import BytesIO
|
|
from typing import IO
|
|
|
|
from .._utils import (
|
|
WHITESPACES,
|
|
WHITESPACES_AS_BYTES,
|
|
StreamType,
|
|
logger_warning,
|
|
read_non_whitespace,
|
|
)
|
|
from ..errors import PdfReadError
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Inline images are intended for small images only (4096 bytes or less).
# The buffer is twice that size to tolerate files which exceed this limit.
|
|
BUFFER_SIZE = 8192
|
|
|
|
|
|
def _check_end_image_marker(stream: StreamType) -> bool:
    """
    Check whether the next token in the stream is the ``EI`` operator.

    Leading whitespace is skipped using ``read_non_whitespace``. Afterwards,
    the stream is moved back by exactly the number of bytes inspected for the
    token, so a following read starts with the inspected token again.

    Args:
        stream: Seekable stream to inspect.

    Returns:
        True if the token is ``EI`` followed by either a whitespace character
        or the end of the stream, False otherwise.
    """
    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Rewind by the number of bytes actually collected for the token. Close to
    # the end of the stream fewer than three bytes are available, and a fixed
    # three-byte rewind would step back past the start of the token.
    # NOTE(review): assumes `read_non_whitespace` returns only the first
    # non-whitespace byte (or nothing at the end of the stream) - confirm.
    stream.seek(-len(ei_tok), 1)
    return ei_tok[:2] == b"EI" and (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES)
|
|
|
|
|
|
def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes:
    """
    Extract the hex-encoded stream of an inline image.

    Data is collected up to and including the ``>`` delimiter. As a backup,
    when an ``EI`` shows up in a buffer without any ``>``, the data in front of
    this ``EI`` (without the whitespace directly preceding it) is used instead.

    The stream will be moved onto the EI.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The collected image data.

    Raises:
        PdfReadError: If the stream ends before a delimiter was found, or if
            the data is not followed by the ``EI`` operator.
    """
    chunks = []
    while True:
        block = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not block:
            raise PdfReadError("Unexpected end of stream")
        block_length = len(block)

        delimiter_index = block.find(b">")
        if delimiter_index != -1:
            # Keep everything including the delimiter and move the stream
            # directly behind it.
            keep = delimiter_index + 1
            chunks.append(block[:keep])
            stream.seek(keep - block_length, 1)
            break

        ei_index = block.find(b"EI")
        if ei_index != -1:
            # Go to the byte in front of the EI and walk backwards over
            # whitespace, shrinking the kept data accordingly.
            stream.seek(ei_index - 1 - block_length, 1)
            previous = stream.read(1)
            while previous in WHITESPACES:
                stream.seek(-2, 1)
                previous = stream.read(1)
                ei_index -= 1
            chunks.append(block[:ei_index])
            break

        if block_length == 2:
            # Only the two bytes held back by the previous round are left.
            raise PdfReadError("Unexpected end of stream")

        # Neither > nor EI found: hold back the last two bytes and read them
        # again in the next round.
        chunks.append(block[:-2])
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return b"".join(chunks)
|
|
|
|
|
|
def extract_inline__ascii85_decode(stream: StreamType) -> bytes:
    """
    Extract the A85 (ASCII85) stream of an inline image.

    Data is collected up to and including the ``~>`` delimiter.

    The stream will be moved onto the EI.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The collected image data, ending with ``~>``.

    Raises:
        PdfReadError: If the stream ends before the delimiter was found, or if
            the data is not followed by the ``EI`` operator.
    """
    chunks = []
    while True:
        block = read_non_whitespace(stream) + stream.read(BUFFER_SIZE)
        if not block:
            raise PdfReadError("Unexpected end of stream")

        delimiter_index = block.find(b"~>")
        if delimiter_index != -1:
            # Keep everything including the delimiter and move the stream
            # directly behind it.
            keep = delimiter_index + 2
            chunks.append(block[:keep])
            stream.seek(keep - len(block), 1)
            break

        if len(block) == 2:
            # Only the two bytes held back by the previous round are left.
            raise PdfReadError("Unexpected end of stream")

        # Hold back the last two bytes and read them again in the next round,
        # so a ~> split across two reads is still found.
        chunks.append(block[:-2])
        stream.seek(-2, 1)

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return b"".join(chunks)
|
|
|
|
|
|
def extract_inline__run_length_decode(stream: StreamType) -> bytes:
    """
    Extract the RL (RunLengthDecode) stream of an inline image.

    Data is collected up to the first byte 128 (``0x80``), which marks the end
    of data (EOD). If this byte is not followed by ``EI`` (directly or after
    one other byte), a warning is logged and the first ``EI`` inside the
    current buffer is used to determine the end instead.

    The stream will be moved onto the EI.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The collected image data.

    Raises:
        PdfReadError: If the stream ends before the EOD byte was found, or if
            the data is not followed by the ``EI`` operator.
    """
    chunks = []
    while True:
        block = stream.read(BUFFER_SIZE)
        if not block:
            raise PdfReadError("Unexpected end of stream")

        eod_index = block.find(b"\x80")
        if eod_index == -1:
            # No EOD byte inside this buffer: keep all of it and read on.
            chunks.append(block)
            continue

        # Plain run-length decoding would stop at the first EOD byte. There are
        # files (see issue #3517) with many EOD bytes inside one inline image,
        # so only trust the EOD byte if an EI follows. Otherwise, fall back to
        # looking for EI. This fallback has no special handling for EI inside
        # the image data; having both cases at once is assumed to be very
        # unlikely (and the image stream is broken anyway).
        # For now, at most one character is skipped between EOD and EI.
        trailing = block[eod_index + 1 : eod_index + 4]
        if trailing.startswith(b"EI") or trailing.endswith(b"EI"):
            keep = eod_index + 1
            chunks.append(block[:keep])
            stream.seek(keep - len(block), 1)
        else:
            logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__)
            ei_index = block.find(b"EI")
            if ei_index > 0:
                chunks.append(block[:ei_index])
                # Move the stream to the byte in front of the EI.
                stream.seek(ei_index - 1 - len(block), 1)
        break

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return b"".join(chunks)
|
|
|
|
|
|
def extract_inline__dct_decode(stream: StreamType) -> bytes:
    """
    Extract DCT (JPEG) stream from inline image.

    The data is walked marker by marker (``0xFF`` followed by a marker code)
    until the end marker ``FF D9`` has been read. Bytes in front of the first
    ``0xFF`` are skipped. For markers carrying a length field, the whole
    segment is copied at once.

    The stream will be moved onto the EI.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        The bytes from the first ``0xFF`` up to and including ``FF D9``.

    Raises:
        PdfReadError: If the stream ends prematurely, a segment declares a
            length smaller than its own length field, or the data is not
            followed by the ``EI`` operator.
    """
    # Marker codes which are followed by a two-byte big-endian segment length.
    segment_markers = (
        b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf"
        b"\xda\xdb\xdc\xdd\xde\xdf"
        b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
    )

    def read(length: int) -> bytes:
        # If 0 bytes are returned, and *size* was not 0, this indicates end of file.
        # If the object is in non-blocking mode and no bytes are available, `None` is returned.
        _result = stream.read(length)
        if _result is None or len(_result) != length:
            raise PdfReadError("Unexpected end of stream")
        return _result

    chunks = []
    # Read Blocks of data (ID/Size/data) up to ID=FF/D9
    # https://www.digicamsoft.com/itu/itu-t81-36.html
    not_first = False
    while True:
        c = read(1)
        if not_first or (c == b"\xff"):
            chunks.append(c)
        if c != b"\xff":
            continue
        not_first = True
        c = read(1)
        if c == b"\xff":  # pragma: no cover
            # Another 0xFF directly after the first one. Step back so it is
            # read (and stored) as the start of the next marker. It must not
            # be stored here as well, otherwise it ends up in the output twice.
            stream.seek(-1, 1)
            continue
        chunks.append(c)
        if c == b"\xd9":  # end
            break
        # 0x00 (stuffing) and all markers not listed carry no length field.
        if c in segment_markers:
            size_field = read(2)
            sz = size_field[0] * 256 + size_field[1]
            if sz < 2:
                # The length includes its own two bytes. A smaller value would
                # result in a negative read size, which reads to the end of
                # the stream instead of failing right away.
                raise PdfReadError("Invalid segment length in DCT inline image")
            chunks.append(size_field)
            chunks.append(read(sz - 2))

    if not _check_end_image_marker(stream):
        raise PdfReadError("EI stream not found")
    return b"".join(chunks)
|
|
|
|
|
|
def extract_inline_default(stream: StreamType) -> bytes:
    """
    Legacy method, used by default.

    Bytes are copied from the stream until an ``EI`` (End Image) operator is
    found which is followed by whitespace, is either preceded by whitespace or
    followed by ``Q``/``E`` (for Q or EMC), and is not followed by data looking
    like binary image data.

    On success, the stream is positioned on the ``E`` of the accepted ``EI``.

    Args:
        stream: Seekable stream positioned at the start of the image data.

    Returns:
        All bytes read in front of the accepted ``EI``.

    Raises:
        PdfReadError: If the stream ends before an ``EI`` was accepted.
    """

    def _ends_image(preceding: bytes) -> bool:
        """Check the bytes behind an ``E`` for a valid image end (moves the stream)."""
        if stream.read(1) != b"I":  # I of "EI"
            return False
        following = stream.read(1)  # possible space after "EI"
        if following not in WHITESPACES:
            return False
        while following in WHITESPACES:
            following = stream.read(1)
        # Without whitespace in front of the EI, only accept Q or EMC behind it.
        if preceding not in WHITESPACES and following not in {b"Q", b"E"}:
            return False
        # Inline image contains `EI ` sequence usually marking the end of it, but
        # being followed by binary data does not make sense for the actual end.
        return not is_followed_by_binary_data(stream)

    collected = BytesIO()
    # Read the inline image, while checking for EI (End Image) operator.
    while True:
        block = stream.read(BUFFER_SIZE)
        if not block:
            raise PdfReadError("Unexpected end of stream")

        # We can not look straight for "EI" because it may not have been
        # loaded in the buffer completely.
        e_index = block.find(b"E")
        if e_index == -1:
            collected.write(block)
            continue

        # Write out everything including E (the one from EI to be removed)
        collected.write(block[: e_index + 1])
        e_position_out = collected.tell() - 1
        # Seek back in the stream to continue directly behind the E
        stream.seek(e_index + 1 - len(block), 1)
        resume_position = stream.tell()

        if _ends_image(block[e_index - 1 : e_index]):
            # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient
            # remove E(I) wrongly inserted earlier
            stream.seek(resume_position - 1, 0)
            collected.truncate(e_position_out)
            break

        # Not the end of the image: continue searching behind this E.
        stream.seek(resume_position, 0)

    return collected.getvalue()
|
|
|
|
|
|
def is_followed_by_binary_data(stream: IO[bytes], length: int = 10) -> bool:
    """
    Check if the next bytes of the stream look like binary image data or regular page content.

    This is just some heuristics due to the PDF specification being too imprecise about
    inline images containing the `EI` marker which would end an image. Starting with PDF 2.0,
    we finally get a mandatory length field, but with (proper) PDF 2.0 support being very limited
    everywhere, we should not expect to be able to remove such hacks in the near future - especially
    considering legacy documents as well.

    The actual implementation draws some inspiration from
    https://github.com/itext/itext-java/blob/9.1.0/kernel/src/main/java/com/itextpdf/kernel/pdf/canvas/parser/util/InlineImageParsingUtils.java

    Args:
        stream: Seekable stream to peek into. Its position is restored before returning.
        length: Maximum number of bytes to inspect.

    Returns:
        True if the inspected bytes look like binary data, False if they look like
        regular page content or if there is nothing left to read.
    """
    position = stream.tell()
    data = stream.read(length)
    stream.seek(position)
    if not data:
        return False
    operator_start = None
    operator_end = None

    for index, byte in enumerate(data):
        if byte < 32 and byte not in WHITESPACES_AS_BYTES:
            # This covers all characters not being displayable directly, although omitting whitespace
            # to allow for operator detection.
            return True
        is_whitespace = byte in WHITESPACES_AS_BYTES
        if operator_start is None and not is_whitespace:
            # Interpret all other non-whitespace characters as the start of an operation.
            operator_start = index
        if operator_start is not None and is_whitespace:
            # A whitespace stops an operation.
            # Assume that having an inline image with tons of whitespace is rather unlikely.
            operator_end = index
            break

    if operator_start is None:
        # Inline images should not have tons of whitespaces, which would lead to no operator start.
        return False
    if operator_end is None:
        # We probably are inside an operation, which runs up to the end of the inspected data.
        # Use the amount of data actually read: close to the end of the stream, fewer than
        # `length` bytes are available, and using `length` would overstate the operator length.
        operator_end = len(data)
    operator = data[operator_start:operator_end]
    operator_length = len(operator)
    if operator.startswith(b"/") and operator_length > 1:
        # Name object.
        return False
    if operator.replace(b".", b"").isdigit():
        # Graphics operator, for example a move. A number (integer or float).
        return False
    # Usually, the operators inside a content stream should not have more than three characters,
    # especially after an inline image.
    return operator_length > 3
|