# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2024 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <https://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++
require 'stringio'
require 'hexapdf/tokenizer'
require 'hexapdf/content/processor'
module HexaPDF
module Content
# A tokenizer that is more efficient for content streams than HexaPDF::Tokenizer because it
# works directly on a string and not on an IO object.
#
# Changes compared to HexaPDF::Tokenizer:
#
# * Since a content stream is usually parsed front to back, a StopIteration error can be raised
# instead of returning +NO_MORE_TOKENS+ once the end of the string is reached to avoid costly
# checks in each iteration. If this behaviour is wanted, pass "raise_on_eos: true" in the
# constructor.
#
# * Indirect object references are *not* supported by this tokenizer!
#
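# Example (an illustrative sketch; the sample content stream is made up):
#
#   tokenizer = HexaPDF::Content::Tokenizer.new("1 0 0 1 100 100 cm", raise_on_eos: true)
#   loop { p tokenizer.next_token }
#
# Kernel#loop rescues the StopIteration raised at the end of the string, so the loop above
# terminates cleanly after the final "cm" token.
#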
# See: PDF2.0 s7.2
class Tokenizer < HexaPDF::Tokenizer #:nodoc:
# The string that is tokenized.
attr_reader :string
# Creates a new tokenizer for the given string.
#
# If +raise_on_eos+ is +true+, a StopIteration error is raised once the end of the string is
# reached instead of +NO_MORE_TOKENS+ being returned.
def initialize(string, raise_on_eos: false)
@ss = StringScanner.new(string)
@string = string
@raise_on_eos = raise_on_eos
end
# See: HexaPDF::Tokenizer#pos
def pos
@ss.pos
end
# See: HexaPDF::Tokenizer#pos=
def pos=(pos)
@ss.pos = pos
end
# See: HexaPDF::Tokenizer#scan_until
def scan_until(re)
@ss.scan_until(re)
end
# See: HexaPDF::Tokenizer#next_token
def next_token
@ss.skip(WHITESPACE_MULTI_RE)
byte = @string.getbyte(@ss.pos) || -1
if (48 <= byte && byte <= 57) || byte == 45 || byte == 43 || byte == 46 # 0..9 - + .
parse_number
elsif (65 <= byte && byte <= 90) || (96 <= byte && byte <= 121) # A..Z, `..y - operator names
parse_keyword
elsif byte == 47 # /
parse_name
elsif byte == 40 # (
parse_literal_string
elsif byte == 60 # <
if @string.getbyte(@ss.pos + 1) == 60
@ss.pos += 2
TOKEN_DICT_START
else
parse_hex_string
end
elsif byte == 62 # >
unless @string.getbyte(@ss.pos + 1) == 62
raise HexaPDF::MalformedPDFError.new("Delimiter '>' found at invalid position", pos: pos)
end
@ss.pos += 2
TOKEN_DICT_END
elsif byte == 91 # [
@ss.pos += 1
TOKEN_ARRAY_START
elsif byte == 93 # ]
@ss.pos += 1
TOKEN_ARRAY_END
elsif byte == 123 || byte == 125 # { }
Token.new(@ss.get_byte)
elsif byte == 37 # %
unless @ss.skip_until(/(?=[\r\n])/)
(@raise_on_eos ? (raise StopIteration) : (return NO_MORE_TOKENS))
end
next_token
elsif byte == -1
@raise_on_eos ? raise(StopIteration) : NO_MORE_TOKENS
else
parse_keyword
end
end
private
# See: HexaPDF::Tokenizer#parse_number
def parse_number
if (val = @ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/))
val << '0' if val.getbyte(-1) == 46 # trailing dot '.', e.g. "4." - append 0 so Float() works
Float(val)
elsif (val = @ss.scan(/[+-]?\d++/))
val.to_i
else
parse_keyword
end
end
# Stub implementation to prevent errors in inherited methods that call it: the parent class
# uses it to read more data from the IO, but here the whole string is already available.
def prepare_string_scanner(*)
end
end
# This class knows how to correctly parse a content stream.
#
# == Overview
#
# A content stream is mostly just a stream of PDF objects. However, there is one exception:
# inline images.
#
# Since inline images don't follow the normal PDF object parsing rules, they need to be
# handled specially, which is the reason for this class. As a consequence, only the BI
# operator is ever passed on to the processor for inline images; the ID and EI operators are
# consumed by the parser itself.
#
# To parse a content stream, the #parse method needs to be called with the contents to be
# parsed and a Processor object (or a block) that processes the parsed operators.
class Parser
# Creates a new Parser object and calls #parse.
def self.parse(contents, processor = nil, &block)
new.parse(contents, processor, &block)
end
# Parses the contents and calls the processor object or the given block for each parsed
# operator.
#
# If a full-blown Processor is not needed (e.g. because the graphics state doesn't need to be
# maintained), one can use the block form to handle the parsed objects and their parameters.
#
# Note: The parameters array is reused for each processed operator, so duplicate it if
# necessary.
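#
# Example (a minimal sketch of the block form; +operator+ is the operator name as a Symbol
# and +contents+ stands for any decoded content stream string, e.g. the result of
# HexaPDF::Type::Page#contents):
#
#   HexaPDF::Content::Parser.parse(contents) do |operator, params|
#     puts "#{operator} #{params.map(&:inspect).join(' ')}"
#   end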
def parse(contents, processor = nil, &block) #:yields: object, params
raise ArgumentError, "Argument processor or block is needed" if processor.nil? && block.nil?
if processor.nil?
# Let the block stand in for a processor by aliasing #process to #call
block.singleton_class.send(:alias_method, :process, :call)
processor = block
end
tokenizer = Tokenizer.new(contents, raise_on_eos: true)
params = []
loop do
obj = tokenizer.next_object(allow_keyword: true)
if obj.kind_of?(Tokenizer::Token)
if obj == 'BI'
params = parse_inline_image(tokenizer)
end
processor.process(obj.to_sym, params)
params.clear
else
params << obj
end
end
end
private
# Maximum number of tokens inspected after a potential EI operator to decide whether it
# really ends the inline image data.
MAX_TOKEN_CHECK = 5 #:nodoc:
# Parses the inline image at the current position.
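#
# For reference, an inline image in a content stream roughly looks like this (the dictionary
# keys shown are just an example):
#
#   BI /W 2 /H 2 /CS /G /BPC 8 /F /AHx
#   ID <encoded image data> EI
#
# The BI operator has already been consumed when this method is called; it reads the
# dictionary and the data between ID and EI and returns them as an array of the form
# [dict, image_data].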
def parse_inline_image(tokenizer)
# BI has already been read, so read the image dictionary
dict = {}
while (key = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS)
if key == 'ID'
break
elsif key == Tokenizer::NO_MORE_TOKENS
raise HexaPDF::Error, "EOS while trying to read dictionary key for inline image"
elsif !key.kind_of?(Symbol)
raise HexaPDF::Error, "Inline image dictionary keys must be PDF name objects"
end
value = tokenizer.next_object rescue Tokenizer::NO_MORE_TOKENS
if value == Tokenizer::NO_MORE_TOKENS
raise HexaPDF::Error, "EOS while trying to read dictionary value for inline image"
end
dict[key] = value
end
# consume the single whitespace character that follows the ID operator
tokenizer.next_byte
real_end_found = false
image_data = ''.b
# find the EI operator and handle EI appearing inside the image data
until real_end_found
data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o)
if data.nil?
raise HexaPDF::Error, "End inline image marker EI not found"
end
image_data << data
tokenizer.pos += 2
last_pos = tokenizer.pos
# Check whether this EI was part of the image data: if the next few objects parse as valid
# tokens/operators, we assume the real end of the image was found.
count = 0
while count < MAX_TOKEN_CHECK
token = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS
if token == Tokenizer::NO_MORE_TOKENS
count += MAX_TOKEN_CHECK # end of stream: treat this EI as the real end
elsif token.kind_of?(Tokenizer::Token) &&
!Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
break # invalid token
end
count += 1
end
if count >= MAX_TOKEN_CHECK
real_end_found = true
else
image_data << "EI"
end
tokenizer.pos = last_pos
end
[dict, image_data]
end
end
end
end