# -*- encoding: utf-8; frozen_string_literal: true -*- # #-- # This file is part of HexaPDF. # # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby # Copyright (C) 2014-2024 Thomas Leitner # # HexaPDF is free software: you can redistribute it and/or modify it # under the terms of the GNU Affero General Public License version 3 as # published by the Free Software Foundation with the addition of the # following permission added to Section 15 as permitted in Section 7(a): # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON # INFRINGEMENT OF THIRD PARTY RIGHTS. # # HexaPDF is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public # License for more details. # # You should have received a copy of the GNU Affero General Public License # along with HexaPDF. If not, see . # # The interactive user interfaces in modified source and object code # versions of HexaPDF must display Appropriate Legal Notices, as required # under Section 5 of the GNU Affero General Public License version 3. # # In accordance with Section 7(b) of the GNU Affero General Public # License, a covered work must retain the producer line in every PDF that # is created or manipulated using HexaPDF. # # If the GNU Affero General Public License doesn't fit your need, # commercial licenses are available at . #++ require 'stringio' require 'hexapdf/tokenizer' require 'hexapdf/content/processor' module HexaPDF module Content # More efficient tokenizer for content streams. This tokenizer class works directly on a # string and not on an IO. # # Changes: # # * Since a content stream is usually parsed front to back, a StopIteration error can be raised # instead of returning +NO_MORE_TOKENS+ once the end of the string is reached to avoid costly # checks in each iteration. If this behaviour is wanted, pass "raise_on_eos: true" in the # constructor. # # * Indirect object references are *not* supported by this tokenizer! # # See: PDF2.0 s7.2 class Tokenizer < HexaPDF::Tokenizer #:nodoc: # The string that is tokenized. attr_reader :string # Creates a new tokenizer. def initialize(string, raise_on_eos: false) @ss = StringScanner.new(string) @string = string @raise_on_eos = raise_on_eos end # See: HexaPDF::Tokenizer#pos def pos @ss.pos end # See: HexaPDF::Tokenizer#pos= def pos=(pos) @ss.pos = pos end # See: HexaPDF::Tokenizer#scan_until def scan_until(re) @ss.scan_until(re) end # See: HexaPDF::Tokenizer#next_token def next_token @ss.skip(WHITESPACE_MULTI_RE) byte = @string.getbyte(@ss.pos) || -1 if (48 <= byte && byte <= 57) || byte == 45 || byte == 43 || byte == 46 # 0..9 - + . parse_number elsif (65 <= byte && byte <= 90) || (96 <= byte && byte <= 121) parse_keyword elsif byte == 47 # / parse_name elsif byte == 40 # ( parse_literal_string elsif byte == 60 # < if @string.getbyte(@ss.pos + 1) == 60 @ss.pos += 2 TOKEN_DICT_START else parse_hex_string end elsif byte == 62 # > unless @string.getbyte(@ss.pos + 1) == 62 raise HexaPDF::MalformedPDFError.new("Delimiter '>' found at invalid position", pos: pos) end @ss.pos += 2 TOKEN_DICT_END elsif byte == 91 # [ @ss.pos += 1 TOKEN_ARRAY_START elsif byte == 93 # ] @ss.pos += 1 TOKEN_ARRAY_END elsif byte == 123 || byte == 125 # { } Token.new(@ss.get_byte) elsif byte == 37 # % unless @ss.skip_until(/(?=[\r\n])/) (@raise_on_eos ? (raise StopIteration) : (return NO_MORE_TOKENS)) end next_token elsif byte == -1 @raise_on_eos ? raise(StopIteration) : NO_MORE_TOKENS else parse_keyword end end private # See: HexaPDF::Tokenizer#parse_number def parse_number if (val = @ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/)) val << '0' if val.getbyte(-1) == 46 # dot '.' Float(val) elsif (val = @ss.scan(/[+-]?\d++/)) val.to_i else parse_keyword end end # Stub implementation to prevent errors for not-overridden methods. def prepare_string_scanner(*) end end # This class knows how to correctly parse a content stream. # # == Overview # # A content stream is mostly just a stream of PDF objects. However, there is one exception: # inline images. # # Since inline images don't follow the normal PDF object parsing rules, they need to be # handled specially and this is the reason for this class. Therefore only the BI operator is # ever called for inline images because the ID and EI operators are handled by the parser. # # To parse some contents the #parse method needs to be called with the contents to be parsed # and a Processor object which is used for processing the parsed operators. class Parser # Creates a new Parser object and calls #parse. def self.parse(contents, processor = nil, &block) new.parse(contents, processor, &block) end # Parses the contents and calls the processor object or the given block for each parsed # operator. # # If a full-blown Processor is not needed (e.g. because the graphics state doesn't need to be # maintained), one can use the block form to handle the parsed objects and their parameters. # # Note: The parameters array is reused for each processed operator, so duplicate it if # necessary. def parse(contents, processor = nil, &block) #:yields: object, params raise ArgumentError, "Argument processor or block is needed" if processor.nil? && block.nil? if processor.nil? block.singleton_class.send(:alias_method, :process, :call) processor = block end tokenizer = Tokenizer.new(contents, raise_on_eos: true) params = [] loop do obj = tokenizer.next_object(allow_keyword: true) if obj.kind_of?(Tokenizer::Token) if obj == 'BI' params = parse_inline_image(tokenizer) end processor.process(obj.to_sym, params) params.clear else params << obj end end end private MAX_TOKEN_CHECK = 5 #:nodoc: # Parses the inline image at the current position. def parse_inline_image(tokenizer) # BI has already been read, so read the image dictionary dict = {} while (key = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS) if key == 'ID' break elsif key == Tokenizer::NO_MORE_TOKENS raise HexaPDF::Error, "EOS while trying to read dictionary key for inline image" elsif !key.kind_of?(Symbol) raise HexaPDF::Error, "Inline image dictionary keys must be PDF name objects" end value = tokenizer.next_object rescue Tokenizer::NO_MORE_TOKENS if value == Tokenizer::NO_MORE_TOKENS raise HexaPDF::Error, "EOS while trying to read dictionary value for inline image" end dict[key] = value end # one whitespace character after ID tokenizer.next_byte real_end_found = false image_data = ''.b # find the EI operator and handle EI appearing inside the image data until real_end_found data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o) if data.nil? raise HexaPDF::Error, "End inline image marker EI not found" end image_data << data tokenizer.pos += 2 last_pos = tokenizer.pos # Check if we found EI inside of the image data count = 0 while count < MAX_TOKEN_CHECK token = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS if token == Tokenizer::NO_MORE_TOKENS count += MAX_TOKEN_CHECK elsif token.kind_of?(Tokenizer::Token) && !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym) break # invalid token end count += 1 end if count >= MAX_TOKEN_CHECK real_end_found = true else image_data << "EI" end tokenizer.pos = last_pos end [dict, image_data] end end end end