lib/hexapdf/content/parser.rb in hexapdf-0.3.0 vs lib/hexapdf/content/parser.rb in hexapdf-0.4.0

- old
+ new

@@ -31,10 +31,11 @@
 # is created or manipulated using HexaPDF.
 #++
 
 require 'stringio'
 require 'hexapdf/tokenizer'
+require 'hexapdf/content/processor'
 
 module HexaPDF
   module Content
 
     # More efficient tokenizer for content streams. This tokenizer class works directly on a
@@ -43,10 +44,13 @@
     # Note: Indirect object references are *not* supported by this tokenizer!
     #
     # See: PDF1.7 s7.2
     class Tokenizer < HexaPDF::Tokenizer #:nodoc:
 
+      # The string that is tokenized.
+      attr_reader :string
+
       # Creates a new tokenizer.
       def initialize(string)
         @ss = StringScanner.new(string)
         @string = string
       end
@@ -166,10 +170,12 @@
         end
       end
 
       private
 
+      MAX_TOKEN_CHECK = 5 #:nodoc:
+
       # Parses the inline image at the current position.
       def parse_inline_image(tokenizer)
         # BI has already been read, so read the image dictionary
         dict = {}
         while (key = tokenizer.next_object(allow_keyword: true))
@@ -188,16 +194,44 @@
         end
 
         # one whitespace character after ID
         tokenizer.next_byte
 
-        # find the EI operator
-        data = tokenizer.scan_until(/(?=EI[#{Tokenizer::WHITESPACE}])/o)
-        if data.nil?
-          raise HexaPDF::Error, "End inline image marker EI not found"
+        real_end_found = false
+        image_data = ''.b
+
+        # find the EI operator and handle EI appearing inside the image data
+        until real_end_found
+          data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o)
+          if data.nil?
+            raise HexaPDF::Error, "End inline image marker EI not found"
+          end
+          image_data << data
+          tokenizer.pos += 2
+          last_pos = tokenizer.pos
+
+          # Check if we found EI inside of the image data
+          count = 0
+          while count < MAX_TOKEN_CHECK
+            token = tokenizer.next_object(allow_keyword: true) rescue break
+            if token == Tokenizer::NO_MORE_TOKENS
+              count += MAX_TOKEN_CHECK
+            elsif token.kind_of?(Tokenizer::Token) &&
+                !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
+              break # invalid token
+            end
+            count += 1
+          end
+
+          if count >= MAX_TOKEN_CHECK
+            real_end_found = true
+          else
+            image_data << "EI"
+          end
+          tokenizer.pos = last_pos
         end
-        tokenizer.pos += 3
-        [dict, data]
+
+        [dict, image_data]
       end
 
     end
   end
 end