lib/hexapdf/content/parser.rb in hexapdf-0.3.0 vs lib/hexapdf/content/parser.rb in hexapdf-0.4.0

- old
+ new

@@ -31,10 +31,11 @@
 # is created or manipulated using HexaPDF.
 #++
 
 require 'stringio'
 require 'hexapdf/tokenizer'
+require 'hexapdf/content/processor'
 
 module HexaPDF
   module Content
 
     # More efficient tokenizer for content streams. This tokenizer class works directly on a
@@ -43,10 +44,13 @@
     # Note: Indirect object references are *not* supported by this tokenizer!
     #
     # See: PDF1.7 s7.2
     class Tokenizer < HexaPDF::Tokenizer #:nodoc:
 
+      # The string that is tokenized.
+      attr_reader :string
+
       # Creates a new tokenizer.
       def initialize(string)
         @ss = StringScanner.new(string)
         @string = string
       end
@@ -166,10 +170,12 @@
         end
       end
 
       private
 
+      MAX_TOKEN_CHECK = 5 #:nodoc:
+
       # Parses the inline image at the current position.
       def parse_inline_image(tokenizer)
         # BI has already been read, so read the image dictionary
         dict = {}
         while (key = tokenizer.next_object(allow_keyword: true))
@@ -188,16 +194,44 @@
         end
 
         # one whitespace character after ID
         tokenizer.next_byte
 
-        # find the EI operator
-        data = tokenizer.scan_until(/(?=EI[#{Tokenizer::WHITESPACE}])/o)
-        if data.nil?
-          raise HexaPDF::Error, "End inline image marker EI not found"
+        real_end_found = false
+        image_data = ''.b
+
+        # find the EI operator and handle EI appearing inside the image data
+        until real_end_found
+          data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o)
+          if data.nil?
+            raise HexaPDF::Error, "End inline image marker EI not found"
+          end
+          image_data << data
+          tokenizer.pos += 2
+          last_pos = tokenizer.pos
+
+          # Check if we found EI inside of the image data
+          count = 0
+          while count < MAX_TOKEN_CHECK
+            token = tokenizer.next_object(allow_keyword: true) rescue break
+            if token == Tokenizer::NO_MORE_TOKENS
+              count += MAX_TOKEN_CHECK
+            elsif token.kind_of?(Tokenizer::Token) &&
+                !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
+              break # invalid token
+            end
+            count += 1
+          end
+
+          if count >= MAX_TOKEN_CHECK
+            real_end_found = true
+          else
+            image_data << "EI"
+          end
+          tokenizer.pos = last_pos
         end
-        tokenizer.pos += 3
-        [dict, data]
+
+        [dict, image_data]
       end
 
     end
   end
 end