parser.rb in hexapdf-0.7.0

- old
+ new

@@ -1,12 +1,12 @@
-# -*- encoding: utf-8 -*-
+# -*- encoding: utf-8; frozen_string_literal: true -*-
 #
 #--
 # This file is part of HexaPDF.
 #
 # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
-# Copyright (C) 2014-2017 Thomas Leitner
+# Copyright (C) 2014-2018 Thomas Leitner
 #
 # HexaPDF is free software: you can redistribute it and/or modify it
 # under the terms of the GNU Affero General Public License version 3 as
 # published by the Free Software Foundation with the addition of the
 # following permission added to Section 15 as permitted in Section 7(a):
@@ -39,22 +39,30 @@
   module Content
 
     # More efficient tokenizer for content streams. This tokenizer class works directly on a
     # string and not on an IO.
     #
-    # Note: Indirect object references are *not* supported by this tokenizer!
+    # Changes compared to HexaPDF::Tokenizer:
     #
+    # * Since a content stream is normally parsed front to back, a StopIteration error can be raised
+    #   once the end of the string is reached, instead of returning +NO_MORE_TOKENS+. This avoids a
+    #   costly check in each iteration of the caller's loop. To enable this behaviour, pass
+    #   "raise_on_eos: true" to the constructor.
+    #
+    # * Indirect object references are *not* supported by this tokenizer!
+    #
     # See: PDF1.7 s7.2
     class Tokenizer < HexaPDF::Tokenizer #:nodoc:
 
       # The string that is tokenized.
       attr_reader :string
 
       # Creates a new tokenizer.
-      def initialize(string)
+      def initialize(string, raise_on_eos: false)
         @ss = StringScanner.new(string)
         @string = string
+        @raise_on_eos = raise_on_eos
       end
 
       # See: HexaPDF::Tokenizer#pos
       def pos
         @ss.pos
@@ -102,28 +110,30 @@
           @ss.pos += 1
           TOKEN_ARRAY_END
         elsif byte == 123 || byte == 125 # { }
           Token.new(@ss.get_byte)
         elsif byte == 37 # %
-          return NO_MORE_TOKENS unless @ss.skip_until(/(?=[\r\n])/)
+          unless @ss.skip_until(/(?=[\r\n])/)
+            (@raise_on_eos ? (raise StopIteration) : (return NO_MORE_TOKENS))
+          end
           next_token
         elsif byte == -1
-          NO_MORE_TOKENS
+          @raise_on_eos ? raise(StopIteration) : NO_MORE_TOKENS
         else
           parse_keyword
         end
       end
 
       private
 
       # See: HexaPDF::Tokenizer#parse_number
       def parse_number
-        if (val = @ss.scan(/[+-]?\d++(?!\.)/))
-          val.to_i
-        elsif (val = @ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/))
-          val << '0'.freeze if val.getbyte(-1) == 46 # dot '.'
+        if (val = @ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/))
+          val << '0' if val.getbyte(-1) == 46 # dot '.'
           Float(val)
+        elsif (val = @ss.scan(/[+-]?\d++/))
+          val.to_i
         else
           parse_keyword
         end
       end
 
@@ -131,11 +141,10 @@
       def prepare_string_scanner(*)
       end
 
     end
 
-
     # This class knows how to correctly parse a content stream.
     #
     # == Overview
     #
     # A content stream is mostly just a stream of PDF objects. However, there is one exception:
@@ -154,15 +163,16 @@
         new.parse(contents, processor)
       end
 
       # Parses the contents and calls the processor object for each parsed operator.
       def parse(contents, processor)
-        tokenizer = Tokenizer.new(contents)
+        tokenizer = Tokenizer.new(contents, raise_on_eos: true)
         params = []
-        while (obj = tokenizer.next_object(allow_keyword: true)) != Tokenizer::NO_MORE_TOKENS
+        loop do
+          obj = tokenizer.next_object(allow_keyword: true)
           if obj.kind_of?(Tokenizer::Token)
-            if obj == 'BI'.freeze
+            if obj == 'BI'
               params = parse_inline_image(tokenizer)
             end
             processor.process(obj.to_sym, params)
             params.clear
           else
@@ -177,19 +187,19 @@
 
       # Parses the inline image at the current position.
       def parse_inline_image(tokenizer)
         # BI has already been read, so read the image dictionary
         dict = {}
-        while (key = tokenizer.next_object(allow_keyword: true))
-          if key == 'ID'.freeze
+        while (key = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS)
+          if key == 'ID'
             break
           elsif key == Tokenizer::NO_MORE_TOKENS
             raise HexaPDF::Error, "EOS while trying to read dictionary key for inline image"
           elsif !key.kind_of?(Symbol)
             raise HexaPDF::Error, "Inline image dictionary keys must be PDF name objects"
           end
-          value = tokenizer.next_object
+          value = tokenizer.next_object rescue Tokenizer::NO_MORE_TOKENS
           if value == Tokenizer::NO_MORE_TOKENS
             raise HexaPDF::Error, "EOS while trying to read dictionary value for inline image"
           end
           dict[key] = value
         end
@@ -211,10 +221,10 @@
           last_pos = tokenizer.pos
 
           # Check if we found EI inside of the image data
           count = 0
           while count < MAX_TOKEN_CHECK
-            token = tokenizer.next_object(allow_keyword: true) rescue break
+            token = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS
             if token == Tokenizer::NO_MORE_TOKENS
               count += MAX_TOKEN_CHECK
             elsif token.kind_of?(Tokenizer::Token) &&
                 !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
               break #  invalid token