lib/hexapdf/content/parser.rb in hexapdf-0.6.0 vs lib/hexapdf/content/parser.rb in hexapdf-0.7.0

- old
+ new

@@ -1,12 +1,12 @@ -# -*- encoding: utf-8 -*- +# -*- encoding: utf-8; frozen_string_literal: true -*- # #-- # This file is part of HexaPDF. # # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby -# Copyright (C) 2014-2017 Thomas Leitner +# Copyright (C) 2014-2018 Thomas Leitner # # HexaPDF is free software: you can redistribute it and/or modify it # under the terms of the GNU Affero General Public License version 3 as # published by the Free Software Foundation with the addition of the # following permission added to Section 15 as permitted in Section 7(a): @@ -39,22 +39,30 @@ module Content # More efficient tokenizer for content streams. This tokenizer class works directly on a # string and not on an IO. # - # Note: Indirect object references are *not* supported by this tokenizer! + # Changes: # + # * Since a content stream is normally parsed front to back, a StopIteration error can be raised + # instead of returning +NO_MORE_TOKENS+ once the end of the string is reached to avoid costly + # checks in each iteration. If this behaviour is wanted, pass "raise_on_eos: true" in the + # constructor. + # + # * Indirect object references are *not* supported by this tokenizer! + # # See: PDF1.7 s7.2 class Tokenizer < HexaPDF::Tokenizer #:nodoc: # The string that is tokenized. attr_reader :string # Creates a new tokenizer. - def initialize(string) + def initialize(string, raise_on_eos: false) @ss = StringScanner.new(string) @string = string + @raise_on_eos = raise_on_eos end # See: HexaPDF::Tokenizer#pos def pos @ss.pos @@ -102,28 +110,30 @@ @ss.pos += 1 TOKEN_ARRAY_END elsif byte == 123 || byte == 125 # { } Token.new(@ss.get_byte) elsif byte == 37 # % - return NO_MORE_TOKENS unless @ss.skip_until(/(?=[\r\n])/) + unless @ss.skip_until(/(?=[\r\n])/) + (@raise_on_eos ? (raise StopIteration) : (return NO_MORE_TOKENS)) + end next_token elsif byte == -1 - NO_MORE_TOKENS + @raise_on_eos ? raise(StopIteration) : NO_MORE_TOKENS else parse_keyword end end private # See: HexaPDF::Tokenizer#parse_number def parse_number - if (val = @ss.scan(/[+-]?\d++(?!\.)/)) - val.to_i - elsif (val = @ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/)) - val << '0'.freeze if val.getbyte(-1) == 46 # dot '.' + if (val = @ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/)) + val << '0' if val.getbyte(-1) == 46 # dot '.' Float(val) + elsif (val = @ss.scan(/[+-]?\d++/)) + val.to_i else parse_keyword end end @@ -131,11 +141,10 @@ def prepare_string_scanner(*) end end - # This class knows how to correctly parse a content stream. # # == Overview # # A content stream is mostly just a stream of PDF objects. However, there is one exception: @@ -154,15 +163,16 @@ new.parse(contents, processor) end # Parses the contents and calls the processor object for each parsed operator. def parse(contents, processor) - tokenizer = Tokenizer.new(contents) + tokenizer = Tokenizer.new(contents, raise_on_eos: true) params = [] - while (obj = tokenizer.next_object(allow_keyword: true)) != Tokenizer::NO_MORE_TOKENS + loop do + obj = tokenizer.next_object(allow_keyword: true) if obj.kind_of?(Tokenizer::Token) - if obj == 'BI'.freeze + if obj == 'BI' params = parse_inline_image(tokenizer) end processor.process(obj.to_sym, params) params.clear else @@ -177,19 +187,19 @@ # Parses the inline image at the current position. def parse_inline_image(tokenizer) # BI has already been read, so read the image dictionary dict = {} - while (key = tokenizer.next_object(allow_keyword: true)) - if key == 'ID'.freeze + while (key = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS) + if key == 'ID' break elsif key == Tokenizer::NO_MORE_TOKENS raise HexaPDF::Error, "EOS while trying to read dictionary key for inline image" elsif !key.kind_of?(Symbol) raise HexaPDF::Error, "Inline image dictionary keys must be PDF name objects" end - value = tokenizer.next_object + value = tokenizer.next_object rescue Tokenizer::NO_MORE_TOKENS if value == Tokenizer::NO_MORE_TOKENS raise HexaPDF::Error, "EOS while trying to read dictionary value for inline image" end dict[key] = value end @@ -211,10 +221,10 @@ last_pos = tokenizer.pos # Check if we found EI inside of the image data count = 0 while count < MAX_TOKEN_CHECK - token = tokenizer.next_object(allow_keyword: true) rescue break + token = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS if token == Tokenizer::NO_MORE_TOKENS count += MAX_TOKEN_CHECK elsif token.kind_of?(Tokenizer::Token) && !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym) break # invalid token