lib/hexapdf/content/parser.rb in hexapdf-0.6.0 vs lib/hexapdf/content/parser.rb in hexapdf-0.7.0
- old
+ new
@@ -1,12 +1,12 @@
-# -*- encoding: utf-8 -*-
+# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
-# Copyright (C) 2014-2017 Thomas Leitner
+# Copyright (C) 2014-2018 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
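Note: the added frozen_string_literal magic comment freezes every string literal in this file,
which is why explicit calls like 'BI'.freeze and '0'.freeze disappear in the hunks below. A
quick sketch of the effect (any file starting with this comment behaves the same):

    # frozen_string_literal: true
    s = 'BI'
    s.frozen? # => true, so an explicit .freeze is redundant
    s << 'X'  # raises FrozenError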
@@ -39,22 +39,30 @@
  module Content

    # More efficient tokenizer for content streams. This tokenizer class works directly on a
    # string and not on an IO.
    #
-   # Note: Indirect object references are *not* supported by this tokenizer!
+   # Changes:
    #
+   # * Since a content stream is normally parsed front to back, a StopIteration error can be raised
+   #   instead of returning +NO_MORE_TOKENS+ once the end of the string is reached to avoid costly
+   #   checks in each iteration. If this behaviour is wanted, pass "raise_on_eos: true" in the
+   #   constructor.
+   #
+   # * Indirect object references are *not* supported by this tokenizer!
+   #
    # See: PDF1.7 s7.2
    class Tokenizer < HexaPDF::Tokenizer #:nodoc:

      # The string that is tokenized.
      attr_reader :string

      # Creates a new tokenizer.
-     def initialize(string)
+     def initialize(string, raise_on_eos: false)
        @ss = StringScanner.new(string)
        @string = string
+       @raise_on_eos = raise_on_eos
      end

      # See: HexaPDF::Tokenizer#pos
      def pos
        @ss.pos
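Note: the new raise_on_eos flag is designed to pair with Kernel#loop, which rescues the
StopIteration raised at the end of the string, so callers need no per-iteration EOS check.
A minimal usage sketch (the content string is made up; the constructor keyword and
next_object API are taken from this diff):

    require 'hexapdf'

    tokenizer = HexaPDF::Content::Tokenizer.new("10 20 m 30 40 l S", raise_on_eos: true)
    loop do
      obj = tokenizer.next_object(allow_keyword: true) # raises StopIteration at end of string
      p obj                                            # Kernel#loop rescues it and exits cleanly
    end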
@@ -102,28 +110,30 @@
          @ss.pos += 1
          TOKEN_ARRAY_END
        elsif byte == 123 || byte == 125 # { }
          Token.new(@ss.get_byte)
        elsif byte == 37 # %
-         return NO_MORE_TOKENS unless @ss.skip_until(/(?=[\r\n])/)
+         unless @ss.skip_until(/(?=[\r\n])/)
+           (@raise_on_eos ? (raise StopIteration) : (return NO_MORE_TOKENS))
+         end
          next_token
        elsif byte == -1
-         NO_MORE_TOKENS
+         @raise_on_eos ? raise(StopIteration) : NO_MORE_TOKENS
        else
          parse_keyword
        end
      end

      private

      # See: HexaPDF::Tokenizer#parse_number
      def parse_number
-       if (val = @ss.scan(/[+-]?\d++(?!\.)/))
-         val.to_i
-       elsif (val = @ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/))
-         val << '0'.freeze if val.getbyte(-1) == 46 # dot '.'
+       if (val = @ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/))
+         val << '0' if val.getbyte(-1) == 46 # dot '.'
          Float(val)
+       elsif (val = @ss.scan(/[+-]?\d++/))
+         val.to_i
        else
          parse_keyword
        end
      end
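Note: swapping the order of the two scans lets the real-number pattern claim input like "1."
first, so the integer pattern no longer needs its (?!\.) negative lookahead. A standalone
sketch of the same matching logic, outside HexaPDF:

    require 'strscan'

    def scan_number(ss)
      if (val = ss.scan(/[+-]?(?:\d+\.\d*|\.\d+)/)) # reals first; a trailing dot gets padded
        val << '0' if val.getbyte(-1) == 46         # 46 is the byte value of '.'
        Float(val)
      elsif (val = ss.scan(/[+-]?\d++/))            # plain integers, no lookahead needed
        val.to_i
      end
    end

    p scan_number(StringScanner.new("1.5")) # => 1.5
    p scan_number(StringScanner.new("1."))  # => 1.0
    p scan_number(StringScanner.new("42"))  # => 42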
@@ -131,11 +141,10 @@

      def prepare_string_scanner(*)
      end
    end
-
    # This class knows how to correctly parse a content stream.
    #
    # == Overview
    #
    # A content stream is mostly just a stream of PDF objects. However, there is one exception:
@@ -154,15 +163,16 @@
        new.parse(contents, processor)
      end

      # Parses the contents and calls the processor object for each parsed operator.
      def parse(contents, processor)
-       tokenizer = Tokenizer.new(contents)
+       tokenizer = Tokenizer.new(contents, raise_on_eos: true)
        params = []
-       while (obj = tokenizer.next_object(allow_keyword: true)) != Tokenizer::NO_MORE_TOKENS
+       loop do
+         obj = tokenizer.next_object(allow_keyword: true)
          if obj.kind_of?(Tokenizer::Token)
-           if obj == 'BI'.freeze
+           if obj == 'BI'
              params = parse_inline_image(tokenizer)
            end
            processor.process(obj.to_sym, params)
            params.clear
          else
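Note: with raise_on_eos: true, the loop no longer compares every parsed object against
NO_MORE_TOKENS; the StopIteration raised at the end of the stream terminates Kernel#loop
directly. A sketch of the duck-typed callback that #parse invokes, using a hypothetical
OperatorPrinter (hexapdf ships HexaPDF::Content::Processor for real work):

    require 'hexapdf'

    # Any object responding to #process(operator, operands) can stand in for a processor.
    class OperatorPrinter
      def process(operator, operands)
        puts "#{operator} #{operands.inspect}"
      end
    end

    HexaPDF::Content::Parser.parse("q 10 20 m 30 40 l S Q", OperatorPrinter.new)
    # Prints: q [] / m [10, 20] / l [30, 40] / S [] / Q []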
@@ -177,19 +187,19 @@
      # Parses the inline image at the current position.
      def parse_inline_image(tokenizer)
        # BI has already been read, so read the image dictionary
        dict = {}
-       while (key = tokenizer.next_object(allow_keyword: true))
-         if key == 'ID'.freeze
+       while (key = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS)
+         if key == 'ID'
            break
          elsif key == Tokenizer::NO_MORE_TOKENS
            raise HexaPDF::Error, "EOS while trying to read dictionary key for inline image"
          elsif !key.kind_of?(Symbol)
            raise HexaPDF::Error, "Inline image dictionary keys must be PDF name objects"
          end

-         value = tokenizer.next_object
+         value = tokenizer.next_object rescue Tokenizer::NO_MORE_TOKENS
          if value == Tokenizer::NO_MORE_TOKENS
            raise HexaPDF::Error, "EOS while trying to read dictionary value for inline image"
          end
          dict[key] = value
        end
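Note: for context, an inline image embeds its dictionary between the BI and ID operators
(PDF1.7 s8.9.7). A made-up 2x2 grayscale example of the content stream syntax this loop
consumes:

    BI /W 2 /H 2 /CS /G /BPC 8 /F /AHx ID
    00ff ff00 >
    EI

After the loop breaks on the ID keyword, dict would hold {W: 2, H: 2, CS: :G, BPC: 8, F: :AHx},
since PDF name objects are parsed into Ruby symbols (hence the kind_of?(Symbol) key check).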
@@ -211,10 +221,10 @@
          last_pos = tokenizer.pos
          # Check if we found EI inside of the image data
          count = 0
          while count < MAX_TOKEN_CHECK
-           token = tokenizer.next_object(allow_keyword: true) rescue break
+           token = tokenizer.next_object(allow_keyword: true) rescue Tokenizer::NO_MORE_TOKENS
            if token == Tokenizer::NO_MORE_TOKENS
              count += MAX_TOKEN_CHECK
            elsif token.kind_of?(Tokenizer::Token) &&
                !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
              break # invalid token
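Note: this bounded look-ahead distinguishes the real EI operator from an "EI" byte pair that
happens to occur inside raw image data. After each candidate, up to MAX_TOKEN_CHECK objects
are tokenized: a token that is not a known operator (absent from
Processor::OPERATOR_MESSAGE_NAME_MAP) marks the candidate as a false positive, while reaching
the end of the stream exhausts the counter, which, in the code following this excerpt,
presumably accepts the candidate. A contrived fragment with made-up sample bytes (shown
escaped) in which the first "EI" must be rejected:

    BI /W 3 /H 1 /CS /RGB /BPC 8 ID \x41EI\x42... EI Q

Here the first EI sits inside the image data and is followed by bytes that do not tokenize as
valid operators; the second EI is followed by the valid Q operator and ends the image.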