# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2020 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++

require 'strscan'
require 'hexapdf/error'
require 'hexapdf/reference'

module HexaPDF

  # Tokenizes the content of an IO object following the PDF rules.
  #
  # The IO object is read in chunks into an internal StringScanner buffer; all parsing happens on
  # that buffer. Use #pos and #pos= to query and change the read position instead of accessing
  # the IO object directly.
  #
  # See: PDF1.7 s7.2
  class Tokenizer

    # Represents a keyword in a PDF file.
    class Token < String; end

    TOKEN_DICT_START = Token.new('<<'.b) # :nodoc:
    TOKEN_DICT_END = Token.new('>>'.b) # :nodoc:
    TOKEN_ARRAY_START = Token.new('['.b) # :nodoc:
    TOKEN_ARRAY_END = Token.new(']'.b) # :nodoc:

    # This object is returned when there are no more tokens to read.
    NO_MORE_TOKENS = ::Object.new

    # Characters defined as whitespace.
    #
    # See: PDF1.7 s7.2.2
    WHITESPACE = " \n\r\0\t\f"

    # Characters defined as delimiters.
    #
    # See: PDF1.7 s7.2.2
    DELIMITER = "()<>{}/[]%"

    WHITESPACE_MULTI_RE = /[#{WHITESPACE}]+/ # :nodoc:

    WHITESPACE_OR_DELIMITER_RE = /(?=[#{Regexp.escape(WHITESPACE + DELIMITER)}])/ # :nodoc:

    # The IO object from which the tokens are read.
    attr_reader :io

    # Creates a new tokenizer for the given IO stream.
    #
    # If +on_correctable_error+ is set to an object responding to +call(msg, pos)+, errors for
    # correctable situations are only raised if the return value of calling the object is +true+.
    # Without such an object, correctable errors are never raised.
    def initialize(io, on_correctable_error: nil)
      @io = io
      @ss = StringScanner.new(''.b)
      @original_pos = -1
      @on_correctable_error = on_correctable_error || proc { false }
      self.pos = 0
    end

    # Returns the current position of the tokenizer inside the IO object.
    #
    # Note that this position might be different from +io.pos+ since the latter could have been
    # changed somewhere else.
    def pos
      @original_pos + @ss.pos
    end

    # Sets the position at which the next token should be read.
    #
    # If the position lies inside the currently buffered data, only the scan pointer is moved.
    # Otherwise the buffer is discarded and refilled from the new position on the next read.
    #
    # Note that this does **not** set +io.pos+ directly (at the moment of invocation)!
    def pos=(pos)
      if pos >= @original_pos && pos <= @original_pos + @ss.string.size
        @ss.pos = pos - @original_pos
      else
        @original_pos = pos
        @next_read_pos = pos
        @ss.string.clear
        @ss.reset
      end
    end

    # Returns a single token read from the current position and advances the scan pointer.
    #
    # Comments and a run of whitespace characters are ignored. The value +NO_MORE_TOKENS+ is
    # returned if there are no more tokens available.
    #
    # Raises HexaPDF::MalformedPDFError if a single '>' is found that is not part of '>>'.
    def next_token
      byte = nil

      # Skip whitespace and comments iteratively. Using a loop instead of recursing once per
      # comment keeps the stack depth constant for input with many consecutive comment lines.
      while true
        prepare_string_scanner(20)
        prepare_string_scanner(20) while @ss.skip(WHITESPACE_MULTI_RE)
        byte = @ss.string.getbyte(@ss.pos) || -1
        break unless byte == 37 # %

        # Advance to (but not past) the end-of-line marker terminating the comment.
        until @ss.skip_until(/(?=[\r\n])/)
          return NO_MORE_TOKENS unless prepare_string_scanner
        end
      end

      if (48 <= byte && byte <= 57) || byte == 45 || byte == 43 || byte == 46 # 0..9 - + .
        parse_number
      elsif byte == 47 # /
        parse_name
      elsif byte == 40 # (
        parse_literal_string
      elsif byte == 60 # <
        if @ss.string.getbyte(@ss.pos + 1) != 60
          parse_hex_string
        else
          @ss.pos += 2
          TOKEN_DICT_START
        end
      elsif byte == 62 # >
        unless @ss.string.getbyte(@ss.pos + 1) == 62
          raise HexaPDF::MalformedPDFError.new("Delimiter '>' found at invalid position", pos: pos)
        end
        @ss.pos += 2
        TOKEN_DICT_END
      elsif byte == 91 # [
        @ss.pos += 1
        TOKEN_ARRAY_START
      elsif byte == 93 # ]
        @ss.pos += 1
        TOKEN_ARRAY_END
      elsif byte == 123 || byte == 125 # { }
        Token.new(@ss.get_byte)
      elsif byte == -1 # we reached the end of the file
        NO_MORE_TOKENS
      else # everything else consisting of regular characters
        parse_keyword
      end
    end

    # Returns the next token but does not advance the scan pointer.
    def peek_token
      pos = self.pos
      tok = next_token
      self.pos = pos
      tok
    end

    # Returns the PDF object at the current position. This is different from #next_token because
    # references, arrays and dictionaries consist of multiple tokens.
    #
    # If the +allow_end_array_token+ argument is +true+, the ']' token is permitted to facilitate
    # the use of this method during array parsing.
    #
    # If the +allow_keyword+ argument is +true+, keyword tokens are returned as is. Otherwise a
    # keyword is reported via the correctable error mechanism (always raising unless the keyword
    # looks like 'nan' or 'inf', optionally with a leading minus) and replaced by the integer 0.
    #
    # See: PDF1.7 s7.3
    def next_object(allow_end_array_token: false, allow_keyword: false)
      token = next_token

      if token.kind_of?(Token)
        case token
        when TOKEN_DICT_START
          token = parse_dictionary
        when TOKEN_ARRAY_START
          token = parse_array
        when TOKEN_ARRAY_END
          unless allow_end_array_token
            raise HexaPDF::MalformedPDFError.new("Found invalid end array token ']'", pos: pos)
          end
        else
          unless allow_keyword
            maybe_raise("Invalid object, got token #{token}", force: token !~ /^-?(nan|inf)$/i)
            token = 0
          end
        end
      end

      token
    end

    # Returns a single integer or keyword token read from the current position and advances the scan
    # pointer. If the current position doesn't contain such a token, +nil+ is returned without
    # advancing the scan pointer. The value +NO_MORE_TOKENS+ is returned if there are no more tokens
    # available.
    #
    # Initial runs of whitespace characters are ignored.
    #
    # Note: This is a special method meant for use with reconstructing the cross-reference table!
    def next_integer_or_keyword
      skip_whitespace
      byte = @ss.string.getbyte(@ss.pos) || -1
      if 48 <= byte && byte <= 57 # 0..9
        parse_number
      elsif (97 <= byte && byte <= 122) || (65 <= byte && byte <= 90) # a..z A..Z
        parse_keyword
      elsif byte == -1 # we reached the end of the file
        NO_MORE_TOKENS
      else
        nil
      end
    end

    # Reads the byte (an integer) at the current position and advances the scan pointer.
    def next_byte
      prepare_string_scanner(1)
      @ss.pos += 1
      @ss.string.getbyte(@ss.pos - 1)
    end

    # Reads the cross-reference subsection entry at the current position and advances the scan
    # pointer.
    #
    # Returns an array with the two numbers of the entry (as integers) and its type ('n' or 'f').
    #
    # If a possible problem is detected, i.e. the entry doesn't match the expected format or is
    # not exactly 20 bytes long, yields the matched size (+nil+ if nothing matched) to the caller.
    #
    # See: PDF1.7 7.5.4
    def next_xref_entry #:yield: matched_size
      prepare_string_scanner(20)
      unless @ss.skip(/(\d{10}) (\d{5}) ([nf])(?: \r| \n|\r\n|\r|\n)/) && @ss.matched_size == 20
        yield(@ss.matched_size)
      end
      [@ss[1].to_i, @ss[2].to_i, @ss[3]]
    end

    # Skips all whitespace at the current position.
    #
    # See: PDF1.7 s7.2.2
    def skip_whitespace
      prepare_string_scanner
      prepare_string_scanner while @ss.skip(WHITESPACE_MULTI_RE)
    end

    # Utility method for scanning until the given regular expression matches.
    #
    # If the end of the file is reached in the process, +nil+ is returned. Otherwise the matched
    # string is returned.
    def scan_until(re)
      until (data = @ss.scan_until(re))
        return nil unless prepare_string_scanner
      end
      data
    end

    private

    # Maps keyword strings to their token objects so that each keyword is only allocated once.
    # The keywords 'true', 'false' and 'null' are mapped directly to their Ruby values.
    TOKEN_CACHE = Hash.new {|h, k| h[k] = Token.new(k) } # :nodoc:
    TOKEN_CACHE['true'] = true
    TOKEN_CACHE['false'] = false
    TOKEN_CACHE['null'] = nil

    # Parses the keyword at the current position.
    #
    # See: PDF1.7 s7.2
    def parse_keyword
      str = scan_until(WHITESPACE_OR_DELIMITER_RE) || @ss.scan(/.*/)
      TOKEN_CACHE[str]
    end

    REFERENCE_RE = /[#{WHITESPACE}]+([+-]?\d+)[#{WHITESPACE}]+R#{WHITESPACE_OR_DELIMITER_RE}/ # :nodoc:

    # Parses the number (integer or real) at the current position.
    #
    # An integer that is followed by a second integer and the keyword 'R' is returned as a
    # Reference object. If the scanned characters form neither an integer nor a real number, they
    # are returned as keyword token.
    #
    # See: PDF1.7 s7.3.3
    def parse_number
      val = scan_until(WHITESPACE_OR_DELIMITER_RE) || @ss.scan(/.*/)
      if val.match?(/\A[+-]?\d++(?!\.)\z/)
        tmp = val.to_i
        # Handle object references, see PDF1.7 s7.3.10
        prepare_string_scanner(10)
        tmp = Reference.new(tmp, @ss[1].to_i) if @ss.scan(REFERENCE_RE)
        tmp
      elsif val.match?(/\A[+-]?(?:\d+\.\d*|\.\d+)\z/)
        val << '0' if val.getbyte(-1) == 46 # dot '.'
        Float(val)
      else
        TOKEN_CACHE[val] # val is keyword
      end
    end

    LITERAL_STRING_ESCAPE_MAP = { #:nodoc:
      'n' => "\n",
      'r' => "\r",
      't' => "\t",
      'b' => "\b",
      'f' => "\f",
      '(' => "(",
      ')' => ")",
      '\\' => "\\",
    }.freeze

    # Parses the literal string at the current position.
    #
    # Raises HexaPDF::MalformedPDFError if the end of the input is reached before the closing
    # parenthesis is found.
    #
    # See: PDF1.7 s7.3.4.2
    def parse_literal_string
      @ss.pos += 1
      str = "".b
      parentheses = 1

      while parentheses != 0
        data = scan_until(/([()\\\r])/)
        char = @ss[1]
        unless data
          raise HexaPDF::MalformedPDFError.new("Unclosed literal string found", pos: pos)
        end

        str << data
        prepare_string_scanner if @ss.eos?
        case char
        when '(' then parentheses += 1
        when ')' then parentheses -= 1
        when "\r"
          # An end-of-line marker inside a string is always read as a single line feed.
          str[-1] = "\n"
          @ss.pos += 1 if @ss.peek(1) == "\n"
        when '\\'
          str.chop!
          # An escape sequence needs up to three bytes after the backslash (\ddd or \ + CR LF).
          # Make sure they are buffered so that a sequence is not split at a chunk boundary.
          prepare_string_scanner(3)
          byte = @ss.get_byte
          if byte.nil?
            # The backslash was the last byte of the input.
            raise HexaPDF::MalformedPDFError.new("Unclosed literal string found", pos: pos)
          elsif (data = LITERAL_STRING_ESCAPE_MAP[byte])
            str << data
          elsif byte == "\r" || byte == "\n"
            # Backslash followed by an end-of-line marker is a line continuation, nothing is added.
            @ss.pos += 1 if byte == "\r" && @ss.peek(1) == "\n"
          elsif byte >= '0' && byte <= '7'
            byte += @ss.scan(/[0-7]{0,2}/)
            # High-order overflow of the octal value is ignored, so only the lowest 8 bits count.
            str << (byte.oct & 0xFF).chr
          else
            # Unknown escape sequence: the backslash is ignored and the character is used as is.
            str << byte
          end
        end
      end

      str.chop! # remove last parsed closing parenthesis
      str
    end

    # Parses the hex string at the current position.
    #
    # Whitespace inside the hex string is removed before the hex digits are decoded.
    #
    # Raises HexaPDF::MalformedPDFError if no closing '>' is found.
    #
    # See: PDF1.7 s7.3.4.3
    def parse_hex_string
      @ss.pos += 1
      data = scan_until(/(?=>)/)
      unless data
        raise HexaPDF::MalformedPDFError.new("Unclosed hex string found", pos: pos)
      end
      @ss.pos += 1
      data.tr!(WHITESPACE, "")
      [data].pack('H*')
    end

    # Parses the name at the current position.
    #
    # Two-digit hexadecimal escapes (#xx) are replaced by the bytes they represent. The result is
    # returned as symbol, UTF-8 encoded if the bytes are valid UTF-8 and binary encoded otherwise.
    #
    # See: PDF1.7 s7.3.5
    def parse_name
      @ss.pos += 1
      str = scan_until(WHITESPACE_OR_DELIMITER_RE) || @ss.scan(/.*/)
      str.gsub!(/#[A-Fa-f0-9]{2}/) {|m| m[1, 2].hex.chr }
      if str.force_encoding(Encoding::UTF_8).valid_encoding?
        str.to_sym
      else
        str.force_encoding(Encoding::BINARY).to_sym
      end
    end

    # Parses the array at the current position.
    #
    # It is assumed that the initial '[' has already been scanned.
    #
    # Raises HexaPDF::MalformedPDFError if the end of the input is reached before the closing ']'
    # is found.
    #
    # See: PDF1.7 s7.3.6
    def parse_array
      result = []
      while true
        obj = next_object(allow_end_array_token: true)
        break if obj.equal?(TOKEN_ARRAY_END)
        # Without this check the loop would never terminate at the end of the input because
        # #next_object keeps returning NO_MORE_TOKENS.
        if obj.equal?(NO_MORE_TOKENS)
          raise HexaPDF::MalformedPDFError.new("Unclosed array found", pos: pos)
        end
        result << obj
      end
      result
    end

    # Parses the dictionary at the current position.
    #
    # It is assumed that the initial '<<' has already been scanned.
    #
    # Entries with a +nil+ (i.e. PDF null) value are not added to the result.
    #
    # Raises HexaPDF::MalformedPDFError if a key is not a PDF name object.
    #
    # See: PDF1.7 s7.3.7
    def parse_dictionary
      result = {}
      while true
        # Use #next_token because we either need a Name or the '>>' token here, the latter would
        # throw an error with #next_object.
        key = next_token
        break if key.equal?(TOKEN_DICT_END)
        unless key.kind_of?(Symbol)
          raise HexaPDF::MalformedPDFError.new("Dictionary keys must be PDF name objects", pos: pos)
        end

        val = next_object
        next if val.nil?

        result[key] = val
      end
      result
    end

    # Prepares the StringScanner by filling its string instance with enough bytes.
    #
    # The number of needed bytes can be specified via the optional +needed_bytes+ argument. If at
    # least that many bytes are still unscanned, nothing is read and +nil+ is returned.
    #
    # Otherwise returns +true+ if data was appended from the underlying IO stream and +false+ if
    # the end of the stream has already been reached.
    def prepare_string_scanner(needed_bytes = nil)
      return if needed_bytes && @ss.rest_size >= needed_bytes
      @io.seek(@next_read_pos)
      return false if @io.eof?

      @ss << @io.read(8192)
      # Drop already scanned data from the front of the buffer so that it doesn't grow without
      # bound; the position bookkeeping is adjusted so that #pos stays the same.
      if @ss.pos > 8192 && @ss.string.length > 16384
        @ss.string.slice!(0, 8192)
        @ss.pos -= 8192
        @original_pos += 8192
      end
      @next_read_pos = @io.pos
      true
    end

    # Calls the @on_correctable_error callable object with the given message and the current
    # position. If the returned value is +true+, raises a HexaPDF::MalformedPDFError. Otherwise the
    # error is corrected (by the caller) and tokenization continues.
    #
    # If the option +force+ is used, the callable object is not called and the error is raised
    # immediately.
    def maybe_raise(msg, force: false)
      if force || @on_correctable_error.call(msg, pos)
        error = HexaPDF::MalformedPDFError.new(msg, pos: pos)
        error.set_backtrace(caller(1))
        raise error
      end
    end

  end

end