# -*- encoding: utf-8; frozen_string_literal: true -*-
#
#--
# This file is part of HexaPDF.
#
# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
# Copyright (C) 2014-2020 Thomas Leitner
#
# HexaPDF is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License version 3 as
# published by the Free Software Foundation with the addition of the
# following permission added to Section 15 as permitted in Section 7(a):
# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
# INFRINGEMENT OF THIRD PARTY RIGHTS.
#
# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with HexaPDF. If not, see <https://www.gnu.org/licenses/>.
#
# The interactive user interfaces in modified source and object code
# versions of HexaPDF must display Appropriate Legal Notices, as required
# under Section 5 of the GNU Affero General Public License version 3.
#
# In accordance with Section 7(b) of the GNU Affero General Public
# License, a covered work must retain the producer line in every PDF that
# is created or manipulated using HexaPDF.
#
# If the GNU Affero General Public License doesn't fit your need,
# commercial licenses are available at <https://gettalong.at/hexapdf/>.
#++
require 'strscan'
require 'hexapdf/error'
require 'hexapdf/reference'
module HexaPDF
# Tokenizes the content of an IO object following the PDF rules.
#
# See: PDF1.7 s7.2
class Tokenizer

  # Represents a keyword in a PDF file.
  #
  # Subclassing String lets keyword tokens be distinguished (via kind_of?(Token)) from
  # parsed PDF string objects while still behaving like plain strings.
  class Token < String; end

  TOKEN_DICT_START = Token.new('<<'.b) # :nodoc:
  TOKEN_DICT_END = Token.new('>>'.b) # :nodoc:
  TOKEN_ARRAY_START = Token.new('['.b) # :nodoc:
  TOKEN_ARRAY_END = Token.new(']'.b) # :nodoc:

  # This object is returned when there are no more tokens to read.
  NO_MORE_TOKENS = ::Object.new

  # Characters defined as whitespace.
  #
  # See: PDF1.7 s7.2.2
  WHITESPACE = " \n\r\0\t\f"

  # Characters defined as delimiters.
  #
  # See: PDF1.7 s7.2.2
  DELIMITER = "()<>{}/[]%"

  WHITESPACE_MULTI_RE = /[#{WHITESPACE}]+/ # :nodoc:

  # Zero-width look-ahead: matches *before* a whitespace or delimiter character without
  # consuming it, so a scan stops exactly at a token boundary.
  WHITESPACE_OR_DELIMITER_RE = /(?=[#{Regexp.escape(WHITESPACE + DELIMITER)}])/ # :nodoc:

  # The IO object from the tokens are read.
  attr_reader :io

  # Creates a new tokenizer for the given IO stream.
  #
  # If +on_correctable_error+ is set to an object responding to +call(msg, pos)+, errors for
  # correctable situations are only raised if the return value of calling the object is +true+.
  def initialize(io, on_correctable_error: nil)
    @io = io
    @ss = StringScanner.new(''.b)
    # Sentinel value: -1 guarantees that the `self.pos = 0` call below takes the
    # reset branch of #pos= and schedules the first read from the start of the IO.
    @original_pos = -1
    @on_correctable_error = on_correctable_error || proc { false }
    self.pos = 0
  end

  # Returns the current position of the tokenizer inside in the IO object.
  #
  # Note that this position might be different from +io.pos+ since the latter could have been
  # changed somewhere else.
  def pos
    # @original_pos is the IO offset of the first byte currently held in the
    # scanner's buffer; adding the scan pointer yields the absolute position.
    @original_pos + @ss.pos
  end

  # Sets the position at which the next token should be read.
  #
  # Note that this does **not** set +io.pos+ directly (at the moment of invocation)!
  def pos=(pos)
    if pos >= @original_pos && pos <= @original_pos + @ss.string.size
      # The target position is still inside the buffered window - just move the
      # scan pointer, no IO access needed.
      @ss.pos = pos - @original_pos
    else
      # Outside the buffered window: drop the buffer and remember where the next
      # refill (see #prepare_string_scanner) has to seek to.
      @original_pos = pos
      @next_read_pos = pos
      @ss.string.clear
      @ss.reset
    end
  end

  # Returns a single token read from the current position and advances the scan pointer.
  #
  # Comments and a run of whitespace characters are ignored. The value +NO_MORE_TOKENS+ is
  # returned if there are no more tokens available.
  def next_token
    # 20 bytes are enough to classify the token type and to parse short tokens outright.
    prepare_string_scanner(20)
    # Keep refilling while the buffer ends in a whitespace run (the run may continue).
    prepare_string_scanner(20) while @ss.skip(WHITESPACE_MULTI_RE)
    # -1 acts as an EOF marker when the buffer is exhausted.
    byte = @ss.string.getbyte(@ss.pos) || -1
    if (48 <= byte && byte <= 57) || byte == 45 || byte == 43 || byte == 46 # 0..9 - + .
      parse_number
    elsif byte == 47 # /
      parse_name
    elsif byte == 40 # (
      parse_literal_string
    elsif byte == 60 # <
      # A single '<' starts a hex string, '<<' starts a dictionary.
      if @ss.string.getbyte(@ss.pos + 1) != 60
        parse_hex_string
      else
        @ss.pos += 2
        TOKEN_DICT_START
      end
    elsif byte == 62 # >
      # A lone '>' is only valid as part of the '>>' dictionary end token.
      unless @ss.string.getbyte(@ss.pos + 1) == 62
        raise HexaPDF::MalformedPDFError.new("Delimiter '>' found at invalid position", pos: pos)
      end
      @ss.pos += 2
      TOKEN_DICT_END
    elsif byte == 91 # [
      @ss.pos += 1
      TOKEN_ARRAY_START
    elsif byte == 93 # ]
      @ss.pos += 1
      TOKEN_ARRAY_END
    elsif byte == 123 || byte == 125 # { }
      # Braces only appear in PostScript calculator functions; pass them through as tokens.
      Token.new(@ss.get_byte)
    elsif byte == 37 # %
      # Comment: skip everything up to (but not including) the next EOL character,
      # refilling the buffer as needed, then recurse to fetch the real next token.
      until @ss.skip_until(/(?=[\r\n])/)
        return NO_MORE_TOKENS unless prepare_string_scanner
      end
      next_token
    elsif byte == -1 # we reached the end of the file
      NO_MORE_TOKENS
    else # everything else consisting of regular characters
      parse_keyword
    end
  end

  # Returns the next token but does not advance the scan pointer.
  def peek_token
    # Save/restore via #pos so peeking works even across a buffer refill.
    pos = self.pos
    tok = next_token
    self.pos = pos
    tok
  end

  # Returns the PDF object at the current position. This is different from #next_token because
  # references, arrays and dictionaries consist of multiple tokens.
  #
  # If the +allow_end_array_token+ argument is +true+, the ']' token is permitted to facilitate
  # the use of this method during array parsing.
  #
  # See: PDF1.7 s7.3
  def next_object(allow_end_array_token: false, allow_keyword: false)
    token = next_token
    if token.kind_of?(Token)
      case token
      when TOKEN_DICT_START
        token = parse_dictionary
      when TOKEN_ARRAY_START
        token = parse_array
      when TOKEN_ARRAY_END
        unless allow_end_array_token
          raise HexaPDF::MalformedPDFError.new("Found invalid end array token ']'", pos: pos)
        end
      else
        unless allow_keyword
          # 'NaN'/'Inf' keywords are treated as correctable (the callback decides whether
          # to raise); any other unexpected keyword always raises. On correction the
          # invalid token is replaced by the number 0.
          maybe_raise("Invalid object, got token #{token}", force: token !~ /^-?(nan|inf)$/i)
          token = 0
        end
      end
    end
    token
  end

  # Returns a single integer or keyword token read from the current position and advances the scan
  # pointer. If the current position doesn't contain such a token, +nil+ is returned without
  # advancing the scan pointer. The value +NO_MORE_TOKENS+ is returned if there are no more tokens
  # available.
  #
  # Initial runs of whitespace characters are ignored.
  #
  # Note: This is a special method meant for use with reconstructing the cross-reference table!
  def next_integer_or_keyword
    skip_whitespace
    byte = @ss.string.getbyte(@ss.pos) || -1
    if 48 <= byte && byte <= 57 # 0..9
      parse_number
    elsif (97 <= byte && byte <= 122) || (65 <= byte && byte <= 90) # a..z A..Z
      parse_keyword
    elsif byte == -1 # we reached the end of the file
      NO_MORE_TOKENS
    else
      nil
    end
  end

  # Reads the byte (an integer) at the current position and advances the scan pointer.
  def next_byte
    prepare_string_scanner(1)
    @ss.pos += 1
    # Returns nil at end of file (getbyte past the buffer).
    @ss.string.getbyte(@ss.pos - 1)
  end

  # Reads the cross-reference subsection entry at the current position and advances the scan
  # pointer.
  #
  # If a possible problem is detected, yields to caller.
  #
  # See: PDF1.7 7.5.4
  def next_xref_entry #:yield: matched_size
    prepare_string_scanner(20)
    # A valid entry is exactly 20 bytes: 10-digit offset, 5-digit generation, type 'n'/'f'
    # and a two-byte EOL. The single-char EOL alternatives tolerate slightly malformed
    # files; those match only 19 bytes and therefore trigger the yield so the caller can
    # decide how to proceed (captures may be nil if the match failed entirely).
    unless @ss.skip(/(\d{10}) (\d{5}) ([nf])(?: \r| \n|\r\n|\r|\n)/) && @ss.matched_size == 20
      yield(@ss.matched_size)
    end
    [@ss[1].to_i, @ss[2].to_i, @ss[3]]
  end

  # Skips all whitespace at the current position.
  #
  # See: PDF1.7 s7.2.2
  def skip_whitespace
    prepare_string_scanner
    # Keep refilling while the buffer ends inside a whitespace run.
    prepare_string_scanner while @ss.skip(WHITESPACE_MULTI_RE)
  end

  # Utility method for scanning until the given regular expression matches.
  #
  # If the end of the file is reached in the process, +nil+ is returned. Otherwise the matched
  # string is returned.
  def scan_until(re)
    until (data = @ss.scan_until(re))
      # No match in the buffered data - read more; give up at EOF.
      return nil unless prepare_string_scanner
    end
    data
  end

  private

  # Maps keyword strings to their token objects; 'true', 'false' and 'null' map directly
  # to the corresponding Ruby values instead of Token instances.
  TOKEN_CACHE = Hash.new {|h, k| h[k] = Token.new(k) } # :nodoc:
  TOKEN_CACHE['true'] = true
  TOKEN_CACHE['false'] = false
  TOKEN_CACHE['null'] = nil

  # Parses the keyword at the current position.
  #
  # See: PDF1.7 s7.2
  def parse_keyword
    # Fall back to consuming the rest of the buffer when EOF is reached before a
    # whitespace/delimiter boundary.
    str = scan_until(WHITESPACE_OR_DELIMITER_RE) || @ss.scan(/.*/)
    TOKEN_CACHE[str]
  end

  # Matches the "GEN R" part following an integer that makes it an object reference.
  REFERENCE_RE = /[#{WHITESPACE}]+([+-]?\d+)[#{WHITESPACE}]+R#{WHITESPACE_OR_DELIMITER_RE}/ # :nodoc:

  # Parses the number (integer or real) at the current position.
  #
  # See: PDF1.7 s7.3.3
  def parse_number
    val = scan_until(WHITESPACE_OR_DELIMITER_RE) || @ss.scan(/.*/)
    # Possessive digits plus (?!\.) ensure e.g. "1." is not treated as an integer.
    if val.match?(/\A[+-]?\d++(?!\.)\z/)
      tmp = val.to_i
      # Handle object references, see PDF1.7 s7.3.10
      prepare_string_scanner(10)
      tmp = Reference.new(tmp, @ss[1].to_i) if @ss.scan(REFERENCE_RE)
      tmp
    elsif val.match?(/\A[+-]?(?:\d+\.\d*|\.\d+)\z/)
      # Kernel#Float rejects a trailing dot, so "1." becomes "1.0" first.
      val << '0' if val.getbyte(-1) == 46 # dot '.'
      Float(val)
    else
      TOKEN_CACHE[val] # val is keyword
    end
  end

  # Maps backslash escape characters in literal strings to their replacements.
  #
  # See: PDF1.7 s7.3.4.2
  LITERAL_STRING_ESCAPE_MAP = { #:nodoc:
    'n' => "\n",
    'r' => "\r",
    't' => "\t",
    'b' => "\b",
    'f' => "\f",
    '(' => "(",
    ')' => ")",
    '\\' => "\\",
  }.freeze

  # Parses the literal string at the current position.
  #
  # See: PDF1.7 s7.3.4.2
  def parse_literal_string
    @ss.pos += 1
    str = "".b
    # Unescaped parentheses must balance; the counter tracks the nesting depth.
    parentheses = 1
    while parentheses != 0
      # Scan up to and including the next character of interest: parenthesis,
      # backslash escape or carriage return (which needs EOL normalization).
      data = scan_until(/([()\\\r])/)
      char = @ss[1]
      unless data
        raise HexaPDF::MalformedPDFError.new("Unclosed literal string found", pos: pos)
      end
      str << data
      # Make sure at least one byte of look-ahead is available for the cases below.
      prepare_string_scanner if @ss.eos?
      case char
      when '(' then parentheses += 1
      when ')' then parentheses -= 1
      when "\r"
        # Normalize CR and CRLF end-of-line sequences to a single LF (s7.3.4.2).
        str[-1] = "\n"
        @ss.pos += 1 if @ss.peek(1) == "\n"
      when '\\'
        # Remove the backslash itself, then interpret the escape.
        str.chop!
        byte = @ss.get_byte
        if (data = LITERAL_STRING_ESCAPE_MAP[byte])
          str << data
        elsif byte == "\r" || byte == "\n"
          # Escaped EOL: line continuation, nothing is added to the string.
          @ss.pos += 1 if byte == "\r" && @ss.peek(1) == "\n"
        elsif byte >= '0' && byte <= '7'
          # Octal escape: up to two more octal digits may follow.
          byte += @ss.scan(/[0-7]{0,2}/)
          str << byte.oct.chr
        else
          # Unknown escape: the backslash is dropped, the character kept as-is.
          str << byte
        end
      end
    end
    str.chop! # remove last parsed closing parenthesis
    str
  end

  # Parses the hex string at the current position.
  #
  # See: PDF1.7 s7.3.4.3
  def parse_hex_string
    @ss.pos += 1
    data = scan_until(/(?=>)/)
    unless data
      raise HexaPDF::MalformedPDFError.new("Unclosed hex string found", pos: pos)
    end
    @ss.pos += 1
    # Whitespace between hex digits is allowed and ignored; pack('H*') implicitly
    # treats a missing final digit as 0, as required by the spec.
    data.tr!(WHITESPACE, "")
    [data].pack('H*')
  end

  # Parses the name at the current position.
  #
  # See: PDF1.7 s7.3.5
  def parse_name
    @ss.pos += 1
    str = scan_until(WHITESPACE_OR_DELIMITER_RE) || @ss.scan(/.*/)
    # Decode '#xx' two-digit hex escapes (s7.3.5).
    str.gsub!(/#[A-Fa-f0-9]{2}/) {|m| m[1, 2].hex.chr }
    # Prefer a UTF-8 symbol; fall back to binary for byte sequences that aren't valid UTF-8.
    if str.force_encoding(Encoding::UTF_8).valid_encoding?
      str.to_sym
    else
      str.force_encoding(Encoding::BINARY).to_sym
    end
  end

  # Parses the array at the current position.
  #
  # It is assumed that the initial '[' has already been scanned.
  #
  # See: PDF1.7 s7.3.6
  def parse_array
    result = []
    while true
      obj = next_object(allow_end_array_token: true)
      break if obj.equal?(TOKEN_ARRAY_END)
      result << obj
    end
    result
  end

  # Parses the dictionary at the current position.
  #
  # It is assumed that the initial '<<' has already been scanned.
  #
  # See: PDF1.7 s7.3.7
  def parse_dictionary
    result = {}
    while true
      # Use #next_token because we either need a Name or the '>>' token here, the latter would
      # throw an error with #next_object.
      key = next_token
      break if key.equal?(TOKEN_DICT_END)
      unless key.kind_of?(Symbol)
        raise HexaPDF::MalformedPDFError.new("Dictionary keys must be PDF name objects", pos: pos)
      end
      val = next_object
      # A null value is equivalent to the key being absent (s7.3.7), so skip it.
      next if val.nil?
      result[key] = val
    end
    result
  end

  # Prepares the StringScanner by filling its string instance with enough bytes.
  #
  # The number of needed bytes can be specified via the optional +needed_bytes+ argument.
  #
  # Returns +true+ if the end of the underlying IO stream has not been reached, yet.
  def prepare_string_scanner(needed_bytes = nil)
    # Fast path: enough unscanned bytes already buffered (returns nil, callers that
    # check the return value always call without needed_bytes).
    return if needed_bytes && @ss.rest_size >= needed_bytes
    @io.seek(@next_read_pos)
    return false if @io.eof?
    @ss << @io.read(8192)
    # Bound memory usage: once more than 8k has been scanned and the buffer exceeds
    # 16k, drop the first 8k and shift the window offset accordingly.
    if @ss.pos > 8192 && @ss.string.length > 16384
      @ss.string.slice!(0, 8192)
      @ss.pos -= 8192
      @original_pos += 8192
    end
    @next_read_pos = @io.pos
    true
  end

  # Calls the @on_correctable_error callable object with the given message and the current
  # position. If the returned value is +true+, raises a HexaPDF::MalformedPDFError. Otherwise the
  # error is corrected (by the caller) and tokenization continues.
  #
  # If the option +force+ is used, the callable object is not called and the error is raised
  # immediately.
  def maybe_raise(msg, force: false)
    if force || @on_correctable_error.call(msg, pos)
      error = HexaPDF::MalformedPDFError.new(msg, pos: pos)
      # Point the backtrace at the caller, not at this helper.
      error.set_backtrace(caller(1))
      raise error
    end
  end

end
end