# -*- encoding : utf-8 -*- ######################################################## ## Thoughts from reading the ISO 32000-1:2008 ## this file is part of the CombinePDF library and the code ## is subject to the same license. ######################################################## module CombinePDF # @!visibility private # @private #:nodoc: all protected # This is the Parser class. # # It takes PDF data and parses it. # # The information is then used to initialize a PDF object. # # This is an internal class. you don't need it. class PDFParser # @!visibility private # the array containing all the parsed data (PDF Objects) attr_reader :parsed # a Float representing the PDF version of the data parsed (if exists). attr_reader :version # the info and root objects, as found (if found) in the PDF file. # # they are mainly to used to know if the file is (was) encrypted and to get more details. attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object, :metadata attr_reader :allow_optional_content # when creating a parser, it is important to set the data (String) we wish to parse. # # the data is required and it is not possible to set the data at a later stage # # string:: the data to be parsed, as a String object. def initialize(string, options = {}) raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT) @literal_strings = [].dup @hex_strings = [].dup @streams = [].dup @parsed = [].dup @references = [].dup @root_object = {}.dup @info_object = {}.dup @names_object = {}.dup @outlines_object = {}.dup @forms_object = {}.dup @metadata = nil @strings_dictionary = {}.dup # all strings are one string @version = nil @scanner = nil @allow_optional_content = options[:allow_optional_content] end # parse the data in the new parser (the data already set through the initialize / new method) def parse return [] if @string_to_parse.empty? return @parsed unless @parsed.empty? @scanner = StringScanner.new @string_to_parse @scanner.pos = 0 @scanner.skip(/[^%]*/) if @scanner.exist?(/%PDF/i) if @scanner.scan /\%PDF\-[\d\-\.]+/ @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f loop do break unless @scanner.scan(/[^\d\r\n]+/) break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) break if @scanner.eos? @scanner.pos += 1 end end @parsed = _parse_ # puts @parsed raise 'Unknown PDF parsing error - maleformed PDF file?' unless (@parsed.select { |i| !i.is_a?(Hash) }).empty? if @root_object == {}.freeze xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef } xref_streams.each do |xref_dictionary| @root_object.merge! xref_dictionary end end raise 'root is unknown - cannot determine if file is Encrypted' if @root_object == {}.freeze if @root_object[:Encrypt] # change_references_to_actual_values @root_object warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.' decryptor = PDFDecrypt.new @parsed, @root_object decryptor.decrypt # do we really need to apply to @parsed? No, there is no need. end ## search for objects streams object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm } unless object_streams.empty? warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.' object_streams.each do |o| ## un-encode (using the correct filter) the object streams PDFFilter.inflate_object o ## extract objects from stream to top level arry @parsed @scanner = StringScanner.new o[:raw_stream_content] stream_data = _parse_ id_array = [] while stream_data[0].is_a? (Integer) id_array << stream_data.shift stream_data.shift end while id_array[0] && stream_data[0] stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash) stream_data[0][:indirect_reference_id] = id_array.shift stream_data[0][:indirect_generation_number] = 0 @parsed << stream_data.shift end end end # Strings were unified, we can let them go.. @strings_dictionary.clear # serialize_objects_and_references.catalog_pages # Benchmark.bm do |bm| # bm.report("serialize") {1000.times {serialize_objects_and_references} } # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} } # bm.report("catalog") {1000.times {catalog_pages} } # end serialize_objects_and_references catalog_pages # collect any missing objects from the forms_data unless @forms_object.nil? || @forms_object.empty? @forms_object[:related_objects] = (@parsed.select { |o| o[:FT] }).map! { |o| { is_reference_only: true, referenced_object: o } } @forms_object[:related_objects].delete @forms_object end @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false if @info_object && @info_object.is_a?(Hash) @parsed.delete @info_object CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key } @info_object.each { |_k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] } else @info_object = {} end # # # ## remove object streams - if they exist # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm} # # # ## remove XREF dictionaries - if they exist # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef} @parsed end # the actual recoursive parsing is done here. # # this is an internal function, but it was left exposed for posible future features. def _parse_ out = [] str = '' fresh = true while @scanner.rest? # last ||= 0 # out.last.tap do |o| # if o.is_a?(Hash) # puts "[#{@scanner.pos}] Parser has a Dictionary (#{o.class.name}) with data:" # o.each do |k, v| # puts " #{k}: is #{v.class.name} with data: #{v.to_s[0..4]}#{"..." if v.to_s.length > 5}" # end # else # puts "[#{@scanner.pos}] Parser has #{o.class.name} with data: #{o.to_s[0..4]}#{"..." if o.to_s.length > 5}" # end # puts "next is #{@scanner.peek 8}" # end unless (last == out.count) || (-1 == (last = out.count)) if @scanner.scan(/\[/) out << _parse_ ########################################## ## parse a Dictionary ########################################## elsif @scanner.scan(/<>/) return out ########################################## ## parse a Stream ########################################## elsif @scanner.scan(/stream[\r\n]/) @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze # the following was dicarded because some PDF files didn't have an EOL marker as required # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/) # instead, a non-strict RegExp is used: str = @scanner.scan_until(/endstream/) # raise error if the stream doesn't end. raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str # need to remove end of stream if out.last.is_a? Hash # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r) out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT) else warn 'Stream not attached to dictionary!' out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT) end ########################################## ## parse an Object after finished ########################################## elsif str = @scanner.scan(/endobj/) # what to do when this is an object? if out.last.is_a? Hash out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop) else out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop } end fresh = true # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer) ########################################## ## parse a Hex String ########################################## elsif str = @scanner.scan(/<[0-9a-fA-F]*>/) # warn "Found a hex string" out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT)) ########################################## ## parse a Literal String ########################################## elsif @scanner.scan(/\(/) # warn "Found a literal string" str = ''.force_encoding(Encoding::ASCII_8BIT) count = 1 while count > 0 && @scanner.rest? scn = @scanner.scan_until(/[\(\)]/) unless scn warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!" count = 0 # error next end str += scn.to_s seperator_count = 0 seperator_count += 1 while str[-2 - seperator_count] == '\\' case str[-1] when '(' ## The following solution might fail when (string ends with this sign: \\) count += 1 unless seperator_count.odd? when ')' count -= 1 unless seperator_count.odd? else warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!" count = 0 # error end end # The PDF formatted string is: str[0..-2] # now starting to convert to regular string str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a str = [] until str_bytes.empty? case str_bytes[0] when 13 # eol - \r # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS # shall be treated as a byte value of (0Ah), # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both. str_bytes.shift str_bytes.shift if str_bytes[0] == 10 str << 10 when 10 # eol - \n # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS # shall be treated as a byte value of (0Ah), # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both. str_bytes.shift str_bytes.shift if str_bytes[0] == 13 str << 10 when 92 # "\\".ord == 92 str_bytes.shift rep = str_bytes.shift case rep when 110 # n str << 10 # new line when 114 # r str << 13 # CR when 116 # t str << 9 # tab when 98 # b str << 8 when 102 # f, form-feed str << 12 when 48..57 # octal notation for byte? rep = rep.chr rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57) rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57) && ((rep + str_bytes[0].chr).to_i <= 255) str << rep.to_i when 10 # new line, ignore str_bytes.shift if str_bytes[0] == 13 true when 13 # new line (or double notation for new line), ignore str_bytes.shift if str_bytes[0] == 10 true else str << rep end else str << str_bytes.shift end end out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT)) ########################################## ## Parse a comment ########################################## elsif str = @scanner.scan(/\%/) # is a comment, skip until new line loop do # break unless @scanner.scan(/[^\d\r\n]+/) break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) || @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1 end # puts "AFTER COMMENT: #{@scanner.peek 8}" ########################################## ## Parse a Name ########################################## # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/) # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/ # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+ # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+ elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/) out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym ########################################## ## Parse a Number ########################################## elsif str = @scanner.scan(/[\+\-\.\d]+/) str =~ /\./ ? (out << str.to_f) : (out << str.to_i) ########################################## ## Parse an Object Reference ########################################## elsif @scanner.scan(/R/) out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop } # @references << out.last ########################################## ## Parse Bool - true and after false ########################################## elsif @scanner.scan(/true/) out << true elsif @scanner.scan(/false/) out << false ########################################## ## Parse NULL - null ########################################## elsif @scanner.scan(/null/) out << nil ########################################## ## XREF - check for encryption... anything else? ########################################## elsif @scanner.scan(/(startxref)|(xref)/) ########## ## get root object to check for encryption @scanner.scan_until(/(trailer)|(\%EOF)/) fresh = true if @scanner.matched[-1] == 'r' if @scanner.skip_until(/< 0 obj = should_resolve.pop if obj.is_a?(Hash) obj.keys.each do |k| o = obj[k] if o.is_a?(Hash) if o[:is_reference_only] if o[:indirect_reference_id].nil? o = nil else o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]] warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? o.delete :indirect_reference_id o.delete :indirect_generation_number o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o end obj[k] = o else should_resolve << o end elsif o.is_a?(Array) should_resolve << o end end elsif obj.is_a?(Array) obj.map! do |o| if o.is_a?(Hash) if o[:is_reference_only] if o[:indirect_reference_id].nil? o = nil else o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]] warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? o.delete :indirect_reference_id o.delete :indirect_generation_number o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o end else should_resolve << o end elsif o.is_a?(Array) should_resolve << o end o end end end end # def serialize_objects_and_references # rec_resolve = proc do |level| # if level.is_a?(Hash) # if level[:is_reference_only] # level[:referenced_object] = get_refernced_object(level) # level = (level[:referenced_object] && level[:referenced_object][:indirect_without_dictionary]) || level # level.delete :indirect_reference_id # level.delete :indirect_generation_number # else # level.keys.each do |k| # level[k] = rec_resolve.call(level[k]) unless level[k].is_a?(Hash) && level[k][:indirect_reference_id] && level[k][:is_reference_only].nil? # end # end # elsif level.is_a?(Array) # level.map! { |o| rec_resolve.call(o) } # end # level # end # rec_resolve.call(@root_object) # rec_resolve.call(@parsed) # self # end # All Strings are one String def unify_string(str) @strings_dictionary[str] ||= str end # @private # this method reviews a Hash and updates it by merging Hash data, # preffering the old over the new. def self.hash_update_proc_for_old(_key, old_data, new_data) if old_data.is_a? Hash old_data.merge(new_data, &method(:hash_update_proc_for_old)) else old_data end end # @private # this method reviews a Hash an updates it by merging Hash data, # preffering the new over the old. def self.hash_update_proc_for_new(_key, old_data, new_data) if old_data.is_a? Hash old_data.merge(new_data, &method(:hash_update_proc_for_new)) else new_data end end # # run block of code on evey PDF object (PDF objects are class Hash) # def each_object(object, limit_references = true, already_visited = {}, &block) # unless limit_references # already_visited[object.object_id] = true # end # case # when object.is_a?(Array) # object.each {|obj| each_object(obj, limit_references, already_visited, &block)} # when object.is_a?(Hash) # yield(object) # unless limit_references && object[:is_reference_only] # object.each do |k,v| # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id] # end # end # end # end end end