lib/combine_pdf/parser.rb in combine_pdf-0.2.37 vs lib/combine_pdf/parser.rb in combine_pdf-1.0.0

- old
+ new

@@ -105,11 +105,11 @@ PDFFilter.inflate_object o ## extract objects from stream to top level arry @parsed @scanner = StringScanner.new o[:raw_stream_content] stream_data = _parse_ id_array = [] - while stream_data[0].is_a? (Integer) + while stream_data[0].is_a? (Numeric) id_array << stream_data.shift stream_data.shift end while id_array[0] && stream_data[0] stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash) @@ -178,62 +178,39 @@ # puts "next is #{@scanner.peek 8}" # end unless (last == out.count) || (-1 == (last = out.count)) if @scanner.scan(/\[/) out << _parse_ ########################################## - ## parse a Dictionary + ## Parse a Name ########################################## - elsif @scanner.scan(/<</) - data = _parse_ - obj = {} - obj[data.shift] = data.shift while data[0] - out << obj + # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/) + # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/ + # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+ + # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+ + elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/) + out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym ########################################## - ## return content of array or dictionary + ## Parse a Number ########################################## - elsif @scanner.scan(/\]/) || @scanner.scan(/>>/) - return out + elsif str = @scanner.scan(/[\+\-\.\d]+/) + str =~ /\./ ? (out << str.to_f) : (out << str.to_i) ########################################## - ## parse a Stream - ########################################## - elsif @scanner.scan(/stream[\r\n]/) - @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze - # the following was dicarded because some PDF files didn't have an EOL marker as required - # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/) - # instead, a non-strict RegExp is used: - str = @scanner.scan_until(/endstream/) - # raise error if the stream doesn't end. - raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str - # need to remove end of stream - if out.last.is_a? Hash - # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r) - out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT) - else - warn 'Stream not attached to dictionary!' - out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT) - end - ########################################## - ## parse an Object after finished - ########################################## - elsif str = @scanner.scan(/endobj/) - # what to do when this is an object? - if out.last.is_a? Hash - out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop) - else - out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop } - end - fresh = true - # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings - out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol) - # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer) - ########################################## ## parse a Hex String ########################################## - elsif str = @scanner.scan(/<[0-9a-fA-F]*>/) + elsif str = @scanner.scan(/\<[0-9a-fA-F]*\>/) # warn "Found a hex string" - out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT)) + str = str.slice(1..-2).force_encoding(Encoding::ASCII_8BIT) + # str = "0#{str}" if str.length.odd? + out << unify_string([str].pack('H*').force_encoding(Encoding::ASCII_8BIT)) ########################################## + ## parse a space delimited Hex String + ########################################## + elsif str = @scanner.scan(/\<[0-9a-fA-F\s]*\>/) + # warn "Found a space seperated hex string" + str = str.force_encoding(Encoding::ASCII_8BIT).split(/\s/).map! {|b| b.length.odd? ? "0#{b}" : b} + out << unify_string(str.pack('H*' * str.length).force_encoding(Encoding::ASCII_8BIT)) + ########################################## ## parse a Literal String ########################################## elsif @scanner.scan(/\(/) # warn "Found a literal string" str = ''.force_encoding(Encoding::ASCII_8BIT) @@ -313,10 +290,56 @@ str << str_bytes.shift end end out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT)) ########################################## + ## parse a Dictionary + ########################################## + elsif @scanner.scan(/<</) + data = _parse_ + obj = {} + obj[data.shift] = data.shift while data[0] + out << obj + ########################################## + ## return content of array or dictionary + ########################################## + elsif @scanner.scan(/\]/) || @scanner.scan(/>>/) + return out + ########################################## + ## parse a Stream + ########################################## + elsif @scanner.scan(/stream[\r\n]/) + @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze + # the following was dicarded because some PDF files didn't have an EOL marker as required + # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/) + # instead, a non-strict RegExp is used: + str = @scanner.scan_until(/endstream/) + # raise error if the stream doesn't end. + raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str + # need to remove end of stream + if out.last.is_a? Hash + # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r) + out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT) + else + warn 'Stream not attached to dictionary!' + out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT) + end + ########################################## + ## parse an Object after finished + ########################################## + elsif str = @scanner.scan(/endobj/) + # what to do when this is an object? + if out.last.is_a? Hash + out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop) + else + out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop } + end + fresh = true + # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings + out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol) + # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Numeric) + ########################################## ## Parse a comment ########################################## elsif str = @scanner.scan(/\%/) # is a comment, skip until new line loop do @@ -324,24 +347,10 @@ break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) || @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1 end # puts "AFTER COMMENT: #{@scanner.peek 8}" ########################################## - ## Parse a Name - ########################################## - # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/) - # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/ - # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+ - # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+ - elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/) - out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym - ########################################## - ## Parse a Number - ########################################## - elsif str = @scanner.scan(/[\+\-\.\d]+/) - str =~ /\./ ? (out << str.to_f) : (out << str.to_i) - ########################################## ## Parse an Object Reference ########################################## elsif @scanner.scan(/R/) out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop } # @references << out.last @@ -560,10 +569,10 @@ if o[:is_reference_only] if o[:indirect_reference_id].nil? o = nil else o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]] - warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? + warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? && (o[:indirect_reference_id] + o[:indirect_generation_number] != 0) o.delete :indirect_reference_id o.delete :indirect_generation_number o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o end obj[k] = o