lib/combine_pdf/parser.rb in combine_pdf-0.2.37 vs lib/combine_pdf/parser.rb in combine_pdf-1.0.0
- old
+ new
@@ -105,11 +105,11 @@
PDFFilter.inflate_object o
## extract objects from stream to top level arry @parsed
@scanner = StringScanner.new o[:raw_stream_content]
stream_data = _parse_
id_array = []
- while stream_data[0].is_a? (Integer)
+ while stream_data[0].is_a? (Numeric)
id_array << stream_data.shift
stream_data.shift
end
while id_array[0] && stream_data[0]
stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
@@ -178,62 +178,39 @@
# puts "next is #{@scanner.peek 8}"
# end unless (last == out.count) || (-1 == (last = out.count))
if @scanner.scan(/\[/)
out << _parse_
##########################################
- ## parse a Dictionary
+ ## Parse a Name
##########################################
- elsif @scanner.scan(/<</)
- data = _parse_
- obj = {}
- obj[data.shift] = data.shift while data[0]
- out << obj
+ # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
+ # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
+ # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
+ # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
+ elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
+ out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
##########################################
- ## return content of array or dictionary
+ ## Parse a Number
##########################################
- elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
- return out
+ elsif str = @scanner.scan(/[\+\-\.\d]+/)
+ str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
##########################################
- ## parse a Stream
- ##########################################
- elsif @scanner.scan(/stream[\r\n]/)
- @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
- # the following was dicarded because some PDF files didn't have an EOL marker as required
- # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
- # instead, a non-strict RegExp is used:
- str = @scanner.scan_until(/endstream/)
- # raise error if the stream doesn't end.
- raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
- # need to remove end of stream
- if out.last.is_a? Hash
- # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
- out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
- else
- warn 'Stream not attached to dictionary!'
- out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
- end
- ##########################################
- ## parse an Object after finished
- ##########################################
- elsif str = @scanner.scan(/endobj/)
- # what to do when this is an object?
- if out.last.is_a? Hash
- out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
- else
- out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
- end
- fresh = true
- # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
- out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
- # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
- ##########################################
## parse a Hex String
##########################################
- elsif str = @scanner.scan(/<[0-9a-fA-F]*>/)
+ elsif str = @scanner.scan(/\<[0-9a-fA-F]*\>/)
# warn "Found a hex string"
- out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
+ str = str.slice(1..-2).force_encoding(Encoding::ASCII_8BIT)
+ # str = "0#{str}" if str.length.odd?
+ out << unify_string([str].pack('H*').force_encoding(Encoding::ASCII_8BIT))
##########################################
+ ## parse a space delimited Hex String
+ ##########################################
+ elsif str = @scanner.scan(/\<[0-9a-fA-F\s]*\>/)
+ # warn "Found a space seperated hex string"
+ str = str.force_encoding(Encoding::ASCII_8BIT).split(/\s/).map! {|b| b.length.odd? ? "0#{b}" : b}
+ out << unify_string(str.pack('H*' * str.length).force_encoding(Encoding::ASCII_8BIT))
+ ##########################################
## parse a Literal String
##########################################
elsif @scanner.scan(/\(/)
# warn "Found a literal string"
str = ''.force_encoding(Encoding::ASCII_8BIT)
@@ -313,10 +290,56 @@
str << str_bytes.shift
end
end
out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
##########################################
+ ## parse a Dictionary
+ ##########################################
+ elsif @scanner.scan(/<</)
+ data = _parse_
+ obj = {}
+ obj[data.shift] = data.shift while data[0]
+ out << obj
+ ##########################################
+ ## return content of array or dictionary
+ ##########################################
+ elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
+ return out
+ ##########################################
+ ## parse a Stream
+ ##########################################
+ elsif @scanner.scan(/stream[\r\n]/)
+ @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
+ # the following was dicarded because some PDF files didn't have an EOL marker as required
+ # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
+ # instead, a non-strict RegExp is used:
+ str = @scanner.scan_until(/endstream/)
+ # raise error if the stream doesn't end.
+ raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
+ # need to remove end of stream
+ if out.last.is_a? Hash
+ # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
+ out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
+ else
+ warn 'Stream not attached to dictionary!'
+ out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
+ end
+ ##########################################
+ ## parse an Object after finished
+ ##########################################
+ elsif str = @scanner.scan(/endobj/)
+ # what to do when this is an object?
+ if out.last.is_a? Hash
+ out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
+ else
+ out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
+ end
+ fresh = true
+ # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
+ out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
+ # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Numeric)
+ ##########################################
## Parse a comment
##########################################
elsif str = @scanner.scan(/\%/)
# is a comment, skip until new line
loop do
@@ -324,24 +347,10 @@
break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) ||
@scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
end
# puts "AFTER COMMENT: #{@scanner.peek 8}"
##########################################
- ## Parse a Name
- ##########################################
- # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
- # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
- # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
- # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
- elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
- out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
- ##########################################
- ## Parse a Number
- ##########################################
- elsif str = @scanner.scan(/[\+\-\.\d]+/)
- str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
- ##########################################
## Parse an Object Reference
##########################################
elsif @scanner.scan(/R/)
out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
# @references << out.last
@@ -560,10 +569,10 @@
if o[:is_reference_only]
if o[:indirect_reference_id].nil?
o = nil
else
o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]]
- warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil?
+ warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? && (o[:indirect_reference_id] + o[:indirect_generation_number] != 0)
o.delete :indirect_reference_id
o.delete :indirect_generation_number
o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o
end
obj[k] = o