lib/combine_pdf/parser.rb in combine_pdf-0.2.11 vs lib/combine_pdf/parser.rb in combine_pdf-0.2.12
- old
+ new
@@ -65,14 +65,19 @@
return @parsed unless @parsed.empty?
@scanner = StringScanner.new @string_to_parse
@scanner.pos = 0
if @scanner.scan /\%PDF\-[\d\-\.]+/
@version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
- @scanner.skip_until /[\n\r]+/
- # @scanner.skip /[^\d]*/
+ loop do
+ break unless @scanner.scan(/[^\d\r\n]+/)
+ break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
+ break if @scanner.eos?
+ @scanner.pos += 1
+ end
end
@parsed = _parse_
+ # puts @parsed
raise "Unknown PDF parsing error - maleformed PDF file?" unless (@parsed.select {|i| !i.is_a?(Hash)}).empty?
if @root_object == {}
xref_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :XRef}
@@ -149,11 +154,24 @@
#
# this is an internal function, but it was left exposed for possible future features.
def _parse_
out = []
str = ''
+ fresh = true
while @scanner.rest? do
+ # last ||= 0
+ # out.last.tap do |o|
+ # if o.is_a?(Hash)
+ # puts "[#{@scanner.pos}] Parser has a Dictionary (#{o.class.name}) with data:"
+ # o.each do |k, v|
+ # puts " #{k}: is #{v.class.name} with data: #{v.to_s[0..4]}#{"..." if v.to_s.length > 5}"
+ # end
+ # else
+ # puts "[#{@scanner.pos}] Parser has #{o.class.name} with data: #{o.to_s[0..4]}#{"..." if o.to_s.length > 5}"
+ # end
+ # puts "next is #{@scanner.peek 8}"
+ # end unless (last == out.count) || (-1 == (last = out.count))
case
##########################################
## parse an Array
##########################################
when @scanner.scan(/\[/)
@@ -172,11 +190,12 @@
when @scanner.scan(/\]/), @scanner.scan(/>>/)
return out
##########################################
## parse a Stream
##########################################
- when @scanner.scan(/stream[\r]?[\n]/)
+ when @scanner.scan(/stream[\r\n]/)
+ @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
# the following was discarded because some PDF files didn't have an EOL marker as required
# str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
# instead, a non-strict RegExp is used:
str = @scanner.scan_until(/endstream/)
# raise error if the stream doesn't end.
@@ -197,10 +216,12 @@
if out.last.is_a? Hash
out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
else
out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
end
+ fresh = true
+ # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Fixnum)
##########################################
## parse a Hex String
##########################################
when str = @scanner.scan(/<[0-9a-fA-F]+>/)
# warn "Found a hex string"
@@ -291,11 +312,17 @@
##########################################
## Parse a comment
##########################################
when str = @scanner.scan(/\%/)
#is a comment, skip until new line
- @scanner.skip_until /[\n\r]+/
+ loop do
+ break unless @scanner.scan(/[^\d\r\n]+/)
+ break if @scanner.check(/([\d]+ [\d]+ obj)?[\n\r]+/)
+ break if @scanner.eos?
+ @scanner.pos += 1
+ end
+ # puts "AFTER COMMENT: #{@scanner.peek 8}"
##########################################
## Parse a Name
##########################################
# old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
# I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
@@ -331,11 +358,11 @@
##########################################
when @scanner.scan(/xref/)
##########
## get root object to check for encryption
@scanner.scan_until(/(trailer)|(\%EOF)/)
-
+ fresh = true
if @scanner.matched[-1] == 'r'
if @scanner.skip_until(/<</)
data = _parse_
@root_object ||= {}
@root_object[data.shift] = data.shift while data[0]
@@ -348,14 +375,14 @@
when @scanner.scan(/[\s]+/)
# Generally, do nothing
nil
when @scanner.scan(/obj[\s]*/)
# Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
- unless out[-4].nil? || out[-4].is_a?(Hash)
+ unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
keep = []
- keep << out.pop
- keep << out.pop
+ keep << out.pop # .tap {|i| puts "#{i} is an ID"}
+ keep << out.pop # .tap {|i| puts "#{i} is a REF"}
if out.last.is_a? Hash
out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
else
out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
@@ -363,12 +390,14 @@
warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
out << keep.pop
out << keep.pop
end
+ fresh = false
else
# always advance
- # warn "Advnacing for unknown reason..."
+ # warn "Advnacing for unknown reason... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/
+ warn "Warning: parser advnacing for unknown reason. Potential data-loss."
@scanner.pos = @scanner.pos + 1
end
end
out
end
\ No newline at end of file