lib/combine_pdf/parser.rb in combine_pdf-0.2.11 vs lib/combine_pdf/parser.rb in combine_pdf-0.2.12

- old
+ new

@@ -65,14 +65,19 @@
  return @parsed unless @parsed.empty?
  @scanner = StringScanner.new @string_to_parse
  @scanner.pos = 0
  if @scanner.scan /\%PDF\-[\d\-\.]+/
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
-   @scanner.skip_until /[\n\r]+/
-   # @scanner.skip /[^\d]*/
+   loop do
+     break unless @scanner.scan(/[^\d\r\n]+/)
+     break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
+     break if @scanner.eos?
+     @scanner.pos += 1
+   end
  end
  @parsed = _parse_
+ # puts @parsed
  raise "Unknown PDF parsing error - maleformed PDF file?" unless (@parsed.select {|i| !i.is_a?(Hash)}).empty?
  if @root_object == {}
    xref_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :XRef}
@@ -149,11 +154,24 @@
  #
  # this is an internal function, but it was left exposed for posible future features.
  def _parse_
    out = []
    str = ''
+   fresh = true
    while @scanner.rest? do
+     # last ||= 0
+     # out.last.tap do |o|
+     # if o.is_a?(Hash)
+     # puts "[#{@scanner.pos}] Parser has a Dictionary (#{o.class.name}) with data:"
+     # o.each do |k, v|
+     # puts " #{k}: is #{v.class.name} with data: #{v.to_s[0..4]}#{"..." if v.to_s.length > 5}"
+     # end
+     # else
+     # puts "[#{@scanner.pos}] Parser has #{o.class.name} with data: #{o.to_s[0..4]}#{"..." if o.to_s.length > 5}"
+     # end
+     # puts "next is #{@scanner.peek 8}"
+     # end unless (last == out.count) || (-1 == (last = out.count))
      case
      ##########################################
      ## parse an Array
      ##########################################
      when @scanner.scan(/\[/)
@@ -172,11 +190,12 @@
  when @scanner.scan(/\]/), @scanner.scan(/>>/)
    return out
  ##########################################
  ## parse a Stream
  ##########################################
- when @scanner.scan(/stream[\r]?[\n]/)
+ when @scanner.scan(/stream[\r\n]/)
+   @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
    # the following was dicarded because some PDF files didn't have an EOL marker as required
    # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
    # instead, a non-strict RegExp is used:
    str = @scanner.scan_until(/endstream/)
    # raise error if the stream doesn't end.
@@ -197,10 +216,12 @@
  if out.last.is_a? Hash
    out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
  else
    out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
  end
+ fresh = true
+ # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Fixnum)
  ##########################################
  ## parse a Hex String
  ##########################################
  when str = @scanner.scan(/<[0-9a-fA-F]+>/)
    # warn "Found a hex string"
@@ -291,11 +312,17 @@
  ##########################################
  ## Parse a comment
  ##########################################
  when str = @scanner.scan(/\%/)
    #is a comment, skip until new line
-   @scanner.skip_until /[\n\r]+/
+   loop do
+     break unless @scanner.scan(/[^\d\r\n]+/)
+     break if @scanner.check(/([\d]+ [\d]+ obj)?[\n\r]+/)
+     break if @scanner.eos?
+     @scanner.pos += 1
+   end
+   # puts "AFTER COMMENT: #{@scanner.peek 8}"
  ##########################################
  ## Parse a Name
  ##########################################
  # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
  # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
@@ -331,11 +358,11 @@
  ##########################################
  when @scanner.scan(/xref/)
    ##########
    ## get root object to check for encryption
    @scanner.scan_until(/(trailer)|(\%EOF)/)
-
+   fresh = true if @scanner.matched[-1] == 'r'
    if @scanner.skip_until(/<</)
      data = _parse_
      @root_object ||= {}
      @root_object[data.shift] = data.shift while data[0]
@@ -348,14 +375,14 @@
  when @scanner.scan(/[\s]+/)
    # Generally, do nothing
    nil
  when @scanner.scan(/obj[\s]*/)
    # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
-   unless out[-4].nil? || out[-4].is_a?(Hash)
+   unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
      keep = []
-     keep << out.pop
-     keep << out.pop
+     keep << out.pop # .tap {|i| puts "#{i} is an ID"}
+     keep << out.pop # .tap {|i| puts "#{i} is a REF"}
      if out.last.is_a? Hash
        out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
      else
        out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
@@ -363,12 +390,14 @@
      warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
      out << keep.pop
      out << keep.pop
    end
+   fresh = false
  else
    # always advance
-   # warn "Advnacing for unknown reason..."
+   # warn "Advnacing for unknown reason... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/
+   warn "Warning: parser advnacing for unknown reason. Potential data-loss."
    @scanner.pos = @scanner.pos + 1
  end
  end
  out
  end
\ No newline at end of file