lib/combine_pdf/parser.rb in combine_pdf-1.0.9 vs lib/combine_pdf/parser.rb in combine_pdf-1.0.10

- old
+ new

@@ -78,10 +78,11 @@ end @parsed = _parse_ # puts @parsed unless (@parsed.select { |i| !i.is_a?(Hash) }).empty? + # p @parsed.select raise ParsingError, 'Unknown PDF parsing error - malformed PDF file?' end if @root_object == {}.freeze xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef } @@ -383,21 +384,10 @@ fresh = true # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol) # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Numeric) ########################################## - ## Parse a comment - ########################################## - elsif str = @scanner.scan(/\%/) - # is a comment, skip until new line - loop do - # break unless @scanner.scan(/[^\d\r\n]+/) - break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) || - @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1 - end - # puts "AFTER COMMENT: #{@scanner.peek 8}" - ########################################## ## Parse an Object Reference ########################################## elsif @scanner.scan(/R/) out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop } # @references << out.last @@ -412,36 +402,61 @@ ## Parse NULL - null ########################################## elsif @scanner.scan(/null/) out << nil ########################################## + ## Parse file trailer + ########################################## + elsif @scanner.scan(/trailer/) + if @scanner.skip_until(/<</) + data = _parse_ + (@root_object ||= {}).clear + @root_object[data.shift] = data.shift while data[0] + end + ########################################## ## XREF - check for encryption... anything else? ########################################## - elsif @scanner.scan(/(startxref)|(xref)/) - ########## - ## get root object to check for encryption - @scanner.scan_until(/(trailer)|(\%EOF)/) - fresh = true - if @scanner.matched[-1] == 'r' - if @scanner.skip_until(/<</) - data = _parse_ - (@root_object ||= {}).clear - @root_object[data.shift] = data.shift while data[0] - end - ########## - ## skip untill end of segment, maked by %%EOF - @scanner.skip_until(/\%\%EOF/) - ########## - ## If this was the last valid segment, ignore any trailing garbage - ## (issue #49 resolution) - break unless @scanner.exist?(/\%\%EOF/) - + elsif @scanner.scan(/xref/) + # skip first xref line + @scanner.scan(/[\s]+[\d]+[\s]+[\d]+[\s]+/) + while @scanner.scan(/[\d]+[\s][\d]+[\s]+[nf][\s]+/) + # skip all xref lines + nil end - + ########################################## + ## XREF location can be ignored + ########################################## + elsif @scanner.scan(/startxref/) + @scanner.scan(/[\s]+[\d]+[\s]+/) + ########################################## + ## Skip Whitespace + ########################################## elsif @scanner.scan(/[\s]+/) # Generally, do nothing nil + ########################################## + ## EOF? + ########################################## + elsif @scanner.scan(/\%\%EOF/) + ########## + ## If this was the last valid segment, ignore any trailing garbage + ## (issue #49 resolution) + break unless @scanner.exist?(/\%\%EOF/) + ########################################## + ## Parse a comment + ########################################## + elsif str = @scanner.scan(/\%/) + # is a comment, skip until new line + loop do + # break unless @scanner.scan(/[^\d\r\n]+/) + break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) || + @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1 + end + # puts "AFTER COMMENT: #{@scanner.peek 8}" + ########################################## + ## Fix wkhtmltopdf - missing 'endobj' keywords + ########################################## elsif @scanner.scan(/obj[\s]*/) # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords unless fresh || (out[-4].nil? || out[-4].is_a?(Hash)) keep = [] keep << out.pop # .tap {|i| puts "#{i} is an ID"} @@ -458,9 +473,12 @@ out << keep.pop out << keep.pop end fresh = false + ########################################## + ## Unknown, warn and advance + ########################################## else # always advance # warn "Advancing for unknown reason... #{@scanner.string[@scanner.pos - 4, 8]} ... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/ warn 'Warning: parser advancing for unknown reason. Potential data-loss.' @scanner.pos = @scanner.pos + 1