lib/combine_pdf/parser.rb in combine_pdf-0.2.15 vs lib/combine_pdf/parser.rb in combine_pdf-0.2.16

- old
+ new

@@ -97,14 +97,14 @@ ## search for objects streams object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm} unless object_streams.empty? warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects." - + object_streams.each do |o| ## un-encode (using the correct filter) the object streams - PDFFilter.inflate_object o + PDFFilter.inflate_object o ## extract objects from stream to top level arry @parsed @scanner = StringScanner.new o[:raw_stream_content] stream_data = _parse_ id_array = [] while stream_data[0].is_a? Fixnum @@ -121,11 +121,10 @@ end # Strings were unified, we can let them go.. @strings_dictionary.clear - # serialize_objects_and_references.catalog_pages # Benchmark.bm do |bm| # bm.report("serialize") {1000.times {serialize_objects_and_references} } # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} } @@ -314,11 +313,11 @@ ########################################## when str = @scanner.scan(/\%/) #is a comment, skip until new line loop do # break unless @scanner.scan(/[^\d\r\n]+/) - break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) || + break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) || @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1 end # puts "AFTER COMMENT: #{@scanner.peek 8}" ########################################## ## Parse a Name @@ -362,26 +361,31 @@ fresh = true if @scanner.matched[-1] == 'r' if @scanner.skip_until(/<</) data = _parse_ @root_object ||= {} - @root_object[data.shift] = data.shift while data[0] + @root_object[data.shift] = data.shift while data[0] end ########## ## skip untill end of segment, maked by %%EOF @scanner.skip_until(/\%\%EOF/) + ########## + ## If this was the last valid segment, ignore any trailing garbage + ## (issue #49 resolution) + break unless @scanner.exist?(/\%\%EOF/) + end - + when @scanner.scan(/[\s]+/) # Generally, do nothing nil when @scanner.scan(/obj[\s]*/) # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords unless fresh || (out[-4].nil? || out[-4].is_a?(Hash)) keep = [] - keep << out.pop # .tap {|i| puts "#{i} is an ID"} - keep << out.pop # .tap {|i| puts "#{i} is a REF"} + keep << out.pop # .tap {|i| puts "#{i} is an ID"} + keep << out.pop # .tap {|i| puts "#{i} is a REF"} if out.last.is_a? Hash out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop}) else out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop} @@ -391,11 +395,11 @@ out << keep.pop out << keep.pop end fresh = false else - # always advance + # always advance # warn "Advnacing for unknown reason... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/ warn "Warning: parser advnacing for unknown reason. Potential data-loss." @scanner.pos = @scanner.pos + 1 end end @@ -418,11 +422,11 @@ @parsed.delete_if {|obj| obj[:Type] == :Catalog} @parsed << catalogs raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs end - case + case when catalogs.is_a?(Array) catalogs.each {|c| catalog_pages(c, inheritance_hash ) unless c.nil?} when catalogs.is_a?(Hash) if catalogs[:is_reference_only] if catalogs[:referenced_object] @@ -576,6 +580,6 @@ # end # end # end end -end \ No newline at end of file +end