lib/combine_pdf/parser.rb in combine_pdf-0.2.15 vs lib/combine_pdf/parser.rb in combine_pdf-0.2.16
- old
+ new
@@ -97,14 +97,14 @@
## search for objects streams
object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm}
unless object_streams.empty?
warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."
-
+
object_streams.each do |o|
## un-encode (using the correct filter) the object streams
- PDFFilter.inflate_object o
+ PDFFilter.inflate_object o
## extract objects from stream to top level array @parsed
@scanner = StringScanner.new o[:raw_stream_content]
stream_data = _parse_
id_array = []
while stream_data[0].is_a? Fixnum
@@ -121,11 +121,10 @@
end
# Strings were unified, we can let them go..
@strings_dictionary.clear
-
# serialize_objects_and_references.catalog_pages
# Benchmark.bm do |bm|
# bm.report("serialize") {1000.times {serialize_objects_and_references} }
# bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
@@ -314,11 +313,11 @@
##########################################
when str = @scanner.scan(/\%/)
#is a comment, skip until new line
loop do
# break unless @scanner.scan(/[^\d\r\n]+/)
- break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) ||
+ break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) ||
@scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
end
# puts "AFTER COMMENT: #{@scanner.peek 8}"
##########################################
## Parse a Name
@@ -362,26 +361,31 @@
fresh = true
if @scanner.matched[-1] == 'r'
if @scanner.skip_until(/<</)
data = _parse_
@root_object ||= {}
- @root_object[data.shift] = data.shift while data[0]
+ @root_object[data.shift] = data.shift while data[0]
end
##########
## skip until end of segment, marked by %%EOF
@scanner.skip_until(/\%\%EOF/)
+ ##########
+ ## If this was the last valid segment, ignore any trailing garbage
+ ## (issue #49 resolution)
+ break unless @scanner.exist?(/\%\%EOF/)
+
end
-
+
when @scanner.scan(/[\s]+/)
# Generally, do nothing
nil
when @scanner.scan(/obj[\s]*/)
# Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
keep = []
- keep << out.pop # .tap {|i| puts "#{i} is an ID"}
- keep << out.pop # .tap {|i| puts "#{i} is a REF"}
+ keep << out.pop # .tap {|i| puts "#{i} is an ID"}
+ keep << out.pop # .tap {|i| puts "#{i} is a REF"}
if out.last.is_a? Hash
out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
else
out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
@@ -391,11 +395,11 @@
out << keep.pop
out << keep.pop
end
fresh = false
else
- # always advance
+ # always advance
# warn "Advnacing for unknown reason... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/
warn "Warning: parser advnacing for unknown reason. Potential data-loss."
@scanner.pos = @scanner.pos + 1
end
end
@@ -418,11 +422,11 @@
@parsed.delete_if {|obj| obj[:Type] == :Catalog}
@parsed << catalogs
raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
end
- case
+ case
when catalogs.is_a?(Array)
catalogs.each {|c| catalog_pages(c, inheritance_hash ) unless c.nil?}
when catalogs.is_a?(Hash)
if catalogs[:is_reference_only]
if catalogs[:referenced_object]
@@ -576,6 +580,6 @@
# end
# end
# end
end
-end
\ No newline at end of file
+end