lib/combine_pdf/parser.rb in combine_pdf-1.0.3 vs lib/combine_pdf/parser.rb in combine_pdf-1.0.4
- old
+ new
@@ -50,10 +50,11 @@
@names_object = {}.dup
@outlines_object = {}.dup
@forms_object = {}.dup
@metadata = nil
@strings_dictionary = {}.dup # all strings are one string
+ @resolution_hash = {}.dup
@version = nil
@scanner = nil
@allow_optional_content = options[:allow_optional_content]
end
@@ -93,35 +94,68 @@
decryptor = PDFDecrypt.new @parsed, @root_object
decryptor.decrypt
# do we really need to apply to @parsed? No, there is no need.
end
- ## search for objects streams
- object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
- unless object_streams.empty?
- warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'
-
- object_streams.each do |o|
+ # search for objects streams and replace them "in-place"
+ # the inplace resolution prevents versioning errors
+ while (true)
+ found_object_streams = false
+ @parsed.length.times do |i|
+ o = @parsed[i]
+ next unless o.is_a?(Hash) && o[:Type] == :ObjStm
## un-encode (using the correct filter) the object streams
PDFFilter.inflate_object o
- ## extract objects from stream to top level arry @parsed
+ ## extract objects from stream
@scanner = StringScanner.new o[:raw_stream_content]
stream_data = _parse_
id_array = []
+ collection = [nil]
while stream_data[0].is_a? (Numeric)
id_array << stream_data.shift
stream_data.shift
end
while id_array[0] && stream_data[0]
stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
stream_data[0][:indirect_reference_id] = id_array.shift
stream_data[0][:indirect_generation_number] = 0
- @parsed << stream_data.shift
+ collection << (stream_data.shift)
end
+ # place new objects right after this one (removing this one as well)
+ @parsed[i] = collection
+ found_object_streams = true
end
+ break unless found_object_streams
+ @parsed.flatten!
+ @parsed.compact!
end
+ #
+ # object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
+ # unless object_streams.empty?
+ # warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'
+ #
+ # object_streams.each do |o|
+ # ## un-encode (using the correct filter) the object streams
+ # PDFFilter.inflate_object o
+ # ## extract objects from stream to top level arry @parsed
+ # @scanner = StringScanner.new o[:raw_stream_content]
+ # stream_data = _parse_
+ # id_array = []
+ # while stream_data[0].is_a? (Numeric)
+ # id_array << stream_data.shift
+ # stream_data.shift
+ # end
+ # while id_array[0] && stream_data[0]
+ # stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
+ # stream_data[0][:indirect_reference_id] = id_array.shift
+ # stream_data[0][:indirect_generation_number] = 0
+ # @parsed << stream_data.shift
+ # end
+ # end
+ # end
+
# serialize_objects_and_references.catalog_pages
# Benchmark.bm do |bm|
# bm.report("serialize") {1000.times {serialize_objects_and_references} }
# bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
@@ -147,10 +181,13 @@
CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
@info_object.each { |_k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] }
else
@info_object = {}
end
+
+ # we can clear the resolution hash now
+ @resolution_hash.clear if @resolution_hash
# # # ## remove object streams - if they exist
# @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
# # # ## remove XREF dictionaries - if they exist
# @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
@@ -375,11 +412,11 @@
@scanner.scan_until(/(trailer)|(\%EOF)/)
fresh = true
if @scanner.matched[-1] == 'r'
if @scanner.skip_until(/<</)
data = _parse_
- @root_object ||= {}
+ (@root_object ||= {}).clear
@root_object[data.shift] = data.shift while data[0]
end
##########
## skip untill end of segment, maked by %%EOF
@scanner.skip_until(/\%\%EOF/)
@@ -512,54 +549,35 @@
end
end
self
end
- def get_refernced_object(reference_hash = {})
- @parsed.each do |stored_object|
- return stored_object if stored_object.is_a?(Hash) &&
- reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
- reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
- # return (stored_object[:indirect_without_dictionary] || stored_object) if stored_object.is_a?(Hash) &&
- # reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
- # reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
- end
- warn "didn't find reference #{reference_hash}"
- nil
- end
-
- # # @private
- # # connects references and objects, according to their reference id's.
- # #
- # # should be moved to the parser's workflow.
- # #
- # def serialize_objects_and_references_old
- # obj_dir = {}
- # # create a dictionary for referenced objects (no value resolution at this point)
- # @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
- # # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
- # @references.each do |obj|
- # obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
- # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
- # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
- # end
- # obj_dir.clear
- # @references.clear
- # self
- # end
-
# @private
# connects references and objects, according to their reference id's.
#
# Also replaces :indirect_without_dictionary objects with their actual values. Strings, Hashes and Arrays still share memory space.
#
# should be moved to the parser's workflow.
#
def serialize_objects_and_references
obj_dir = {}
+ objid_cache = {}
# create a dictionary for referenced objects (no value resolution at this point)
- # @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
- @parsed.each { |o| obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]] = o }
+ # at the same time, delete duplicates and old versions when objects have multiple versions
+ @parsed.uniq!
+ @parsed.length.times do |i|
+ o = @parsed[i]
+ objid_cache[o.object_id] = i
+ tmp_key = [o[:indirect_reference_id], o[:indirect_generation_number]]
+ if tmp_found = obj_dir[tmp_key]
+ tmp_found.clear
+ @parsed[objid_cache[tmp_found.object_id]] = nil
+ end
+ obj_dir[tmp_key] = o
+ end
+ @parsed.compact!
+ objid_cache.clear
+
should_resolve = [@parsed, @root_object]
while should_resolve.count > 0
obj = should_resolve.pop
if obj.is_a?(Hash)
obj.keys.each do |k|