lib/combine_pdf/parser.rb in combine_pdf-0.2.6 vs lib/combine_pdf/parser.rb in combine_pdf-0.2.7

- old
+ new

@@ -34,11 +34,11 @@ # a Float representing the PDF version of the data parsed (if exists). attr_reader :version # the info and root objects, as found (if found) in the PDF file. # # they are mainly to used to know if the file is (was) encrypted and to get more details. - attr_reader :info_object, :root_object + attr_reader :info_object, :root_object, :names_object # when creating a parser, it is important to set the data (String) we wish to parse. # # <b>the data is required and it is not possible to set the data at a later stage</b> # @@ -51,10 +51,12 @@ @streams = [] @parsed = [] @references = [] @root_object = {} @info_object = {} + @names_object = {} + @strings_dictionary = {} # all strings are one string @version = nil @scanner = nil end # parse the data in the new parser (the data already set through the initialize / new method) @@ -111,10 +113,13 @@ @parsed << stream_data.shift end end end + # Strings were unified, we can let them go.. + @strings_dictionary.clear + # serialize_objects_and_references.catalog_pages # Benchmark.bm do |bm| # bm.report("serialize") {1000.times {serialize_objects_and_references} } @@ -177,14 +182,14 @@ # raise error if the stream doesn't end. raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str # need to remove end of stream if out.last.is_a? Hash # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r) - out.last[:raw_stream_content] = str.gsub(/[\n\r]?[\n\r]endstream\z/, "") + out.last[:raw_stream_content] = unify_string str.sub(/[\n\r]?[\n\r]endstream\z/, "").force_encoding(Encoding::ASCII_8BIT) else warn "Stream not attached to dictionary!" - out << str[0...-10].force_encoding(Encoding::ASCII_8BIT) + out << str.sub(/[\n\r]?[\n\r]endstream\z/, "").force_encoding(Encoding::ASCII_8BIT) end ########################################## ## parse an Object after finished ########################################## when str = @scanner.scan(/endobj/) @@ -197,27 +202,33 @@ ########################################## ## parse a Hex String ########################################## when str = @scanner.scan(/<[0-9a-fA-F]+>/) # warn "Found a hex string" - out << [str[1..-2]].pack('H*') + out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT)) ########################################## ## parse a Literal String ########################################## when @scanner.scan(/\(/) # warn "Found a literal string" str = ''.force_encoding(Encoding::ASCII_8BIT) count = 1 while count > 0 && @scanner.rest? do - str += @scanner.scan_until(/[\(\)]/).to_s + scn = @scanner.scan_until(/[\(\)]/) + unless scn + warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!" + count = 0 # error + next + end + + str += scn.to_s seperator_count = 0 seperator_count += 1 while str[-2-seperator_count] == "\\" case str[-1] when '(' - ## The following solution fails when (string ends with this sign: \\) - + ## The following solution might fail when (string ends with this sign: \\) count += 1 unless seperator_count.odd? when ')' count -= 1 unless seperator_count.odd? else warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!" @@ -274,11 +285,11 @@ end else str << str_bytes.shift end end - out << str.pack('C*').force_encoding(Encoding::ASCII_8BIT) + out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT)) ########################################## ## Parse a comment ########################################## when str = @scanner.scan(/\%/) #is a comment, skip until new line @@ -366,11 +377,11 @@ protected # resets cataloging and pages - def catalog_pages(catalogs = nil, secure_injection = false, inheritance_hash = {}) + def catalog_pages(catalogs = nil, inheritance_hash = {}) unless catalogs if root_object[:Root] catalogs = root_object[:Root][:referenced_object] || root_object[:Root] else @@ -381,15 +392,15 @@ raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs end case when catalogs.is_a?(Array) - catalogs.each {|c| catalog_pages(c, secure_injection, inheritance_hash ) unless c.nil?} + catalogs.each {|c| catalog_pages(c, inheritance_hash ) unless c.nil?} when catalogs.is_a?(Hash) if catalogs[:is_reference_only] if catalogs[:referenced_object] - catalog_pages(catalogs[:referenced_object], secure_injection, inheritance_hash) + catalog_pages(catalogs[:referenced_object], inheritance_hash) else warn "couldn't follow reference!!! #{catalogs} not found!" end else unless catalogs[:Type] == :Page @@ -422,15 +433,15 @@ catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] catalogs.instance_eval {extend Page_Methods} - catalogs.secure_injection = secure_injection when :Pages - catalog_pages(catalogs[:Kids], secure_injection, inheritance_hash.dup ) unless catalogs[:Kids].nil? + catalog_pages(catalogs[:Kids], inheritance_hash.dup ) unless catalogs[:Kids].nil? when :Catalog - catalog_pages(catalogs[:Pages], secure_injection, inheritance_hash.dup ) unless catalogs[:Pages].nil? + @names_object.update( (catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Names] + catalog_pages(catalogs[:Pages], inheritance_hash.dup ) unless catalogs[:Pages].nil? end end end self end @@ -471,14 +482,14 @@ obj[:referenced_object] = obj_dir[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ] warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object] obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number) end self - # rescue => e - # puts (@parsed.select {|o| !o.is_a?(Hash)}) - # puts (@parsed) - # puts (@references) - # raise e + end + + # All Strings are one String + def unify_string str + @strings_dictionary[str] ||= str end # @private # this method reviews a Hash and updates it by merging Hash data, # preffering the old over the new. \ No newline at end of file