lib/combine_pdf/parser.rb in combine_pdf-1.0.7 vs lib/combine_pdf/parser.rb in combine_pdf-1.0.8

- old
+ new

@@ -4,10 +4,12 @@ ## this file is part of the CombinePDF library and the code ## is subject to the same license. ######################################################## module CombinePDF + ParsingError = Class.new(StandardError) + # @!visibility private # @private #:nodoc: all protected @@ -75,20 +77,24 @@ end end @parsed = _parse_ # puts @parsed - raise 'Unknown PDF parsing error - malformed PDF file?' unless (@parsed.select { |i| !i.is_a?(Hash) }).empty? + unless (@parsed.select { |i| !i.is_a?(Hash) }).empty? + raise ParsingError, 'Unknown PDF parsing error - malformed PDF file?' + end if @root_object == {}.freeze xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef } xref_streams.each do |xref_dictionary| @root_object.merge! xref_dictionary end end - raise 'root is unknown - cannot determine if file is Encrypted' if @root_object == {}.freeze + if @root_object == {}.freeze + raise ParsingError, 'root is unknown - cannot determine if file is Encrypted' + end if @root_object[:Encrypt] # change_references_to_actual_values @root_object warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.' decryptor = PDFDecrypt.new @parsed, @root_object @@ -308,14 +314,14 @@ when 98 # b str << 8 when 102 # f, form-feed str << 12 when 48..57 # octal notation for byte? - rep = rep.chr - rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57) - rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57) && ((rep + str_bytes[0].chr).to_i <= 255) - str << rep.to_i + rep -= 48 + rep = (rep << 3) + (str_bytes.shift-48) if str_bytes[0].between?(48, 57) + rep = (rep << 3) + (str_bytes.shift-48) if str_bytes[0].between?(48, 57) && (((rep << 3) + (str_bytes[0] - 48)) <= 255) + str << rep when 10 # new line, ignore str_bytes.shift if str_bytes[0] == 13 true when 13 # new line (or double notation for new line), ignore str_bytes.shift if str_bytes[0] == 10 @@ -348,12 +354,16 @@ @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze # the following was dicarded because some PDF files didn't have an EOL marker as required # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/) # instead, a non-strict RegExp is used: str = @scanner.scan_until(/endstream/) + # raise error if the stream doesn't end. - raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str + unless str + raise ParsingError, "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" + end + # need to remove end of stream if out.last.is_a? Hash # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r) out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT) else @@ -473,11 +483,13 @@ end @parsed.delete_if { |obj| obj.nil? || obj[:Type] == :Catalog } @parsed << catalogs - raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs + unless catalogs + raise ParsingError, "Unknown error - parsed data doesn't contain a cataloged object!" + end end if catalogs.is_a?(Array) catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? } elsif catalogs.is_a?(Hash) if catalogs[:is_reference_only] @@ -486,24 +498,27 @@ else warn "couldn't follow reference!!! #{catalogs} not found!" end else unless catalogs[:Type] == :Page - raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content + if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content + raise ParsingError, "Optional Content PDF files aren't supported and their pages cannot be safely extracted." + end + inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox] inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox] inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate] if catalogs[:Resources] inheritance_hash[:Resources] ||= { referenced_object: {}, is_reference_only: true }.dup - (inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_old)) + (inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &HASH_UPDATE_PROC_FOR_OLD) end if catalogs[:ColorSpace] inheritance_hash[:ColorSpace] ||= { referenced_object: {}, is_reference_only: true }.dup - (inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_old)) + (inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &HASH_UPDATE_PROC_FOR_OLD) end - # (inheritance_hash[:Resources] ||= {}).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Resources] - # (inheritance_hash[:ColorSpace] ||= {}).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:ColorSpace] + # (inheritance_hash[:Resources] ||= {}).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:Resources] + # (inheritance_hash[:ColorSpace] ||= {}).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:ColorSpace] # inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order] # inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties] # inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS] end @@ -515,18 +530,18 @@ catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox] catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate] if inheritance_hash[:Resources] catalogs[:Resources] ||= { referenced_object: {}, is_reference_only: true }.dup catalogs[:Resources] = { referenced_object: catalogs[:Resources], is_reference_only: true } unless catalogs[:Resources][:referenced_object] - catalogs[:Resources][:referenced_object].update((inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]), &self.class.method(:hash_update_proc_for_old)) + catalogs[:Resources][:referenced_object].update((inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]), &HASH_UPDATE_PROC_FOR_OLD) end if inheritance_hash[:ColorSpace] catalogs[:ColorSpace] ||= { referenced_object: {}, is_reference_only: true }.dup catalogs[:ColorSpace] = { referenced_object: catalogs[:ColorSpace], is_reference_only: true } unless catalogs[:ColorSpace][:referenced_object] - catalogs[:ColorSpace][:referenced_object].update((inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]), &self.class.method(:hash_update_proc_for_old)) + catalogs[:ColorSpace][:referenced_object].update((inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]), &HASH_UPDATE_PROC_FOR_OLD) end - # (catalogs[:ColorSpace] ||= {}).update(inheritance_hash[:ColorSpace], &self.class.method(:hash_update_proc_for_old)) if inheritance_hash[:ColorSpace] + # (catalogs[:ColorSpace] ||= {}).update(inheritance_hash[:ColorSpace], &HASH_UPDATE_PROC_FOR_OLD) if inheritance_hash[:ColorSpace] # catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order] # catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS] # catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties] # avoide references on MediaBox, CropBox and Rotate @@ -536,13 +551,13 @@ catalogs.instance_eval { extend Page_Methods } when :Pages catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil? when :Catalog - @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm] - @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names] - @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Outlines] + @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:AcroForm] + @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:Names] + @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &HASH_UPDATE_PROC_FOR_NEW) if catalogs[:Outlines] if catalogs[:Dests] # convert PDF 1.1 Dests to PDF 1.2+ Dests dests_arry = (@names_object[:Dests] ||= {}) dests_arry = ((dests_arry[:referenced_object] || dests_arry)[:Names] ||= []) ((catalogs[:Dests][:referenced_object] || catalogs[:Dests])[:referenced_object] || (catalogs[:Dests][:referenced_object] || catalogs[:Dests])).each {|k,v| next if CombinePDF::PDF::PRIVATE_HASH_KEYS.include?(k); dests_arry << unify_string(k.to_s); dests_arry << v; } end @@ -652,33 +667,48 @@ # self # end # All Strings are one String def unify_string(str) + str.force_encoding(Encoding::ASCII_8BIT) @strings_dictionary[str] ||= str end # @private # this method reviews a Hash and updates it by merging Hash data, # preffering the old over the new. - def self.hash_update_proc_for_old(_key, old_data, new_data) + HASH_UPDATE_PROC_FOR_OLD = Proc.new do |_key, old_data, new_data| if old_data.is_a? Hash - old_data.merge(new_data, &method(:hash_update_proc_for_old)) + old_data.merge(new_data, &HASH_UPDATE_PROC_FOR_OLD) else old_data end end + # def self.hash_update_proc_for_old(_key, old_data, new_data) + # if old_data.is_a? Hash + # old_data.merge(new_data, &method(:hash_update_proc_for_old)) + # else + # old_data + # end + # end # @private # this method reviews a Hash an updates it by merging Hash data, # preffering the new over the old. - def self.hash_update_proc_for_new(_key, old_data, new_data) + HASH_UPDATE_PROC_FOR_NEW = Proc.new do |_key, old_data, new_data| if old_data.is_a? Hash - old_data.merge(new_data, &method(:hash_update_proc_for_new)) + old_data.merge(new_data, &HASH_UPDATE_PROC_FOR_NEW) else new_data end end + # def self.hash_update_proc_for_new(_key, old_data, new_data) + # if old_data.is_a? Hash + # old_data.merge(new_data, &method(:hash_update_proc_for_new)) + # else + # new_data + # end + # end # # run block of code on evey PDF object (PDF objects are class Hash) # def each_object(object, limit_references = true, already_visited = {}, &block) # unless limit_references # already_visited[object.object_id] = true