lib/origami/pdf.rb in origami-2.0.0 vs lib/origami/pdf.rb in origami-2.0.1

- old
+ new

@@ -43,12 +43,10 @@ require 'origami/annotations' require 'origami/actions' require 'origami/3d' require 'origami/signature' require 'origami/webcapture' -require 'origami/export' -require 'origami/webcapture' require 'origami/encryption' require 'origami/linearization' require 'origami/obfuscation' require 'origami/javascript' require 'origami/outline' @@ -145,39 +143,27 @@ pdf = PDF.new yield(pdf) if block_given? pdf.save(output, options) end alias write create - - # - # Deserializes a PDF dump. - # - def deserialize(filename) - Zlib::GzipReader.open(filename) { |gz| - return Marshal.load(gz.read) - } - end end # # Creates a new PDF instance. # _parser_:: The Parser object creating the document. # If none is specified, some default structures are automatically created to get a minimal working document. # def initialize(parser = nil) @header = PDF::Header.new @revisions = [] + @parser = parser + @loaded = false add_new_revision @revisions.first.trailer = Trailer.new - if parser - @loaded = false - @parser = parser - else - init - end + init if parser.nil? end # # Original file name if parsed from disk, nil otherwise. # @@ -198,25 +184,10 @@ def original_data @parser.target_data if @parser end # - # Serializes the current PDF. - # - def serialize(filename) - parser = @parser - @parser = nil # do not serialize the parser - - Zlib::GzipWriter.open(filename) { |gz| - gz.write Marshal.dump(self) - } - - @parser = parser - self - end - - # # Saves the current document. # _filename_:: The path where to save this PDF. # def save(path, params = {}) options = @@ -275,39 +246,13 @@ pattern = /#{Regexp.escape(pattern)}/i if pattern.is_a?(::String) raise TypeError, "Expected a String or Regexp" unless pattern.is_a?(Regexp) result = [] - search_object = -> (object) do - case object - when Stream - result.concat object.dictionary.strings_cache.select{|str| pattern === str} - result.concat object.dictionary.names_cache.select{|name| pattern === name.value} - - begin - result.push object if streams and object.data.match(pattern) - rescue Filter::Error - next # Skip object if a decoding error occured. - end - - next if object.is_a?(ObjectStream) and not object_streams - - object.each do |subobject| - search_object.call(subobject) - end - - when Name, String - result.push object if object.value.match(pattern) - - when Dictionary, Array then - result.concat object.strings_cache.select{|str| pattern === str} - result.concat object.names_cache.select{|name| pattern === name.value} - end - end - self.indirect_objects.each do |object| - search_object.call(object) + result.concat search_object(object, pattern, + streams: streams, object_streams: object_streams) end result end @@ -327,46 +272,26 @@ # # Iterates over the objects of the document. # _compressed_: iterates over the objects inside object streams. # _recursive_: iterates recursively inside objects like arrays and dictionaries. # - def each_object(compressed: false, recursive: false) + def each_object(compressed: false, recursive: false, &block) return enum_for(__method__, compressed: compressed, recursive: recursive ) unless block_given? - walk_object = -> (object) do - case object - when Dictionary - object.each_value do |value| - yield(value) - walk_object.call(value) - end - - when Array - object.each do |child| - yield(child) - walk_object.call(child) - end - - when Stream - yield(object.dictionary) - walk_object.call(object.dictionary) - end - end - @revisions.each do |revision| revision.each_object do |object| - yield(object) + block.call(object) - walk_object.call(object) if recursive + walk_object(object, &block) if recursive if object.is_a?(ObjectStream) and compressed object.each do |child_obj| - yield(child_obj) + block.call(child_obj) - walk_object.call(child_obj) if recursive + walk_object(child_obj) if recursive end end end end end @@ -537,11 +462,11 @@ next if xref.nil? # # We found a matching XRef. # - if xref.is_a?(XRefToCompressedObj) + if xref.is_a?(XRefToCompressedObject) objstm = get_object(xref.objstmno, 0, use_xrefstm: use_xrefstm) object = objstm.extract_by_index(xref.index) if object.is_a?(Origami::Object) and object.no == target.refno return object @@ -568,39 +493,39 @@ # # Casts a PDF object into another object type. # The target type must be a subtype of the original type. # - def cast_object(reference, type, parser = nil) #:nodoc: + def cast_object(reference, type) #:nodoc: @revisions.each do |rev| - if rev.body.include?(reference) and type < rev.body[reference].class - rev.body[reference] = rev.body[reference].cast_to(type, parser) + if rev.body.include?(reference) + object = rev.body[reference] + return object if object.is_a?(type) - rev.body[reference] - else - nil + if type < rev.body[reference].class + rev.body[reference] = object.cast_to(type, @parser) + + return rev.body[reference] + end end end + + nil end # # Returns a new number/generation for future object. # def allocate_new_object_number - no = 1 - # Deprecated number allocation policy (first available) - #no = no + 1 while get_object(no) - - objset = self.indirect_objects - self.indirect_objects.find_all{|obj| obj.is_a?(ObjectStream)}.each do |objstm| - objstm.each{|obj| objset << obj} + last_object = self.each_object(compressed: true).max_by {|object| object.no } + if last_object.nil? + no = 1 + else + no = last_object.no + 1 end - allocated = objset.collect{|obj| obj.no}.compact - no = allocated.max + 1 unless allocated.empty? - [ no, 0 ] end # # Mark the document as complete. @@ -613,10 +538,74 @@ ########################## private ########################## # + # Iterates over the children of an object, avoiding cycles. + # + def walk_object(object, excludes: []) + return enum_for(__method__, object, excludes: excludes) unless block_given? + + return if excludes.include?(object) + excludes.push(object) + + case object + when Dictionary + object.each_value do |value| + yield(value) + walk_object(value, excludes: excludes) + end + + when Array + object.each do |child| + yield(child) + walk_object(child, excludes: excludes) + end + + when Stream + yield(object.dictionary) + walk_object(object.dictionary, excludes: excludes) + end + end + + # + # Searches through an object, possibly going into object streams. + # Returns an array of matching strings, names and streams. + # + def search_object(object, pattern, streams: true, object_streams: true) + result = [] + + case object + when Stream + result.concat object.dictionary.strings_cache.select{|str| pattern === str} + result.concat object.dictionary.names_cache.select{|name| pattern === name.value} + + begin + result.push object if streams and object.data.match(pattern) + rescue Filter::Error + return result # Skip object if a decoding error occured. + end + + return result unless object.is_a?(ObjectStream) and object_streams + + object.each do |child| + result.concat search_object(child, pattern, + streams: streams, object_streams: object_streams) + end + + when Name, String + result.push object if object.value.match(pattern) + + when Dictionary, Array + result.concat object.strings_cache.select{|str| pattern === str} + result.concat object.names_cache.select{|name| pattern === name.value} + end + + result + end + + # # Load an object from its given file offset. # The document must have an associated Parser. # def load_object_at_offset(revision, offset) return nil if @loaded or @parser.nil? @@ -625,23 +614,11 @@ begin object = @parser.parse_object(offset) return nil if object.nil? if self.is_a?(Encryption::EncryptedDocument) - case object - when String - object.extend(Encryption::EncryptedString) - object.decrypted = false - when Stream - object.extend(Encryption::EncryptedStream) - object.decrypted = false - when Dictionary, Array - object.strings_cache.each do |string| - string.extend(Encryption::EncryptedString) - string.decrypted = false - end - end + make_encrypted_object(object) end add_to_revision(object, revision) ensure @parser.pos = pos @@ -649,10 +626,26 @@ object end # + # Method called on encrypted objects loaded into the document. + # + def make_encrypted_object(object) + case object + when String + object.extend(Encryption::EncryptedString) + when Stream + object.extend(Encryption::EncryptedStream) + when Dictionary, Array + object.strings_cache.each do |string| + string.extend(Encryption::EncryptedString) + end + end + end + + # # Force the loading of all objects in the document. # def load_all_objects return if @loaded or @parser.nil? @@ -663,12 +656,12 @@ xrefs = revision.xrefstm else next end - xrefs.each_with_number do |_, no| - self.get_object(no) + xrefs.each_with_number do |xref, no| + self.get_object(no) unless xref.free? end end @loaded = true end @@ -714,15 +707,11 @@ # # Allocates object numbers and creates references. # Invokes object finalization methods. # - if self.is_a?(Encryption::EncryptedDocument) - physicalize(options) - else - physicalize - end + physicalize(options) # # Sets the PDF version header. # version, level = version_required @@ -733,104 +722,62 @@ self end # - # Cleans the document from its references. - # Indirects objects are made direct whenever possible. - # TODO: Circuit-checking to avoid infinite induction + # Converts a logical PDF view into a physical view ready for writing. # - def logicalize #:nodoc: - raise NotImplementedError + def physicalize(options = {}) - processed = [] - - convert = -> (root) do - replaced = [] - if root.is_a?(Dictionary) or root.is_a?(Array) - root.each do |obj| - convert[obj] - end - - root.map! do |obj| - if obj.is_a?(Reference) - target = obj.solve - # Streams can't be direct objects - if target.is_a?(Stream) - obj - else - replaced << obj - target - end - else - obj - end - end - end - - replaced + indirect_objects_by_rev.each do |obj, revision| + build_object(obj, revision, options) end - @revisions.each do |revision| - revision.objects.each do |obj| - processed.concat(convert[obj]) - end - end + self end - # - # Converts a logical PDF view into a physical view ready for writing. - # - def physicalize - - # - # Indirect objects are added to the revision and assigned numbers. - # - build = -> (obj, revision) do - # - # Finalize any subobjects before building the stream. - # - if obj.is_a?(ObjectStream) - obj.each do |subobj| - build.call(subobj, revision) - end + def build_object(object, revision, options) + # Build any compressed object before building the object stream. + if object.is_a?(ObjectStream) + object.each do |compressed_obj| + build_object(compressed_obj, revision, options) end + end - obj.pre_build + object.pre_build - if obj.is_a?(Dictionary) or obj.is_a?(Array) + case object + when Stream + build_object(object.dictionary, revision, options) + when Dictionary, Array + build_compound_object(object, revision, options) + end - obj.map! do |subobj| - if subobj.indirect? - if get_object(subobj.reference) - subobj.reference - else - ref = add_to_revision(subobj, revision) - build.call(subobj, revision) - ref - end - else - subobj - end - end + object.post_build + end - obj.each do |subobj| - build.call(subobj, revision) - end + def build_compound_object(object, revision, options) + return unless object.is_a?(Dictionary) or object.is_a?(Array) - elsif obj.is_a?(Stream) - build.call(obj.dictionary, revision) - end + # Flatten the object by adding indirect objects to the revision and + # replacing them with their reference. + object.map! do |child| + next(child) unless child.indirect? - obj.post_build + if get_object(child.reference) + child.reference + else + reference = add_to_revision(child, revision) + build_object(child, revision, options) + reference + end end - indirect_objects_by_rev.each do |obj, revision| - build.call(obj, revision) - end - - self + # Finalize all the children objects. + object.each do |child| + build_object(child, revision, options) + end end # # Returns the final binary representation of the current document. # @@ -856,13 +803,11 @@ options[:use_xrefstm] = has_objstm options[:use_xreftable] = (not has_objstm) end # Get trailer dictionary - trailer_info = get_trailer_info - raise InvalidPDFError, "No trailer information found" if trailer_info.nil? - trailer_dict = trailer_info.dictionary + trailer_dict = self.trailer.dictionary prev_xref_offset = nil xrefstm_offset = nil # Header @@ -937,11 +882,11 @@ # Process embedded objects if options[:use_xrefstm] and obj.parent != obj and obj.parent.is_a?(ObjectStream) index = obj.parent.index(obj.no) - xrefs_stm << XRefToCompressedObj.new(obj.parent.no, index) + xrefs_stm << XRefToCompressedObject.new(obj.parent.no, index) lastno_stm = obj.no else xrefs_stm << XRef.new(bin.size, obj.generation, XRef::USED) xrefs_table << XRef.new(bin.size, obj.generation, XRef::USED) @@ -1020,10 +965,9 @@ # # Instanciates basic structures required for a valid PDF file. # def init catalog = (self.Catalog = (trailer_key(:Root) || Catalog.new)) - catalog.Pages = PageTreeNode.new.set_indirect(true) @revisions.last.trailer.Root = catalog.reference @loaded = true self