pdf.rb in origami-2.0.1

- old
+ new

@@ -43,12 +43,10 @@
 require 'origami/annotations'
 require 'origami/actions'
 require 'origami/3d'
 require 'origami/signature'
 require 'origami/webcapture'
-require 'origami/export'
-require 'origami/webcapture'
 require 'origami/encryption'
 require 'origami/linearization'
 require 'origami/obfuscation'
 require 'origami/javascript'
 require 'origami/outline'
@@ -145,39 +143,27 @@
                 pdf = PDF.new
                 yield(pdf) if block_given?
                 pdf.save(output, options)
             end
             alias write create
-
-            #
-            # Deserializes a PDF dump.
-            #
-            def deserialize(filename)
-                Zlib::GzipReader.open(filename) { |gz|
-                    return Marshal.load(gz.read)
-                }
-            end
         end
 
         #
         # Creates a new PDF instance.
         # _parser_:: The Parser object creating the document.
         #            If none is specified, some default structures are automatically created to get a minimal working document.
         #
         def initialize(parser = nil)
             @header = PDF::Header.new
             @revisions = []
+            @parser = parser # nil when the document is not built by a Parser
+            @loaded = false # init sets this to true when no parser is given
 
             add_new_revision
             @revisions.first.trailer = Trailer.new
 
-            if parser
-                @loaded = false
-                @parser = parser
-            else
-                init
-            end
+            init if parser.nil? # no parser: create the default structures right away
         end
 
         #
         # Original file name if parsed from disk, nil otherwise.
         #
@@ -198,25 +184,10 @@
         def original_data
             @parser.target_data if @parser
         end
 
         #
-        # Serializes the current PDF.
-        #
-        def serialize(filename)
-            parser = @parser
-            @parser = nil # do not serialize the parser
-
-            Zlib::GzipWriter.open(filename) { |gz|
-                gz.write Marshal.dump(self)
-            }
-
-            @parser = parser
-            self
-        end
-
-        #
         # Saves the current document.
         # _filename_:: The path where to save this PDF.
         #
         def save(path, params = {})
             options =
@@ -275,39 +246,13 @@
             pattern = /#{Regexp.escape(pattern)}/i if pattern.is_a?(::String)
             raise TypeError, "Expected a String or Regexp" unless pattern.is_a?(Regexp)
 
             result = []
 
-            search_object = -> (object) do
-                case object
-                when Stream
-                    result.concat object.dictionary.strings_cache.select{|str| pattern === str}
-                    result.concat object.dictionary.names_cache.select{|name| pattern === name.value}
-
-                    begin
-                        result.push object if streams and object.data.match(pattern)
-                    rescue Filter::Error
-                        next # Skip object if a decoding error occured.
-                    end
-
-                    next if object.is_a?(ObjectStream) and not object_streams
-
-                    object.each do |subobject|
-                        search_object.call(subobject)
-                    end
-
-                when Name, String
-                    result.push object if object.value.match(pattern)
-
-                when Dictionary, Array then
-                    result.concat object.strings_cache.select{|str| pattern === str}
-                    result.concat object.names_cache.select{|name| pattern === name.value}
-                end
-            end
-
             self.indirect_objects.each do |object|
-                search_object.call(object)
+                result.concat search_object(object, pattern,
+                                            streams: streams, object_streams: object_streams)
             end
 
             result
         end
 
@@ -327,46 +272,26 @@
         #
         # Iterates over the objects of the document.
         # _compressed_: iterates over the objects inside object streams.
         # _recursive_: iterates recursively inside objects like arrays and dictionaries.
         #
-        def each_object(compressed: false, recursive: false)
+        def each_object(compressed: false, recursive: false, &block)
             return enum_for(__method__, compressed: compressed,
                                         recursive: recursive
                            ) unless block_given?
 
-            walk_object = -> (object) do
-                case object
-                when Dictionary
-                    object.each_value do |value|
-                        yield(value)
-                        walk_object.call(value)
-                    end
-
-                when Array
-                    object.each do |child|
-                        yield(child)
-                        walk_object.call(child)
-                    end
-
-                when Stream
-                    yield(object.dictionary)
-                    walk_object.call(object.dictionary)
-                end
-            end
-
             @revisions.each do |revision|
                 revision.each_object do |object|
-                    yield(object)
+                    block.call(object)
 
-                    walk_object.call(object) if recursive
+                    walk_object(object, &block) if recursive
 
                     if object.is_a?(ObjectStream) and compressed
                         object.each do |child_obj|
-                            yield(child_obj)
+                            block.call(child_obj)
 
-                            walk_object.call(child_obj) if recursive
+                            walk_object(child_obj, &block) if recursive # forward the block: without it walk_object only returns an Enumerator
                         end
                     end
                 end
             end
         end
@@ -537,11 +462,11 @@
                 next if xref.nil?
 
                 #
                 # We found a matching XRef.
                 #
-                if xref.is_a?(XRefToCompressedObj)
+                if xref.is_a?(XRefToCompressedObject)
                     objstm = get_object(xref.objstmno, 0, use_xrefstm: use_xrefstm)
 
                     object = objstm.extract_by_index(xref.index)
                     if object.is_a?(Origami::Object) and object.no == target.refno
                         return object
@@ -568,39 +493,39 @@
 
         #
         # Casts a PDF object into another object type.
         # The target type must be a subtype of the original type.
         #
-        def cast_object(reference, type, parser = nil) #:nodoc:
+        def cast_object(reference, type) #:nodoc:
             @revisions.each do |rev|
-                if rev.body.include?(reference) and type < rev.body[reference].class
-                    rev.body[reference] = rev.body[reference].cast_to(type, parser)
+                if rev.body.include?(reference)
+                    object = rev.body[reference]
+                    return object if object.is_a?(type) # already of the requested type: nothing to cast
 
-                    rev.body[reference]
-                else
-                    nil
+                    if type < rev.body[reference].class # cast only towards a subclass of the current class
+                        rev.body[reference] = object.cast_to(type, @parser)
+
+                        return rev.body[reference]
+                    end
                 end
             end
+
+            nil # no revision produced an object of the requested type
         end
 
         #
         # Returns a new number/generation for future object.
         #
         def allocate_new_object_number
-            no = 1
 
-            # Deprecated number allocation policy (first available)
-            #no = no + 1 while get_object(no)
-
-            objset = self.indirect_objects
-            self.indirect_objects.find_all{|obj| obj.is_a?(ObjectStream)}.each do |objstm|
-                objstm.each{|obj| objset << obj}
+            last_object = self.each_object(compressed: true).max_by {|object| object.no } # highest number in use, object streams included
+            if last_object.nil? # the document holds no object yet
+                no = 1
+            else
+                no = last_object.no + 1
             end
 
-            allocated = objset.collect{|obj| obj.no}.compact
-            no = allocated.max + 1 unless allocated.empty?
-
             [ no, 0 ]
         end
 
         #
         # Mark the document as complete.
@@ -613,10 +538,74 @@
         ##########################
         private
         ##########################
 
         #
+        # Iterates over the children of an object, avoiding cycles.
+        #
+        def walk_object(object, excludes: [], &block)
+            return enum_for(__method__, object, excludes: excludes) unless block_given?
+
+            return if excludes.include?(object) # already visited: stop here to avoid cycles
+            excludes.push(object)
+
+            case object
+            when Dictionary
+                object.each_value do |value|
+                    yield(value)
+                    walk_object(value, excludes: excludes, &block) # forward the block so the recursion really descends
+                end
+
+            when Array
+                object.each do |child|
+                    yield(child)
+                    walk_object(child, excludes: excludes, &block)
+                end
+
+            when Stream
+                yield(object.dictionary)
+                walk_object(object.dictionary, excludes: excludes, &block)
+            end
+        end
+
+        #
+        # Searches through an object, possibly going into object streams.
+        # Returns an array of matching strings, names and streams.
+        #
+        def search_object(object, pattern, streams: true, object_streams: true)
+            result = []
+
+            case object
+            when Stream
+                result.concat object.dictionary.strings_cache.select{|str| pattern === str}
+                result.concat object.dictionary.names_cache.select{|name| pattern === name.value}
+
+                begin
+                    result.push object if streams and object.data.match(pattern) # stream data is matched only when streams is set
+                rescue Filter::Error
+                    return result # Skip object if a decoding error occurred.
+                end
+
+                return result unless object.is_a?(ObjectStream) and object_streams # descend only into object streams, and only on request
+
+                object.each do |child|
+                    result.concat search_object(child, pattern,
+                                                streams: streams, object_streams: object_streams)
+                end
+
+            when Name, String
+                result.push object if object.value.match(pattern)
+
+            when Dictionary, Array
+                result.concat object.strings_cache.select{|str| pattern === str}
+                result.concat object.names_cache.select{|name| pattern === name.value}
+            end
+
+            result
+        end
+
+        #
         # Load an object from its given file offset.
         # The document must have an associated Parser.
         #
         def load_object_at_offset(revision, offset)
             return nil if @loaded or @parser.nil?
@@ -625,23 +614,11 @@
             begin
                 object = @parser.parse_object(offset)
                 return nil if object.nil?
 
                 if self.is_a?(Encryption::EncryptedDocument)
-                    case object
-                    when String
-                        object.extend(Encryption::EncryptedString)
-                        object.decrypted = false
-                    when Stream
-                        object.extend(Encryption::EncryptedStream)
-                        object.decrypted = false
-                    when Dictionary, Array
-                        object.strings_cache.each do |string|
-                            string.extend(Encryption::EncryptedString)
-                            string.decrypted = false
-                        end
-                    end
+                    make_encrypted_object(object)
                 end
 
                 add_to_revision(object, revision)
             ensure
                 @parser.pos = pos
@@ -649,10 +626,26 @@
 
             object
         end
 
         #
+        # Method called on encrypted objects loaded into the document.
+        #
+        def make_encrypted_object(object)
+            case object
+            when String
+                object.extend(Encryption::EncryptedString)
+            when Stream
+                object.extend(Encryption::EncryptedStream)
+            when Dictionary, Array
+                object.strings_cache.each do |string| # containers: extend each string listed in their strings_cache
+                    string.extend(Encryption::EncryptedString)
+                end
+            end
+        end
+
+        #
         # Force the loading of all objects in the document.
         #
         def load_all_objects
             return if @loaded or @parser.nil?
 
@@ -663,12 +656,12 @@
                     xrefs = revision.xrefstm
                 else
                     next
                 end
 
-                xrefs.each_with_number do |_, no|
-                    self.get_object(no)
+                xrefs.each_with_number do |xref, no|
+                    self.get_object(no) unless xref.free?
                 end
             end
 
             @loaded = true
         end
@@ -714,15 +707,11 @@
 
             #
             # Allocates object numbers and creates references.
             # Invokes object finalization methods.
             #
-            if self.is_a?(Encryption::EncryptedDocument)
-                physicalize(options)
-            else
-                physicalize
-            end
+            physicalize(options)
 
             #
             # Sets the PDF version header.
             #
             version, level = version_required
@@ -733,104 +722,62 @@
 
             self
         end
 
         #
-        # Cleans the document from its references.
-        # Indirects objects are made direct whenever possible.
-        # TODO: Circuit-checking to avoid infinite induction
+        # Converts a logical PDF view into a physical view ready for writing.
         #
-        def logicalize #:nodoc:
-            raise NotImplementedError
+        def physicalize(options = {})
 
-            processed = []
-
-            convert = -> (root) do
-                replaced = []
-                if root.is_a?(Dictionary) or root.is_a?(Array)
-                    root.each do |obj|
-                        convert[obj]
-                    end
-
-                    root.map! do |obj|
-                        if obj.is_a?(Reference)
-                            target = obj.solve
-                            # Streams can't be direct objects
-                            if target.is_a?(Stream)
-                                obj
-                            else
-                                replaced << obj
-                                target
-                            end
-                        else
-                            obj
-                        end
-                    end
-                end
-
-                replaced
+            indirect_objects_by_rev.each do |obj, revision|
+                build_object(obj, revision, options) # options is handed down unchanged to every build step
             end
 
-            @revisions.each do |revision|
-                revision.objects.each do |obj|
-                    processed.concat(convert[obj])
-                end
-            end
+            self # returns the document itself
         end
 
-        #
-        # Converts a logical PDF view into a physical view ready for writing.
-        #
-        def physicalize
-
-            #
-            # Indirect objects are added to the revision and assigned numbers.
-            #
-            build = -> (obj, revision) do
-                #
-                # Finalize any subobjects before building the stream.
-                #
-                if obj.is_a?(ObjectStream)
-                    obj.each do |subobj|
-                        build.call(subobj, revision)
-                    end
+        def build_object(object, revision, options)
+            # Build any compressed object before building the object stream.
+            if object.is_a?(ObjectStream)
+                object.each do |compressed_obj|
+                    build_object(compressed_obj, revision, options)
                 end
+            end
 
-                obj.pre_build
+            object.pre_build # hook called before the children are built
 
-                if obj.is_a?(Dictionary) or obj.is_a?(Array)
+            case object
+            when Stream
+                build_object(object.dictionary, revision, options) # a stream is built through its dictionary
+            when Dictionary, Array
+                build_compound_object(object, revision, options)
+            end
 
-                    obj.map! do |subobj|
-                        if subobj.indirect?
-                            if get_object(subobj.reference)
-                                subobj.reference
-                            else
-                                ref = add_to_revision(subobj, revision)
-                                build.call(subobj, revision)
-                                ref
-                            end
-                        else
-                            subobj
-                        end
-                    end
+            object.post_build # hook called once the children have been built
+        end
 
-                    obj.each do |subobj|
-                        build.call(subobj, revision)
-                    end
+        def build_compound_object(object, revision, options)
+            return unless object.is_a?(Dictionary) or object.is_a?(Array) # only containers have children to build
 
-                    obj.each do |subobj|
-                        build.call(subobj, revision)
-                    end
+            # Flatten the object by adding indirect objects to the revision and
+            # replacing them with their reference.
+            object.map! do |child|
+                next(child) unless child.indirect? # direct children stay in place
 
-                elsif obj.is_a?(Stream)
-                    build.call(obj.dictionary, revision)
-                end
+                if get_object(child.reference) # get_object found it: reuse the existing reference
+                    child.reference
+                else
+                    reference = add_to_revision(child, revision)
+                    build_object(child, revision, options)
+                    reference
+                end
             end
 
-          indirect_objects_by_rev.each do |obj, revision|
-              build.call(obj, revision)
-          end
-
-          self
+            # Finalize all the children objects.
+            object.each do |child|
+                build_object(child, revision, options)
+            end
         end
 
         #
         # Returns the final binary representation of the current document.
         #
@@ -856,13 +803,11 @@
                 options[:use_xrefstm] = has_objstm
                 options[:use_xreftable] = (not has_objstm)
             end
 
             # Get trailer dictionary
-            trailer_info = get_trailer_info
-            raise InvalidPDFError, "No trailer information found" if trailer_info.nil?
-            trailer_dict = trailer_info.dictionary
+            trailer_dict = self.trailer.dictionary
 
             prev_xref_offset = nil
             xrefstm_offset = nil
 
             # Header
@@ -937,11 +882,11 @@
 
                         # Process embedded objects
                         if options[:use_xrefstm] and obj.parent != obj and obj.parent.is_a?(ObjectStream)
                             index = obj.parent.index(obj.no)
 
-                            xrefs_stm << XRefToCompressedObj.new(obj.parent.no, index)
+                            xrefs_stm << XRefToCompressedObject.new(obj.parent.no, index)
 
                             lastno_stm = obj.no
                         else
                             xrefs_stm << XRef.new(bin.size, obj.generation, XRef::USED)
                             xrefs_table << XRef.new(bin.size, obj.generation, XRef::USED)
@@ -1020,10 +965,9 @@
         #
         # Instanciates basic structures required for a valid PDF file.
         #
         def init
             catalog = (self.Catalog = (trailer_key(:Root) || Catalog.new))
-            catalog.Pages = PageTreeNode.new.set_indirect(true)
             @revisions.last.trailer.Root = catalog.reference
 
             @loaded = true
 
             self