processor.rb in pupa-0.0.5

- old
+ new

@@ -15,10 +15,12 @@
     include Helper
 
     class_attribute :tasks
     self.tasks = []
 
+    attr_reader :report
+
     def_delegators :@logger, :debug, :info, :warn, :error, :fatal
 
     # @param [String] output_dir the directory in which to dump JSON documents
     # @param [String] cache_dir the directory in which to cache HTTP responses
     # @param [Integer] expires_in the cache's expiration time in seconds
@@ -29,10 +31,11 @@
       @output_dir = output_dir
       @options    = options
       @level      = level
       @logger     = Logger.new('pupa', level: level, logdev: logdev)
       @client     = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
+      @report     = {}
     end
 
     # Retrieves and parses a document with a GET request.
     #
     # @param [String] url a URL to an HTML document
@@ -98,24 +101,30 @@
     end
 
     # Dumps scraped objects to disk.
     #
     # @param [Symbol] task_name the name of the scraping task to perform
+    # @return [Integer] the number of scraped objects
     def dump_scraped_objects(task_name)
+      count = 0
       send(task_name).each do |object|
+        count += 1 # we don't know the size of the enumeration
         dump_scraped_object(object)
       end
+      count
     end
 
     # Saves scraped objects to a database.
     #
     # @raise [TSort::Cyclic] if the dependency graph is cyclic
     # @raise [Pupa::Errors::UnprocessableEntity] if an object's foreign keys or
     #   foreign objects cannot be resolved
     # @raise [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
     #   inadvertently saved to the database
     def import
+      @report[:import] = {}
+
       objects = deduplicate(load_scraped_objects)
 
       object_id_to_database_id = {}
 
       if use_dependency_graph?(objects)
@@ -124,11 +133,11 @@
         # Replace object IDs with database IDs in foreign keys and save objects.
         dependency_graph.tsort.each do |id|
           object = objects[id]
           resolve_foreign_keys(object, object_id_to_database_id)
           # The dependency graph strategy only works if there are no foreign objects.
-          object_id_to_database_id[id] = Persistence.new(object).save
+          object_id_to_database_id[id] = import_object(object)
         end
       else
         size = objects.size
 
         # Should be O(n²). If there are foreign objects, we do not know all the
@@ -156,11 +165,11 @@
 
             if resolvable
               progress_made = true
               resolve_foreign_keys(object, object_id_to_database_id)
               resolve_foreign_objects(object)
-              object_id_to_database_id[id] = Persistence.new(object).save
+              object_id_to_database_id[id] = import_object(object)
             end
           end
 
           break if objects.empty? || !progress_made
         end
@@ -202,11 +211,11 @@
     #
     # @param [Object] object a scraped object
     # @raise [Pupa::Errors::DuplicateObjectIdError]
     def dump_scraped_object(object)
       type = object.class.to_s.demodulize.underscore
-      basename = "#{type}_#{object._id}.json"
+      basename = "#{type}_#{object._id.gsub(File::SEPARATOR, '_')}.json"
       path = File.join(@output_dir, basename)
 
       if File.exist?(path)
         raise Errors::DuplicateObjectIdError, "duplicate object ID: #{object._id} (was the same objected yielded twice?)"
       end
@@ -334,18 +343,30 @@
       end
     end
 
     # Resolves an object's foreign objects to database IDs.
     #
-    # @param [Object] an object
+    # @param [Object] object an object
     # @raise [Pupa::Errors::MissingDatabaseIdError]
     def resolve_foreign_objects(object)
       object.foreign_objects.each do |property|
         selector = object[property]
         if selector.present?
           # This method will not be called unless the foreign key is resolvable.
           object["#{property}_id"] = Persistence.find(selector)['_id']
         end
       end
+    end
+
+    # @param [Object] object an object
+    def import_object(object)
+      id = Persistence.new(object).save
+      @report[:import][object._type] ||= Hash.new(0)
+      if id == object._id
+        @report[:import][object._type][:insert] += 1
+      else
+        @report[:import][object._type][:update] += 1
+      end
+      id
     end
   end
 end