lib/pupa/processor.rb in pupa-0.0.4 vs lib/pupa/processor.rb in pupa-0.0.5

- old
+ new

@@ -15,10 +15,12 @@ include Helper class_attribute :tasks self.tasks = [] + attr_reader :report + def_delegators :@logger, :debug, :info, :warn, :error, :fatal # @param [String] output_dir the directory in which to dump JSON documents # @param [String] cache_dir the directory in which to cache HTTP responses # @param [Integer] expires_in the cache's expiration time in seconds @@ -29,10 +31,11 @@ @output_dir = output_dir @options = options @level = level @logger = Logger.new('pupa', level: level, logdev: logdev) @client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level) + @report = {} end # Retrieves and parses a document with a GET request. # # @param [String] url a URL to an HTML document @@ -98,24 +101,30 @@ end # Dumps scraped objects to disk. # # @param [Symbol] task_name the name of the scraping task to perform + # @return [Integer] the number of scraped objects def dump_scraped_objects(task_name) + count = 0 send(task_name).each do |object| + count += 1 # we don't know the size of the enumeration dump_scraped_object(object) end + count end # Saves scraped objects to a database. # # @raise [TSort::Cyclic] if the dependency graph is cyclic # @raise [Pupa::Errors::UnprocessableEntity] if an object's foreign keys or # foreign objects cannot be resolved # @raise [Pupa::Errors::DuplicateDocumentError] if duplicate objects were # inadvertently saved to the database def import + @report[:import] = {} + objects = deduplicate(load_scraped_objects) object_id_to_database_id = {} if use_dependency_graph?(objects) @@ -124,11 +133,11 @@ # Replace object IDs with database IDs in foreign keys and save objects. dependency_graph.tsort.each do |id| object = objects[id] resolve_foreign_keys(object, object_id_to_database_id) # The dependency graph strategy only works if there are no foreign objects. 
- object_id_to_database_id[id] = Persistence.new(object).save + object_id_to_database_id[id] = import_object(object) end else size = objects.size # Should be O(n²). If there are foreign objects, we do not know all the @@ -156,11 +165,11 @@ if resolvable progress_made = true resolve_foreign_keys(object, object_id_to_database_id) resolve_foreign_objects(object) - object_id_to_database_id[id] = Persistence.new(object).save + object_id_to_database_id[id] = import_object(object) end end break if objects.empty? || !progress_made end @@ -202,11 +211,11 @@ # # @param [Object] object a scraped object # @raise [Pupa::Errors::DuplicateObjectIdError] def dump_scraped_object(object) type = object.class.to_s.demodulize.underscore - basename = "#{type}_#{object._id}.json" + basename = "#{type}_#{object._id.gsub(File::SEPARATOR, '_')}.json" path = File.join(@output_dir, basename) if File.exist?(path) raise Errors::DuplicateObjectIdError, "duplicate object ID: #{object._id} (was the same objected yielded twice?)" end @@ -334,18 +343,30 @@ end end # Resolves an object's foreign objects to database IDs. # - # @param [Object] an object + # @param [Object] object an object # @raise [Pupa::Errors::MissingDatabaseIdError] def resolve_foreign_objects(object) object.foreign_objects.each do |property| selector = object[property] if selector.present? # This method will not be called unless the foreign key is resolvable. object["#{property}_id"] = Persistence.find(selector)['_id'] end end + end + + # @param [Object] object an object + def import_object(object) + id = Persistence.new(object).save + @report[:import][object._type] ||= Hash.new(0) + if id == object._id + @report[:import][object._type][:insert] += 1 + else + @report[:import][object._type][:update] += 1 + end + id end end end