lib/pupa/processor.rb in pupa-0.0.4 vs lib/pupa/processor.rb in pupa-0.0.5
- old
+ new
@@ -15,10 +15,12 @@
include Helper
class_attribute :tasks
self.tasks = []
+ attr_reader :report
+
def_delegators :@logger, :debug, :info, :warn, :error, :fatal
# @param [String] output_dir the directory in which to dump JSON documents
# @param [String] cache_dir the directory in which to cache HTTP responses
# @param [Integer] expires_in the cache's expiration time in seconds
@@ -29,10 +31,11 @@
@output_dir = output_dir
@options = options
@level = level
@logger = Logger.new('pupa', level: level, logdev: logdev)
@client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
+ @report = {}
end
# Retrieves and parses a document with a GET request.
#
# @param [String] url a URL to an HTML document
@@ -98,24 +101,30 @@
end
# Dumps scraped objects to disk.
#
# @param [Symbol] task_name the name of the scraping task to perform
+ # @return [Integer] the number of scraped objects
def dump_scraped_objects(task_name)
# Running tally of dumped objects; becomes the method's return value.
+ count = 0
# Call the method named by task_name and iterate over whatever it returns.
send(task_name).each do |object|
+ count += 1 # we don't know the size of the enumeration
# dump_scraped_object raises Errors::DuplicateObjectIdError when a file for
# the object's ID already exists; that error propagates out of this loop.
dump_scraped_object(object)
end
+ count
end
# Saves scraped objects to a database.
#
# @raise [TSort::Cyclic] if the dependency graph is cyclic
# @raise [Pupa::Errors::UnprocessableEntity] if an object's foreign keys or
# foreign objects cannot be resolved
# @raise [Pupa::Errors::DuplicateDocumentError] if duplicate objects were
# inadvertently saved to the database
def import
+ @report[:import] = {}
+
objects = deduplicate(load_scraped_objects)
object_id_to_database_id = {}
if use_dependency_graph?(objects)
@@ -124,11 +133,11 @@
# Replace object IDs with database IDs in foreign keys and save objects.
dependency_graph.tsort.each do |id|
object = objects[id]
resolve_foreign_keys(object, object_id_to_database_id)
# The dependency graph strategy only works if there are no foreign objects.
- object_id_to_database_id[id] = Persistence.new(object).save
+ object_id_to_database_id[id] = import_object(object)
end
else
size = objects.size
# Should be O(n²). If there are foreign objects, we do not know all the
@@ -156,11 +165,11 @@
if resolvable
progress_made = true
resolve_foreign_keys(object, object_id_to_database_id)
resolve_foreign_objects(object)
- object_id_to_database_id[id] = Persistence.new(object).save
+ object_id_to_database_id[id] = import_object(object)
end
end
break if objects.empty? || !progress_made
end
@@ -202,11 +211,11 @@
#
# @param [Object] object a scraped object
# @raise [Pupa::Errors::DuplicateObjectIdError]
def dump_scraped_object(object)
type = object.class.to_s.demodulize.underscore
- basename = "#{type}_#{object._id}.json"
+ basename = "#{type}_#{object._id.gsub(File::SEPARATOR, '_')}.json"
path = File.join(@output_dir, basename)
if File.exist?(path)
raise Errors::DuplicateObjectIdError, "duplicate object ID: #{object._id} (was the same objected yielded twice?)"
end
@@ -334,18 +343,30 @@
end
end
# Resolves an object's foreign objects to database IDs.
#
- # @param [Object] an object
+ # @param [Object] object an object
# @raise [Pupa::Errors::MissingDatabaseIdError]
def resolve_foreign_objects(object)
# Each entry of foreign_objects is a property name; the value stored under
# that property on the object is used as a selector for a lookup.
object.foreign_objects.each do |property|
selector = object[property]
# Properties whose selector is blank are left untouched.
if selector.present?
# This method will not be called unless the foreign key is resolvable.
# The '_id' of the record returned by Persistence.find is written to the
# companion "<property>_id" key on the object.
# NOTE(review): presumably Persistence.find returns nil when nothing
# matches, which would make the ['_id'] lookup fail - confirm that callers
# always check resolvability first, as the comment above states.
object["#{property}_id"] = Persistence.find(selector)['_id']
end
end
+ end
+
+ # @param [Object] object an object
+ def import_object(object)
# Saves the object through Persistence, tallies the outcome in
# @report[:import] under the object's _type, and returns the saved ID.
# Expects @report[:import] to already be a hash; `import` assigns it before
# calling this method.
+ id = Persistence.new(object).save
# Create the per-type counter on first use; Hash.new(0) makes missing
# counters start at zero so += works without initialization.
+ @report[:import][object._type] ||= Hash.new(0)
# A returned ID equal to the object's own _id is counted as an insert; any
# other ID is counted as an update.
+ if id == object._id
+ @report[:import][object._type][:insert] += 1
+ else
+ @report[:import][object._type][:update] += 1
+ end
# Callers store this ID in their object-ID-to-database-ID map.
+ id
end
end
end