lib/pupa/processor.rb in pupa-0.0.11 vs lib/pupa/processor.rb in pupa-0.0.12
- old
+ new
@@ -149,11 +149,14 @@
# Replace object IDs with database IDs in foreign keys and save objects.
dependency_graph.tsort.each do |id|
object = objects[id]
resolve_foreign_keys(object, object_id_to_database_id)
# The dependency graph strategy only works if there are no foreign objects.
- object_id_to_database_id[id] = import_object(object)
+
+ database_id = import_object(object)
+ object_id_to_database_id[id] = database_id
+ object_id_to_database_id[database_id] = database_id
end
else
size = objects.size
# Should be O(n²). If there are foreign objects, we do not know all the
@@ -165,42 +168,37 @@
# exception may not be raised.
loop do
progress_made = false
objects.delete_if do |id,object|
- resolvable = true
+ begin
+ resolve_foreign_keys(object, object_id_to_database_id)
+ resolve_foreign_objects(object, object_id_to_database_id)
+ progress_made = true
- resolvable &= object.foreign_keys.all? do |property|
- value = object[property]
- value.nil? || object_id_to_database_id.key?(value)
+ database_id = import_object(object)
+ object_id_to_database_id[id] = database_id
+ object_id_to_database_id[database_id] = database_id
+ rescue Pupa::Errors::MissingDatabaseIdError
+ false
end
-
- resolvable &= object.foreign_objects.all? do |property|
- selector = object[property]
- selector.blank? || Persistence.find(selector)
- end
-
- if resolvable
- progress_made = true
- resolve_foreign_keys(object, object_id_to_database_id)
- resolve_foreign_objects(object)
- object_id_to_database_id[id] = import_object(object)
- end
end
break if objects.empty? || !progress_made
end
unless objects.empty?
- raise Errors::UnprocessableEntity, "couldn't resolve #{objects.size}/#{size} objects:\n #{objects.values.map{|object| MultiJson.dump(object.foreign_properties)}.join("\n ")}"
+ raise Errors::UnprocessableEntity, "couldn't resolve #{objects.size}/#{size} objects:\n #{objects.values.map{|object| JSON.dump(object.foreign_properties)}.join("\n ")}"
end
end
# Ensure that fingerprints uniquely identified objects.
counts = {}
object_id_to_database_id.each do |object_id,database_id|
- (counts[database_id] ||= []) << object_id
+ unless object_id == database_id
+ (counts[database_id] ||= []) << object_id
+ end
end
duplicates = counts.select do |_,object_ids|
object_ids.size > 1
end
unless duplicates.empty?
@@ -249,17 +247,32 @@
# Loads scraped objects from disk.
#
# @return [Hash] a hash of scraped objects keyed by ID
def load_scraped_objects
{}.tap do |objects|
- @store.read_multi(@store.entries).each do |data|
- object = data['_type'].camelize.constantize.new(data)
+ @store.read_multi(@store.entries).each do |properties|
+ object = load_scraped_object(properties)
objects[object._id] = object
end
end
end
+ # Loads a scraped object from its properties.
+ #
+ # @param [Hash] properties the object's properties
+ # @return [Object] a scraped object
+ # @raise [Pupa::Errors::MissingObjectTypeError] if the scraped object is
+ # missing a `_type` property.
+ def load_scraped_object(properties)
+ type = properties['_type'] || properties[:_type]
+ if type
+ type.camelize.constantize.new(properties)
+ else
+ raise Errors::MissingObjectTypeError, "missing _type: #{JSON.dump(properties)}"
+ end
+ end
+
# Removes all duplicate objects and re-assigns any foreign keys.
#
# @param [Hash] objects a hash of scraped objects keyed by ID
# @return [Hash] the objects without duplicates
def deduplicate(objects)
@@ -339,34 +352,43 @@
# Resolves an object's foreign keys from object IDs to database IDs.
#
# @param [Object] object an object
# @param [Hash] map a map from object ID to database ID
- # @raises [Pupa::Errors::MissingDatabaseIdError]
+ # @raise [Pupa::Errors::MissingDatabaseIdError] if a foreign key cannot be
+ # resolved
def resolve_foreign_keys(object, map)
object.foreign_keys.each do |property|
value = object[property]
if value
- # If using a dependency graph, any foreign key that cannot be resolved
- # will cause a key error while building the dependency graph.
- #
- # If not using a dependency graph, this method will not be called
- # unless the foreign key is resolvable.
- object[property] = map[value]
+ if map.key?(value)
+ object[property] = map[value]
+ else
+ raise Errors::MissingDatabaseIdError, "couldn't resolve foreign key: #{property} #{value}"
+ end
end
end
end
# Resolves an object's foreign objects to database IDs.
#
# @param [Object] object an object
- # @raises [Pupa::Errors::MissingDatabaseIdError]
- def resolve_foreign_objects(object)
+ # @param [Hash] map a map from object ID to database ID
+ # @raise [Pupa::Errors::MissingDatabaseIdError] if a foreign object cannot
+ # be resolved
+ def resolve_foreign_objects(object, map)
object.foreign_objects.each do |property|
- selector = object[property]
- if selector.present?
- # This method will not be called unless the foreign key is resolvable.
- object["#{property}_id"] = Persistence.find(selector)['_id']
+ value = object[property]
+ if value.present?
+ foreign_object = ForeignObject.new(value)
+ resolve_foreign_keys(foreign_object, map)
+ document = Persistence.find(foreign_object.to_h)
+
+ if document
+ object["#{property}_id"] = document['_id']
+ else
+ raise Errors::MissingDatabaseIdError, "couldn't resolve foreign object: #{property} #{value}"
+ end
end
end
end
# @param [Object] object an object