lib/pupa/processor.rb in pupa-0.0.11 vs lib/pupa/processor.rb in pupa-0.0.12

- old
+ new

@@ -149,11 +149,14 @@ # Replace object IDs with database IDs in foreign keys and save objects. dependency_graph.tsort.each do |id| object = objects[id] resolve_foreign_keys(object, object_id_to_database_id) # The dependency graph strategy only works if there are no foreign objects. - object_id_to_database_id[id] = import_object(object) + + database_id = import_object(object) + object_id_to_database_id[id] = database_id + object_id_to_database_id[database_id] = database_id end else size = objects.size # Should be O(n²). If there are foreign objects, we do not know all the @@ -165,42 +168,37 @@ # exception may not be raised. loop do progress_made = false objects.delete_if do |id,object| - resolvable = true + begin + resolve_foreign_keys(object, object_id_to_database_id) + resolve_foreign_objects(object, object_id_to_database_id) + progress_made = true - resolvable &= object.foreign_keys.all? do |property| - value = object[property] - value.nil? || object_id_to_database_id.key?(value) + database_id = import_object(object) + object_id_to_database_id[id] = database_id + object_id_to_database_id[database_id] = database_id + rescue Pupa::Errors::MissingDatabaseIdError + false end - - resolvable &= object.foreign_objects.all? do |property| - selector = object[property] - selector.blank? || Persistence.find(selector) - end - - if resolvable - progress_made = true - resolve_foreign_keys(object, object_id_to_database_id) - resolve_foreign_objects(object) - object_id_to_database_id[id] = import_object(object) - end end break if objects.empty? || !progress_made end unless objects.empty? - raise Errors::UnprocessableEntity, "couldn't resolve #{objects.size}/#{size} objects:\n #{objects.values.map{|object| MultiJson.dump(object.foreign_properties)}.join("\n ")}" + raise Errors::UnprocessableEntity, "couldn't resolve #{objects.size}/#{size} objects:\n #{objects.values.map{|object| JSON.dump(object.foreign_properties)}.join("\n ")}" end end # Ensure that fingerprints uniquely identified objects. counts = {} object_id_to_database_id.each do |object_id,database_id| - (counts[database_id] ||= []) << object_id + unless object_id == database_id + (counts[database_id] ||= []) << object_id + end end duplicates = counts.select do |_,object_ids| object_ids.size > 1 end unless duplicates.empty? @@ -249,17 +247,32 @@ # Loads scraped objects from disk. # # @return [Hash] a hash of scraped objects keyed by ID def load_scraped_objects {}.tap do |objects| - @store.read_multi(@store.entries).each do |data| - object = data['_type'].camelize.constantize.new(data) + @store.read_multi(@store.entries).each do |properties| + object = load_scraped_object(properties) objects[object._id] = object end end end + # Loads a scraped object from its properties. + # + # @param [Hash] properties the object's properties + # @return [Object] a scraped object + # @raises [Pupa::Errors::MissingObjectTypeError] if the scraped object is + # missing a `_type` property. + def load_scraped_object(properties) + type = properties['_type'] || properties[:_type] + if type + type.camelize.constantize.new(properties) + else + raise Errors::MissingObjectTypeError, "missing _type: #{JSON.dump(properties)}" + end + end + # Removes all duplicate objects and re-assigns any foreign keys. # # @param [Hash] objects a hash of scraped objects keyed by ID # @return [Hash] the objects without duplicates def deduplicate(objects) @@ -339,34 +352,43 @@ # Resolves an object's foreign keys from object IDs to database IDs. # # @param [Object] an object # @param [Hash] a map from object ID to database ID - # @raises [Pupa::Errors::MissingDatabaseIdError] + # @raises [Pupa::Errors::MissingDatabaseIdError] if a foreign key cannot be + # resolved def resolve_foreign_keys(object, map) object.foreign_keys.each do |property| value = object[property] if value - # If using a dependency graph, any foreign key that cannot be resolved - # will cause a key error while building the dependency graph. - # - # If not using a dependency graph, this method will not be called - # unless the foreign key is resolvable. - object[property] = map[value] + if map.key?(value) + object[property] = map[value] + else + raise Errors::MissingDatabaseIdError, "couldn't resolve foreign key: #{property} #{value}" + end end end end # Resolves an object's foreign objects to database IDs. # # @param [Object] object an object - # @raises [Pupa::Errors::MissingDatabaseIdError] - def resolve_foreign_objects(object) + # @param [Hash] a map from object ID to database ID + # @raises [Pupa::Errors::MissingDatabaseIdError] if a foreign object cannot + # be resolved + def resolve_foreign_objects(object, map) object.foreign_objects.each do |property| - selector = object[property] - if selector.present? - # This method will not be called unless the foreign key is resolvable. - object["#{property}_id"] = Persistence.find(selector)['_id'] + value = object[property] + if value.present? + foreign_object = ForeignObject.new(value) + resolve_foreign_keys(foreign_object, map) + document = Persistence.find(foreign_object.to_h) + + if document + object["#{property}_id"] = document['_id'] + else + raise Errors::MissingDatabaseIdError, "couldn't resolve foreign object: #{property} #{value}" + end end end end # @param [Object] object an object