lib/rdf/isomorphic.rb in rdf-isomorphic-0.0.1 vs lib/rdf/isomorphic.rb in rdf-isomorphic-0.1.1

- old
+ new

@@ -14,43 +14,45 @@ # Returns `true` if this RDF::Enumerable is isomorphic with another. # @return [Boolean] # @example # repository_a.isomorphic_with repository_b #=> true - def isomorphic_with(other) + def isomorphic_with?(other) !(bijection_to(other).nil?) end - alias_method :isomorphic?, :isomorphic_with - alias_method :isomorphic_with?, :isomorphic_with + alias_method :isomorphic?, :isomorphic_with? # Returns a hash of RDF::Nodes => RDF::Nodes representing an isomorphic - # bijection of this RDF::Enumerable's blank nodes, or nil if a bijection - # cannot be found. + # bijection of this RDF::Enumerable's to another RDF::Enumerable's blank + # nodes, or nil if a bijection cannot be found. # @example # repository_a.bijection_to repository_b # @param other [RDF::Enumerable] # @return [Hash, nil] def bijection_to(other) - named_statements_match = true - each_statement do |statement| - unless statement.has_blank_nodes? - named_statements_match = other.has_statement?(statement) - end - break unless named_statements_match + + grounded_stmts_match = each_statement.all? do | stmt | + stmt.has_blank_nodes? || other.has_statement?(stmt) end - unless named_statements_match - nil - else + if grounded_stmts_match + # blank_stmts and other_blank_stmts are just a performance + # consideration--we could just as well pass in self and other. But we + # will be iterating over this list quite a bit during the algorithm, so + # we break it down to the parts we're interested in. blank_stmts = find_all { |statement| statement.has_blank_nodes? } other_blank_stmts = other.find_all { |statement| statement.has_blank_nodes? } - nodes = blank_nodes_in(blank_stmts) - other_nodes = blank_nodes_in(other_blank_stmts) + + nodes = RDF::Isomorphic.blank_nodes_in(blank_stmts) + other_nodes = RDF::Isomorphic.blank_nodes_in(other_blank_stmts) build_bijection_to blank_stmts, nodes, other_blank_stmts, other_nodes + else + nil end + end private # The main recursive bijection algorithm. @@ -58,102 +60,127 @@ # This algorithm is very similar to the one explained by Jeremy Carroll in # http://www.hpl.hp.com/techreports/2001/HPL-2001-293.pdf. Page 12 has the # relevant pseudocode. # # Many more comments are in the method itself. + # + # @param [RDF::Enumerable] anon_stmts + # @param [Array] nodes + # @param [RDF::Enumerable] other_anon_stmts + # @param [Array] other_nodes + # @param [Hash] these_grounded_hashes + # @param [Hash] other_grounded_hashes + # @return [nil,Hash] # @private - def build_bijection_to(anon_stmts, nodes, other_anon_stmts, other_nodes, hashes = {}) + def build_bijection_to(anon_stmts, nodes, other_anon_stmts, other_nodes, these_grounded_hashes = {}, other_grounded_hashes = {}) - # Some variable descriptions: - # anon_stmts, other_anon_stmts: All statements from this and other with anonymous nodes - # nodes, other_nodes: All anonymous nodes from this and other - # hashes: hashes of signature of an anonymous nodes' relevant statements. Only contains hashes for grounded nodes. - # potential_hashes: as hashes, but not limited to grounded nodes - # bijection: node => node mapping representing an anonymous node bijection - # bijection_hashes: duplicate of hashes from which we remove hashes to make sure bijection is one to one - - # A grounded node, the difference between the contents of - # potential_hashes and hashes, is a node which has no ungrounded - # anonymous neighbors in a relevant statement. - potential_hashes = {} - [ [anon_stmts,nodes], [other_anon_stmts,other_nodes] ].each do | tuple | - hash_needed = true - while hash_needed - hash_needed = false - tuple.last.each do | node | - unless hashes.member? node - grounded, hash = node_hash_for(node, tuple.first, hashes) unless hashes.member? node - if grounded - hash_needed = true - hashes[node] = hash - end - potential_hashes[node] = hash - end - end - end - end - # see variables above - bijection = {} - bijection_hashes = hashes.dup + # Create a hash signature of every node, based on the signature of + # statements it exists in. + # We also save hashes of nodes that cannot be reliably known; we will use + # that information to eliminate possible recursion combinations. + # + # Any mappings given in the method parameters are considered grounded. + these_hashes, these_ungrounded_hashes = RDF::Isomorphic.hash_nodes(anon_stmts, nodes, these_grounded_hashes) + other_hashes, other_ungrounded_hashes = RDF::Isomorphic.hash_nodes(other_anon_stmts, other_nodes, other_grounded_hashes) - # We are looking for nodes such that - # hashes[node] == hashes[some_other_node] + + # Using the created hashes, map nodes to other_nodes + bijection = {} nodes.each do | node | - tuple = bijection_hashes.find do |k, v| - (v == bijection_hashes[node]) && - # eql? instead of include? since RDF.rb coincedentally-same-named identifiers will be == - other_nodes.any? do | item | k.eql?(item) end + other_node, other_hash = other_hashes.find do | other_node, other_hash | + # we need to use eql?, as coincedentally-named bnode identifiers are == in rdf.rb + these_hashes[node].eql? other_hash end - next unless tuple - target = tuple.first - bijection_hashes.delete target - bijection[node] = target + next unless other_node + bijection[node] = other_node + + # we need to delete , as we don't want two nodes with the same hash + # to be mapped to the same other_node. + other_hashes.delete other_node end - # This if is the return statement, believe it or not. + # bijection is now a mapping of nodes to other_nodes. If all are + # accounted for on both sides, we have a bijection. # - # First, is the anonymous node mapping 1 to 1? - # If so, we have a bijection and are done - if (bijection.keys.sort == nodes.sort) && (bijection.values.sort == other_nodes.sort) - bijection - # So we've got unhashed nodes that can't be definitively grounded. Make - # a tentative bijection between two with identical ungrounded signatures - # in the graph and recurse. - else + # If not, we will speculatively mark pairs with matching ungrounded + # hashes as bijected and recurse. + unless (bijection.keys.sort == nodes.sort) && (bijection.values.sort == other_nodes.sort) bijection = nil - nodes.each do | node | + nodes.any? do | node | + # We don't replace grounded nodes' hashes - next if hashes.member? node - bijectable = other_nodes.any? do | other_node | - # We don't replace grounded nodes' hashes - next if hashes.member? other_node - # The ungrounded signature must match for this pair to have a chance. - # If the signature doesn't match, skip it. - next unless potential_hashes[node] == potential_hashes[other_node] + next if these_hashes.member? node + other_nodes.any? do | other_node | + + # We don't replace grounded other_nodes' hashes + next if other_hashes.member? other_node + + # The ungrounded signature must match for this to potentially work + next unless these_ungrounded_hashes[node] == other_ungrounded_hashes[other_node] + hash = Digest::SHA1.hexdigest(node.to_s) - test_hashes = { node => hash, other_node => hash} - bijection = build_bijection_to(anon_stmts, nodes, other_anon_stmts, other_nodes, hashes.merge(test_hashes)) + bijection = build_bijection_to(anon_stmts, nodes, other_anon_stmts, other_nodes, these_hashes.merge( node => hash), other_hashes.merge(other_node => hash)) end - break if bijection + bijection end - bijection end + + bijection end + # Blank nodes appearing in given list of statements # @private # @return [RDF::Node] - # Blank nodes appearing in given list of statements - def blank_nodes_in(blank_stmt_list) + def self.blank_nodes_in(blank_stmt_list) nodes = [] blank_stmt_list.each do | statement | nodes << statement.object if statement.object.anonymous? nodes << statement.subject if statement.subject.anonymous? end nodes.uniq end - + + # Given a set of statements, create a mapping of node => SHA1 for a given + # set of blank nodes. grounded_hashes is a mapping of node => SHA1 pairs + # that we will take as a given, and use those to make more specific + # signatures of other nodes. + # + # Returns a tuple of hashes: one of grounded hashes, and one of all + # hashes. grounded hashes are based on non-blank nodes and grounded blank + # nodes, and can be used to determine if a node's signature matches + # another. + # + # @param [Array] statements + # @param [Array] nodes + # @param [Hash] grounded_hashes + # @private + # @return [Hash, Hash] + def self.hash_nodes(statements, nodes, grounded_hashes) + hashes = grounded_hashes.dup + ungrounded_hashes = {} + hash_needed = true + + # We may have to go over the list multiple times. If a node is marked as + # grounded, other nodes can then use it to decide their own state of + # grounded. + while hash_needed + hash_needed = false + nodes.each do | node | + unless hashes.member? node + grounded, hash = node_hash_for(node, statements, hashes) + if grounded + hash_needed = true + hashes[node] = hash + end + ungrounded_hashes[node] = hash + end + end + end + [hashes,ungrounded_hashes] + end + # Generate a hash for a node based on the signature of the statements it # appears in. Signatures consist of grounded elements in statements # associated with a node, that is, anything but an ungrounded anonymous # node. Creating the hash is simply hashing a sorted list of each # statement's signature, which is itself a concatenation of the string form @@ -164,56 +191,67 @@ # # Returns a tuple consisting of grounded being true or false and the String # for the hash # @private # @return [Boolean, String] - def node_hash_for(node,statements,hashes) + def self.node_hash_for(node,statements,hashes) statement_signatures = [] grounded = true statements.each do | statement | if (statement.object == node) || (statement.subject == node) - statement_signatures << hash_string_for(statement,hashes) + statement_signatures << hash_string_for(statement,hashes,node) [statement.subject, statement.object].each do | resource | - grounded = false unless grounded(resource, hashes) + grounded = false unless grounded(resource, hashes) || resource == node end end end + # Note that we sort the signatures--without a canonical ordering, + # we might get different hashes for equivalent nodes. [grounded,Digest::SHA1.hexdigest(statement_signatures.sort.to_s)] end - # Provide a string signature for the given statement. + # Provide a string signature for the given statement, collecting + # string signatures for grounded node elements. + # return [String] # @private - def hash_string_for(statement,hashes) - hash = "" - hash << string_for_node(statement.subject,hashes) - hash << statement.predicate.to_s - hash << string_for_node(statement.object,hashes) - hash + def self.hash_string_for(statement,hashes,node) + string = "" + string << string_for_node(statement.subject,hashes,node) + string << statement.predicate.to_s + string << string_for_node(statement.object,hashes,node) + string end # Returns true if a given node is grounded + # A node is groundd if it is not a blank node or it is included + # in the given mapping of grounded nodes. + # @return [Boolean] # @private - def grounded(node, hashes) + def self.grounded(node, hashes) (!(node.anonymous?)) || (hashes.member? node) end # Provides a string for the given node for use in a string signature + # Non-anonymous nodes will return their string form. Grounded anonymous + # nodes will return their hashed form. + # @return [String] # @private - def string_for_node(node, hashes) - if node.anonymous? - if hashes.member? node + def self.string_for_node(node, hashes,target) + case + when node == target + "itself" + when node.anonymous? && hashes.member?(node) hashes[node] + when node.anonymous? + "a blank node" else - "" - end - else - node.to_s + node.to_s end end end - + # Extend RDF::Enumerables with these functions. module Enumerable include RDF::Isomorphic end end