lib/rdf/isomorphic.rb in rdf-isomorphic-0.0.1 vs lib/rdf/isomorphic.rb in rdf-isomorphic-0.1.1
- old
+ new
@@ -14,43 +14,45 @@
# Returns `true` if this RDF::Enumerable is isomorphic with another.
# @return [Boolean]
# @example
# repository_a.isomorphic_with repository_b #=> true
- def isomorphic_with(other)
+ def isomorphic_with?(other)
!(bijection_to(other).nil?)
end
- alias_method :isomorphic?, :isomorphic_with
- alias_method :isomorphic_with?, :isomorphic_with
+ alias_method :isomorphic?, :isomorphic_with?
# Returns a hash of RDF::Nodes => RDF::Nodes representing an isomorphic
- # bijection of this RDF::Enumerable's blank nodes, or nil if a bijection
- # cannot be found.
+ # bijection of this RDF::Enumerable's to another RDF::Enumerable's blank
+ # nodes, or nil if a bijection cannot be found.
# @example
# repository_a.bijection_to repository_b
# @param other [RDF::Enumerable]
# @return [Hash, nil]
def bijection_to(other)
- named_statements_match = true
- each_statement do |statement|
- unless statement.has_blank_nodes?
- named_statements_match = other.has_statement?(statement)
- end
- break unless named_statements_match
+
+ grounded_stmts_match = each_statement.all? do | stmt |
+ stmt.has_blank_nodes? || other.has_statement?(stmt)
end
- unless named_statements_match
- nil
- else
+ if grounded_stmts_match
+ # blank_stmts and other_blank_stmts are just a performance
+ # consideration--we could just as well pass in self and other. But we
+ # will be iterating over this list quite a bit during the algorithm, so
+ # we break it down to the parts we're interested in.
blank_stmts = find_all { |statement| statement.has_blank_nodes? }
other_blank_stmts = other.find_all { |statement| statement.has_blank_nodes? }
- nodes = blank_nodes_in(blank_stmts)
- other_nodes = blank_nodes_in(other_blank_stmts)
+
+ nodes = RDF::Isomorphic.blank_nodes_in(blank_stmts)
+ other_nodes = RDF::Isomorphic.blank_nodes_in(other_blank_stmts)
build_bijection_to blank_stmts, nodes, other_blank_stmts, other_nodes
+ else
+ nil
end
+
end
private
# The main recursive bijection algorithm.
@@ -58,102 +60,127 @@
# This algorithm is very similar to the one explained by Jeremy Carroll in
# http://www.hpl.hp.com/techreports/2001/HPL-2001-293.pdf. Page 12 has the
# relevant pseudocode.
#
# Many more comments are in the method itself.
+ #
+ # @param [RDF::Enumerable] anon_stmts
+ # @param [Array] nodes
+ # @param [RDF::Enumerable] other_anon_stmts
+ # @param [Array] other_nodes
+ # @param [Hash] these_grounded_hashes
+ # @param [Hash] other_grounded_hashes
+ # @return [nil,Hash]
# @private
- def build_bijection_to(anon_stmts, nodes, other_anon_stmts, other_nodes, hashes = {})
+ def build_bijection_to(anon_stmts, nodes, other_anon_stmts, other_nodes, these_grounded_hashes = {}, other_grounded_hashes = {})
- # Some variable descriptions:
- # anon_stmts, other_anon_stmts: All statements from this and other with anonymous nodes
- # nodes, other_nodes: All anonymous nodes from this and other
- # hashes: hashes of signature of an anonymous nodes' relevant statements. Only contains hashes for grounded nodes.
- # potential_hashes: as hashes, but not limited to grounded nodes
- # bijection: node => node mapping representing an anonymous node bijection
- # bijection_hashes: duplicate of hashes from which we remove hashes to make sure bijection is one to one
-
- # A grounded node, the difference between the contents of
- # potential_hashes and hashes, is a node which has no ungrounded
- # anonymous neighbors in a relevant statement.
- potential_hashes = {}
- [ [anon_stmts,nodes], [other_anon_stmts,other_nodes] ].each do | tuple |
- hash_needed = true
- while hash_needed
- hash_needed = false
- tuple.last.each do | node |
- unless hashes.member? node
- grounded, hash = node_hash_for(node, tuple.first, hashes) unless hashes.member? node
- if grounded
- hash_needed = true
- hashes[node] = hash
- end
- potential_hashes[node] = hash
- end
- end
- end
- end
- # see variables above
- bijection = {}
- bijection_hashes = hashes.dup
+ # Create a hash signature of every node, based on the signature of
+ # statements it exists in.
+ # We also save hashes of nodes that cannot be reliably known; we will use
+ # that information to eliminate possible recursion combinations.
+ #
+ # Any mappings given in the method parameters are considered grounded.
+ these_hashes, these_ungrounded_hashes = RDF::Isomorphic.hash_nodes(anon_stmts, nodes, these_grounded_hashes)
+ other_hashes, other_ungrounded_hashes = RDF::Isomorphic.hash_nodes(other_anon_stmts, other_nodes, other_grounded_hashes)
- # We are looking for nodes such that
- # hashes[node] == hashes[some_other_node]
+
+ # Using the created hashes, map nodes to other_nodes
+ bijection = {}
nodes.each do | node |
- tuple = bijection_hashes.find do |k, v|
- (v == bijection_hashes[node]) &&
- # eql? instead of include? since RDF.rb coincedentally-same-named identifiers will be ==
- other_nodes.any? do | item | k.eql?(item) end
+ other_node, other_hash = other_hashes.find do | other_node, other_hash |
+ # we need to use eql?, as coincedentally-named bnode identifiers are == in rdf.rb
+ these_hashes[node].eql? other_hash
end
- next unless tuple
- target = tuple.first
- bijection_hashes.delete target
- bijection[node] = target
+ next unless other_node
+ bijection[node] = other_node
+
+ # we need to delete , as we don't want two nodes with the same hash
+ # to be mapped to the same other_node.
+ other_hashes.delete other_node
end
- # This if is the return statement, believe it or not.
+ # bijection is now a mapping of nodes to other_nodes. If all are
+ # accounted for on both sides, we have a bijection.
#
- # First, is the anonymous node mapping 1 to 1?
- # If so, we have a bijection and are done
- if (bijection.keys.sort == nodes.sort) && (bijection.values.sort == other_nodes.sort)
- bijection
- # So we've got unhashed nodes that can't be definitively grounded. Make
- # a tentative bijection between two with identical ungrounded signatures
- # in the graph and recurse.
- else
+ # If not, we will speculatively mark pairs with matching ungrounded
+ # hashes as bijected and recurse.
+ unless (bijection.keys.sort == nodes.sort) && (bijection.values.sort == other_nodes.sort)
bijection = nil
- nodes.each do | node |
+ nodes.any? do | node |
+
# We don't replace grounded nodes' hashes
- next if hashes.member? node
- bijectable = other_nodes.any? do | other_node |
- # We don't replace grounded nodes' hashes
- next if hashes.member? other_node
- # The ungrounded signature must match for this pair to have a chance.
- # If the signature doesn't match, skip it.
- next unless potential_hashes[node] == potential_hashes[other_node]
+ next if these_hashes.member? node
+ other_nodes.any? do | other_node |
+
+ # We don't replace grounded other_nodes' hashes
+ next if other_hashes.member? other_node
+
+ # The ungrounded signature must match for this to potentially work
+ next unless these_ungrounded_hashes[node] == other_ungrounded_hashes[other_node]
+
hash = Digest::SHA1.hexdigest(node.to_s)
- test_hashes = { node => hash, other_node => hash}
- bijection = build_bijection_to(anon_stmts, nodes, other_anon_stmts, other_nodes, hashes.merge(test_hashes))
+ bijection = build_bijection_to(anon_stmts, nodes, other_anon_stmts, other_nodes, these_hashes.merge( node => hash), other_hashes.merge(other_node => hash))
end
- break if bijection
+ bijection
end
- bijection
end
+
+ bijection
end
+ # Blank nodes appearing in given list of statements
# @private
# @return [RDF::Node]
- # Blank nodes appearing in given list of statements
- def blank_nodes_in(blank_stmt_list)
+ def self.blank_nodes_in(blank_stmt_list)
nodes = []
blank_stmt_list.each do | statement |
nodes << statement.object if statement.object.anonymous?
nodes << statement.subject if statement.subject.anonymous?
end
nodes.uniq
end
-
+
+ # Given a set of statements, create a mapping of node => SHA1 for a given
+ # set of blank nodes. grounded_hashes is a mapping of node => SHA1 pairs
+ # that we will take as a given, and use those to make more specific
+ # signatures of other nodes.
+ #
+ # Returns a tuple of hashes: one of grounded hashes, and one of all
+ # hashes. grounded hashes are based on non-blank nodes and grounded blank
+ # nodes, and can be used to determine if a node's signature matches
+ # another.
+ #
+ # @param [Array] statements
+ # @param [Array] nodes
+ # @param [Hash] grounded_hashes
+ # @private
+ # @return [Hash, Hash]
+ def self.hash_nodes(statements, nodes, grounded_hashes)
+ hashes = grounded_hashes.dup
+ ungrounded_hashes = {}
+ hash_needed = true
+
+ # We may have to go over the list multiple times. If a node is marked as
+ # grounded, other nodes can then use it to decide their own state of
+ # grounded.
+ while hash_needed
+ hash_needed = false
+ nodes.each do | node |
+ unless hashes.member? node
+ grounded, hash = node_hash_for(node, statements, hashes)
+ if grounded
+ hash_needed = true
+ hashes[node] = hash
+ end
+ ungrounded_hashes[node] = hash
+ end
+ end
+ end
+ [hashes,ungrounded_hashes]
+ end
+
# Generate a hash for a node based on the signature of the statements it
# appears in. Signatures consist of grounded elements in statements
# associated with a node, that is, anything but an ungrounded anonymous
# node. Creating the hash is simply hashing a sorted list of each
# statement's signature, which is itself a concatenation of the string form
@@ -164,56 +191,67 @@
#
# Returns a tuple consisting of grounded being true or false and the String
# for the hash
# @private
# @return [Boolean, String]
- def node_hash_for(node,statements,hashes)
+ def self.node_hash_for(node,statements,hashes)
statement_signatures = []
grounded = true
statements.each do | statement |
if (statement.object == node) || (statement.subject == node)
- statement_signatures << hash_string_for(statement,hashes)
+ statement_signatures << hash_string_for(statement,hashes,node)
[statement.subject, statement.object].each do | resource |
- grounded = false unless grounded(resource, hashes)
+ grounded = false unless grounded(resource, hashes) || resource == node
end
end
end
+ # Note that we sort the signatures--without a canonical ordering,
+ # we might get different hashes for equivalent nodes.
[grounded,Digest::SHA1.hexdigest(statement_signatures.sort.to_s)]
end
- # Provide a string signature for the given statement.
+ # Provide a string signature for the given statement, collecting
+ # string signatures for grounded node elements.
+ # return [String]
# @private
- def hash_string_for(statement,hashes)
- hash = ""
- hash << string_for_node(statement.subject,hashes)
- hash << statement.predicate.to_s
- hash << string_for_node(statement.object,hashes)
- hash
+ def self.hash_string_for(statement,hashes,node)
+ string = ""
+ string << string_for_node(statement.subject,hashes,node)
+ string << statement.predicate.to_s
+ string << string_for_node(statement.object,hashes,node)
+ string
end
# Returns true if a given node is grounded
+ # A node is groundd if it is not a blank node or it is included
+ # in the given mapping of grounded nodes.
+ # @return [Boolean]
# @private
- def grounded(node, hashes)
+ def self.grounded(node, hashes)
(!(node.anonymous?)) || (hashes.member? node)
end
# Provides a string for the given node for use in a string signature
+ # Non-anonymous nodes will return their string form. Grounded anonymous
+ # nodes will return their hashed form.
+ # @return [String]
# @private
- def string_for_node(node, hashes)
- if node.anonymous?
- if hashes.member? node
+ def self.string_for_node(node, hashes,target)
+ case
+ when node == target
+ "itself"
+ when node.anonymous? && hashes.member?(node)
hashes[node]
+ when node.anonymous?
+ "a blank node"
else
- ""
- end
- else
- node.to_s
+ node.to_s
end
end
end
-
+ # Extend RDF::Enumerables with these functions.
module Enumerable
include RDF::Isomorphic
end
end