module RDF::Normalize class Carroll2001 include RDF::Enumerable include Base include Utils ## # Create an enumerable with grounded nodes # # @param [RDF::Enumerable] enumerable # @return [RDF::Enumerable] def initialize(enumerable, options) @dataset = enumerable end def each(&block) ground_statements, anon_statements = [], [] dataset.each_statement do |statement| (statement.has_blank_nodes? ? anon_statements : ground_statements) << statement end nodes = anon_statements.map(&:to_quad).flatten.compact.select(&:node?).uniq # Create a hash signature of every node, based on the signature of # statements it exists in. # We also save hashes of nodes that cannot be reliably known; we will use # that information to eliminate possible recursion combinations. # # Any mappings given in the method parameters are considered grounded. hashes, ungrounded_hashes = hash_nodes(anon_statements, nodes, {}) # FIXME: likely need to iterate until hashes and ungrounded_hashes are the same size while hashes.size != ungrounded_hashes.size raise "Not done" end # Enumerate all statements, replacing nodes with new ground nodes using the hash as an identifier ground_statements.each(&block) anon_statements.each do |statement| quad = statement.to_quad.compact.map do |term| term.node? ? RDF::Node.intern(hashes[term]) : term end block.call RDF::Statement.from(quad) end end private # Given a set of statements, create a mapping of node => SHA1 for a given # set of blank nodes. # # Returns a tuple of hashes: one of grounded hashes, and one of all # hashes. grounded hashes are based on non-blank nodes and grounded blank # nodes, and can be used to determine if a node's signature matches # another. # # @param [Array] statements # @param [Array] nodes # @param [Hash] grounded_hashes # mapping of node => SHA1 pairs as input, used to create more specific signatures of other nodes. # @private # @return [Hash, Hash] def hash_nodes(statements, nodes, grounded_hashes) hashes = grounded_hashes.dup ungrounded_hashes = {} hash_needed = true # We may have to go over the list multiple times. If a node is marked as # grounded, other nodes can then use it to decide their own state of # grounded. while hash_needed starting_grounded_nodes = hashes.size nodes.each do | node | unless hashes.member? node grounded, hash = node_hash_for(node, statements, hashes) if grounded hashes[node] = hash end ungrounded_hashes[node] = hash end end # after going over the list, any nodes with a unique hash can be marked # as grounded, even if we have not tied them back to a root yet. uniques = {} ungrounded_hashes.each do |node, hash| uniques[hash] = uniques.has_key?(hash) ? false : node end uniques.each do |hash, node| hashes[node] = hash if node end hash_needed = starting_grounded_nodes != hashes.size end [hashes, ungrounded_hashes] end # Generate a hash for a node based on the signature of the statements it # appears in. Signatures consist of grounded elements in statements # associated with a node, that is, anything but an ungrounded anonymous # node. Creating the hash is simply hashing a sorted list of each # statement's signature, which is itself a concatenation of the string form # of all grounded elements. # # Nodes other than the given node are considered grounded if they are a # member in the given hash. # # @param [RDF::Node] node # @param [Array] statements # @param [Hash] hashes # @return [Boolean, String] # a tuple consisting of grounded being true or false and the String for the hash def node_hash_for(node, statements, hashes) statement_signatures = [] grounded = true statements.each do | statement | if statement.to_quad.include?(node) statement_signatures << hash_string_for(statement, hashes, node) statement.to_quad.compact.each do | resource | grounded = false unless grounded?(resource, hashes) || resource == node end end end # Note that we sort the signatures--without a canonical ordering, # we might get different hashes for equivalent nodes. [grounded,Digest::SHA1.hexdigest(statement_signatures.sort.to_s)] end # Provide a string signature for the given statement, collecting # string signatures for grounded node elements. # @return [String] def hash_string_for(statement, hashes, node) statement.to_quad.map {|r| string_for_node(r, hashes, node)}.join("") end # Returns true if a given node is grounded # A node is groundd if it is not a blank node or it is included # in the given mapping of grounded nodes. # @return [Boolean] def grounded?(node, hashes) (!(node.node?)) || (hashes.member? node) end # Provides a string for the given node for use in a string signature # Non-anonymous nodes will return their string form. Grounded anonymous # nodes will return their hashed form. # @return [String] def string_for_node(node, hashes, target) case when node.nil? "" when node == target "itself" when node.node? && hashes.member?(node) hashes[node] when node.node? "a blank node" # RDF.rb auto-boxing magic makes some literals the same when they # should not be; the ntriples serializer will take care of us when node.literal? node.class.name + RDF::NTriples.serialize(node) else node.to_s end end end end