module RDF::Normalize class URDNA2015 include RDF::Enumerable include RDF::Util::Logger include Base ## # Create an enumerable with grounded nodes # # @param [RDF::Enumerable] enumerable # @return [RDF::Enumerable] def initialize(enumerable, options) @dataset, @options = enumerable, options end def each(&block) ns = NormalizationState.new(@options) normalize_statements(ns, &block) end protected def normalize_statements(ns, &block) # Map BNodes to the statements they are used by dataset.each_statement do |statement| statement.to_quad.compact.select(&:node?).each do |node| ns.add_statement(node, statement) end end non_normalized_identifiers, simple = ns.bnode_to_statements.keys, true while simple simple = false ns.hash_to_bnodes = {} # Calculate hashes for first degree nodes non_normalized_identifiers.each do |node| hash = log_depth {ns.hash_first_degree_quads(node)} log_debug("1deg") {"hash: #{hash}"} ns.add_bnode_hash(node, hash) end # Create canonical replacements for hashes mapping to a single node ns.hash_to_bnodes.keys.sort.each do |hash| identifier_list = ns.hash_to_bnodes[hash] next if identifier_list.length > 1 node = identifier_list.first id = ns.canonical_issuer.issue_identifier(node) log_debug("single node") {"node: #{node.to_ntriples}, hash: #{hash}, id: #{id}"} non_normalized_identifiers -= identifier_list ns.hash_to_bnodes.delete(hash) simple = true end end # Iterate over hashs having more than one node ns.hash_to_bnodes.keys.sort.each do |hash| identifier_list = ns.hash_to_bnodes[hash] log_debug("multiple nodes") {"node: #{identifier_list.map(&:to_ntriples).join(",")}, hash: #{hash}"} hash_path_list = [] # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements identifier_list.each do |identifier| next if ns.canonical_issuer.issued.include?(identifier) temporary_issuer = IdentifierIssuer.new("_:b") temporary_issuer.issue_identifier(identifier) hash_path_list << log_depth {ns.hash_n_degree_quads(identifier, temporary_issuer)} end log_debug("->") {"hash_path_list: #{hash_path_list.map(&:first).inspect}"} # Create canonical replacements for nodes hash_path_list.sort_by(&:first).map(&:last).each do |issuer| issuer.issued.each do |node| id = ns.canonical_issuer.issue_identifier(node) log_debug("-->") {"node: #{node.to_ntriples}, id: #{id}"} end end end # Yield statements using BNodes from canonical replacements dataset.each_statement do |statement| if statement.has_blank_nodes? quad = statement.to_quad.compact.map do |term| term.node? ? RDF::Node.intern(ns.canonical_issuer.identifier(term)[2..-1]) : term end block.call RDF::Statement.from(quad) else block.call statement end end end private class NormalizationState include RDF::Util::Logger attr_accessor :bnode_to_statements attr_accessor :hash_to_bnodes attr_accessor :canonical_issuer def initialize(options) @options = options @bnode_to_statements, @hash_to_bnodes, @canonical_issuer = {}, {}, IdentifierIssuer.new("_:c14n") end def add_statement(node, statement) bnode_to_statements[node] ||= [] bnode_to_statements[node] << statement unless bnode_to_statements[node].include?(statement) end def add_bnode_hash(node, hash) hash_to_bnodes[hash] ||= [] hash_to_bnodes[hash] << node unless hash_to_bnodes[hash].include?(node) end # @param [RDF::Node] node # @return [String] the SHA256 hexdigest hash of statements using this node, with replacements def hash_first_degree_quads(node) quads = bnode_to_statements[node]. map do |statement| quad = statement.to_quad.map do |t| case t when node then RDF::Node("a") when RDF::Node then RDF::Node("z") else t end end RDF::NQuads::Writer.serialize(RDF::Statement.from(quad)) end log_debug("1deg") {"node: #{node}, quads: #{quads}"} hexdigest(quads.sort.join) end # @param [RDF::Node] related # @param [RDF::Statement] statement # @param [IdentifierIssuer] issuer # @param [String] position one of :s, :o, or :g # @return [String] the SHA256 hexdigest hash def hash_related_node(related, statement, issuer, position) identifier = canonical_issuer.identifier(related) || issuer.identifier(related) || hash_first_degree_quads(related) input = position.to_s input << statement.predicate.to_ntriples unless position == :g input << identifier log_debug("hrel") {"input: #{input.inspect}, hash: #{hexdigest(input)}"} hexdigest(input) end # @param [RDF::Node] identifier # @param [IdentifierIssuer] issuer # @return [Array] the Hash and issuer def hash_n_degree_quads(identifier, issuer) log_debug("ndeg") {"identifier: #{identifier.to_ntriples}"} # hash to related blank nodes map map = {} bnode_to_statements[identifier].each do |statement| hash_related_statement(identifier, statement, issuer, map) end data_to_hash = "" log_debug("ndeg") {"map: #{map.map {|h,l| "#{h}: #{l.map(&:to_ntriples)}"}.join('; ')}"} log_depth do map.keys.sort.each do |hash| list = map[hash] # Iterate over related nodes chosen_path, chosen_issuer = "", nil data_to_hash += hash list.permutation do |permutation| log_debug("ndeg") {"perm: #{permutation.map(&:to_ntriples).join(",")}"} issuer_copy, path, recursion_list = issuer.dup, "", [] permutation.each do |related| if canonical_issuer.identifier(related) path << canonical_issuer.issue_identifier(related) else recursion_list << related if !issuer_copy.identifier(related) path << issuer_copy.issue_identifier(related) end # Skip to the next permutation if chosen path isn't empty and the path is greater than the chosen path break if !chosen_path.empty? && path.length >= chosen_path.length end log_debug("ndeg") {"hash: #{hash}, path: #{path}, recursion: #{recursion_list.map(&:to_ntriples)}"} recursion_list.each do |related| result = log_depth {hash_n_degree_quads(related, issuer_copy)} path << issuer_copy.issue_identifier(related) path << "<#{result.first}>" issuer_copy = result.last break if !chosen_path.empty? && path.length >= chosen_path.length && path > chosen_path end if chosen_path.empty? || path < chosen_path chosen_path, chosen_issuer = path, issuer_copy end end data_to_hash += chosen_path issuer = chosen_issuer end end log_debug("ndeg") {"datatohash: #{data_to_hash.inspect}, hash: #{hexdigest(data_to_hash)}"} return [hexdigest(data_to_hash), issuer] end protected def hexdigest(val) Digest::SHA256.hexdigest(val) end # Group adjacent bnodes by hash def hash_related_statement(identifier, statement, issuer, map) statement.to_hash(:s, :p, :o, :g).each do |pos, term| next if !term.is_a?(RDF::Node) || term == identifier hash = log_depth {hash_related_node(term, statement, issuer, pos)} map[hash] ||= [] map[hash] << term unless map[hash].include?(term) end end end class IdentifierIssuer def initialize(prefix = "_:c14n") @prefix, @counter, @issued = prefix, 0, {} end # Return an identifier for this BNode def issue_identifier(node) @issued[node] ||= begin res, @counter = @prefix + @counter.to_s, @counter + 1 res end end def issued @issued.keys end def identifier(node) @issued[node] end # Duplicate this issuer, ensuring that the issued identifiers remain distinct # @return [IdentifierIssuer] def dup other = super other.instance_variable_set(:@issued, @issued.dup) other end end end end