lib/rdf/normalize/urdna2015.rb in rdf-normalize-0.1.0 vs lib/rdf/normalize/urdna2015.rb in rdf-normalize-0.3.0.beta1

- old
+ new

@@ -1,10 +1,10 @@ module RDF::Normalize class URDNA2015 include RDF::Enumerable + include RDF::Util::Logger include Base - include Utils ## # Create an enumerable with grounded nodes # # @param [RDF::Enumerable] enumerable @@ -33,49 +33,49 @@ simple = false ns.hash_to_bnodes = {} # Calculate hashes for first degree nodes non_normalized_identifiers.each do |node| - hash = depth {ns.hash_first_degree_quads(node)} - debug("1deg") {"hash: #{hash}"} + hash = log_depth {ns.hash_first_degree_quads(node)} + log_debug("1deg") {"hash: #{hash}"} ns.add_bnode_hash(node, hash) end # Create canonical replacements for hashes mapping to a single node ns.hash_to_bnodes.keys.sort.each do |hash| identifier_list = ns.hash_to_bnodes[hash] next if identifier_list.length > 1 node = identifier_list.first id = ns.canonical_issuer.issue_identifier(node) - debug("single node") {"node: #{node.to_ntriples}, hash: #{hash}, id: #{id}"} + log_debug("single node") {"node: #{node.to_ntriples}, hash: #{hash}, id: #{id}"} non_normalized_identifiers -= identifier_list ns.hash_to_bnodes.delete(hash) simple = true end end # Iterate over hashs having more than one node ns.hash_to_bnodes.keys.sort.each do |hash| identifier_list = ns.hash_to_bnodes[hash] - debug("multiple nodes") {"node: #{identifier_list.map(&:to_ntriples).join(",")}, hash: #{hash}"} + log_debug("multiple nodes") {"node: #{identifier_list.map(&:to_ntriples).join(",")}, hash: #{hash}"} hash_path_list = [] # Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements identifier_list.each do |identifier| next if ns.canonical_issuer.issued.include?(identifier) temporary_issuer = IdentifierIssuer.new("_:b") temporary_issuer.issue_identifier(identifier) - hash_path_list << depth {ns.hash_n_degree_quads(identifier, temporary_issuer)} + hash_path_list << log_depth {ns.hash_n_degree_quads(identifier, temporary_issuer)} end - debug("->") {"hash_path_list: #{hash_path_list.map(&:first).inspect}"} + log_debug("->") {"hash_path_list: #{hash_path_list.map(&:first).inspect}"} # Create canonical replacements for nodes hash_path_list.sort_by(&:first).map(&:last).each do |issuer| issuer.issued.each do |node| id = ns.canonical_issuer.issue_identifier(node) - debug("-->") {"node: #{node.to_ntriples}, id: #{id}"} + log_debug("-->") {"node: #{node.to_ntriples}, id: #{id}"} end end end # Yield statements using BNodes from canonical replacements @@ -92,11 +92,11 @@ end private class NormalizationState - include Utils + include RDF::Util::Logger attr_accessor :bnode_to_statements attr_accessor :hash_to_bnodes attr_accessor :canonical_issuer @@ -114,11 +114,11 @@ hash_to_bnodes[hash] ||= [] hash_to_bnodes[hash] << node unless hash_to_bnodes[hash].include?(node) end # @param [RDF::Node] node - # @return [String] the SHA1 hexdigest hash of statements using this node, with replacements + # @return [String] the SHA256 hexdigest hash of statements using this node, with replacements def hash_first_degree_quads(node) quads = bnode_to_statements[node]. map do |statement| quad = statement.to_quad.map do |t| case t @@ -128,55 +128,55 @@ end end RDF::NQuads::Writer.serialize(RDF::Statement.from(quad)) end - debug("1deg") {"node: #{node}, quads: #{quads}"} + log_debug("1deg") {"node: #{node}, quads: #{quads}"} hexdigest(quads.sort.join) end # @param [RDF::Node] related # @param [RDF::Statement] statement # @param [IdentifierIssuer] issuer # @param [String] position one of :s, :o, or :g - # @return [String] the SHA1 hexdigest hash + # @return [String] the SHA256 hexdigest hash def hash_related_node(related, statement, issuer, position) identifier = canonical_issuer.identifier(related) || issuer.identifier(related) || hash_first_degree_quads(related) input = position.to_s input << statement.predicate.to_ntriples unless position == :g input << identifier - debug("hrel") {"input: #{input.inspect}, hash: #{hexdigest(input)}"} + log_debug("hrel") {"input: #{input.inspect}, hash: #{hexdigest(input)}"} hexdigest(input) end # @param [RDF::Node] identifier # @param [IdentifierIssuer] issuer # @return [Array<String,IdentifierIssuer>] the Hash and issuer def hash_n_degree_quads(identifier, issuer) - debug("ndeg") {"identifier: #{identifier.to_ntriples}"} + log_debug("ndeg") {"identifier: #{identifier.to_ntriples}"} # hash to related blank nodes map map = {} bnode_to_statements[identifier].each do |statement| hash_related_statement(identifier, statement, issuer, map) end data_to_hash = "" - debug("ndeg") {"map: #{map.map {|h,l| "#{h}: #{l.map(&:to_ntriples)}"}.join('; ')}"} - depth do + log_debug("ndeg") {"map: #{map.map {|h,l| "#{h}: #{l.map(&:to_ntriples)}"}.join('; ')}"} + log_depth do map.keys.sort.each do |hash| list = map[hash] # Iterate over related nodes chosen_path, chosen_issuer = "", nil data_to_hash += hash list.permutation do |permutation| - debug("ndeg") {"perm: #{permutation.map(&:to_ntriples).join(",")}"} + log_debug("ndeg") {"perm: #{permutation.map(&:to_ntriples).join(",")}"} issuer_copy, path, recursion_list = issuer.dup, "", [] permutation.each do |related| if canonical_issuer.identifier(related) path << canonical_issuer.issue_identifier(related) @@ -186,14 +186,14 @@ end # Skip to the next permutation if chosen path isn't empty and the path is greater than the chosen path break if !chosen_path.empty? && path.length >= chosen_path.length end - debug("ndeg") {"hash: #{hash}, path: #{path}, recursion: #{recursion_list.map(&:to_ntriples)}"} + log_debug("ndeg") {"hash: #{hash}, path: #{path}, recursion: #{recursion_list.map(&:to_ntriples)}"} recursion_list.each do |related| - result = depth {hash_n_degree_quads(related, issuer_copy)} + result = log_depth {hash_n_degree_quads(related, issuer_copy)} path << issuer_copy.issue_identifier(related) path << "<#{result.first}>" issuer_copy = result.last break if !chosen_path.empty? && path.length >= chosen_path.length && path > chosen_path end @@ -206,26 +206,25 @@ data_to_hash += chosen_path issuer = chosen_issuer end end - debug("ndeg") {"datatohash: #{data_to_hash.inspect}, hash: #{hexdigest(data_to_hash)}"} + log_debug("ndeg") {"datatohash: #{data_to_hash.inspect}, hash: #{hexdigest(data_to_hash)}"} return [hexdigest(data_to_hash), issuer] end protected - # FIXME: should be SHA-256. def hexdigest(val) - Digest::SHA1.hexdigest(val) + Digest::SHA256.hexdigest(val) end # Group adjacent bnodes by hash def hash_related_statement(identifier, statement, issuer, map) statement.to_hash(:s, :p, :o, :g).each do |pos, term| next if !term.is_a?(RDF::Node) || term == identifier - hash = depth {hash_related_node(term, statement, issuer, pos)} + hash = log_depth {hash_related_node(term, statement, issuer, pos)} map[hash] ||= [] map[hash] << term unless map[hash].include?(term) end end end \ No newline at end of file