lib/doc_sim/minhash.rb in doc_sim-0.1.0 vs lib/doc_sim/minhash.rb in doc_sim-0.1.1

- old
+ new

@@ -5,23 +5,26 @@ module Minhash # Class for generating Minhash signature class Minhash attr_reader :seed_root + # Hashes will always be <= 2**32 + HASH_MAX = (2**32) + 1 + def initialize(n_hashes = 1, seed_root = rand(2**32)) @seed_root = seed_root @hashes = Array.new(n_hashes) do |seed| ->(x) { MurmurHash3::V32.str_hash(x, seed_root + seed) } end end # Produces the Minhash signature for a given Set # - # @param set [Set] the set to produce the signature for + # @param set [Set[String]] the set to produce the signature for # # @return [Array[Integer]] 32 bit integer array of length n_hashes def signature(set) - counter = Array.new(@hashes.length, Float::INFINITY) + counter = Array.new(@hashes.length, Minhash::HASH_MAX) set.each do |elem| @hashes.each_with_index do |hash_func, i| counter[i] = [counter[i], hash_func.call(elem)].min end end