Sha256: c2cb64b206aff56679756ff6838f2b790d9ccc3ad3a315c5a160fb51d190910d

Contents?: true

Size: 917 Bytes

Versions: 1

Compression:

Stored size: 917 Bytes

Contents

# frozen_string_literal: true

require "murmurhash3"

module Minhash
  # Builds MinHash signatures from a family of seeded MurmurHash3 (32-bit
  # variant) hash functions.
  #
  # The hash function at zero-based position +i+ is seeded with
  # <tt>seed_root + i</tt>, so instances created with the same +n_hashes+
  # and +seed_root+ use the same seeds.
  class Minhash
    # Starting value for every signature slot. It sits above 2**32 so that
    # any hash value (expected to fit in 32 bits) replaces it.
    HASH_MAX = 2**32 + 1

    # @return [Integer] base seed from which the per-function seeds are derived
    attr_reader :seed_root

    # @param n_hashes [Integer] number of hash functions, i.e. signature length
    # @param seed_root [Integer] base seed; drawn at random from 0...2**32
    #   when omitted
    def initialize(n_hashes = 1, seed_root = rand(2**32))
      @seed_root = seed_root
      @hashes = Array.new(n_hashes) { |offset| hasher_for(seed_root, offset) }
    end

    # Computes the MinHash signature of a collection of strings.
    #
    # Each slot holds the smallest value the corresponding hash function
    # produced over all elements. When +set+ yields no elements, every slot
    # keeps its initial HASH_MAX value.
    #
    # @param set [Set[String]] the elements to summarise; only +each+ is used
    #
    # @return [Array[Integer]] one minimum per hash function, n_hashes long
    def signature(set)
      minima = Array.new(@hashes.length, HASH_MAX)
      set.each do |member|
        @hashes.each_with_index do |hasher, slot|
          digest = hasher.call(member)
          minima[slot] = digest if digest < minima[slot]
        end
      end
      minima
    end

    private

    # Returns a one-argument lambda that hashes a string with the seed
    # <tt>base + offset</tt>. The seed is computed on each call, not here.
    def hasher_for(base, offset)
      lambda { |str| MurmurHash3::V32.str_hash(str, base + offset) }
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
doc_sim-0.1.1 lib/doc_sim/minhash.rb