Sha256: b7569ce2fb9b30e3824c30a8e3af7d26ccdcbde9eb7ebb842771ea1b0a33c892

Contents?: true

Size: 943 Bytes

Versions: 2

Compression:

Stored size: 943 Bytes

Contents

# frozen_string_literal: true

require "set"

module LocalitySensitiveHashing
  # Hashing technique which groups similar objects together.
  #
  # A signature is cut into consecutive bands of +n_rows+ values each. Every
  # band owns a hash table keyed by that band's slice of the signature, so
  # documents whose slices are equal in at least one band share a bucket and
  # are reported as a candidate pair by #similar_pairs.
  class LocalitySensitiveHashing
    # @param n_rows [Integer] number of signature values per band
    # @param n_bands [Integer] number of bands (one bucket table is kept per band)
    def initialize(n_rows, n_bands)
      @buckets = Array.new(n_bands) { generate_band_bucket }
      @n_rows = n_rows
    end

    # Files +doc_id+ into one bucket per band, keyed by that band's slice of
    # +signature+.
    #
    # @param signature [Array] exactly n_rows * n_bands values (anything
    #   responding to #length and #each_slice works)
    # @param doc_id [Object] identifier appended to each matching bucket
    # @raise [ArgumentError] if the signature length is not n_rows * n_bands
    def insert(signature, doc_id)
      unless signature.length == @n_rows * @buckets.length
        raise ArgumentError, "signature length does not match n_rows and n_bands"
      end

      signature.each_slice(@n_rows).with_index do |band_signature, band_idx|
        @buckets[band_idx][band_signature] << doc_id
      end
    end

    # Collects every pair of document ids that share a bucket in any band.
    #
    # @return [Set<Array>] two-element arrays of doc ids, ordered as the ids
    #   first appear in the bucket that produced the pair
    def similar_pairs
      @buckets.each_with_object(Set.new) do |band_bucket, similar|
        band_bucket.each_value do |bucket|
          # Collapse repeated ids first: inserting the same doc_id twice into a
          # bucket would otherwise emit self-pairs ([id, id]) and mirrored
          # duplicates ([b, a] next to [a, b]).
          similar.merge(bucket.uniq.combination(2))
        end
      end
    end

    private

    # Builds the table for a single band. The default block gives every new
    # key its own array, so buckets never share storage.
    def generate_band_bucket
      Hash.new { |table, key| table[key] = [] }
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygem

Version Path
doc_sim-0.1.1 lib/doc_sim/locality_sensitive_hashing.rb
doc_sim-0.1.0 lib/doc_sim/locality_sensitive_hashing.rb