lib/fuzzy_set.rb in fuzzy_set-1.0.0 vs lib/fuzzy_set.rb in fuzzy_set-1.1.0

- old
+ new

@@ -1,22 +1,37 @@ require 'string/similarity' require 'fuzzy_set/version' -require 'core_ext/string' # FuzzySet implements a fuzzy-searchable set of strings. # # As a set, it cannot contain duplicate elements. class FuzzySet - NGRAM_SIZE = 3 + # default options for creating new instances + DEFAULT_OPTS = { + all_matches: false, + ngram_size_max: 3, + ngram_size_min: 2 + } - def initialize(*items) + # @param items [#each,#to_s] item(s) to add + # @param opts [Hash] options, see {DEFAULT_OPTS} + # @option opts [Boolean] :all_matches + # return all matches, even if an exact match is found + # @option opts [Fixnum] :ngram_size_max upper limit for ngram sizes + # @option opts [Fixnum] :ngram_size_min lower limit for ngram sizes + def initialize(*items, **opts) + opts = DEFAULT_OPTS.merge(opts) + @items = [] @denormalize = {} @index = {} + @all_matches = opts[:all_matches] + @ngram_size_max = opts[:ngram_size_max] + @ngram_size_min = opts[:ngram_size_min] - add(*items) + add(items) end # Normalizes +query+, and looks up an entry by its normalized value. # # @param query [String] search query @@ -27,13 +42,14 @@ # Add one or more +items+ to the set. # # Each item will be converted into a string and indexed upon adding. # - # @param items [#to_s] item(s) to add + # @param items [#each,#to_s] item(s) to add # @return [FuzzySet] +self+ def add(*items) + items = [items].flatten items.each do |item| item = item.to_s return self if @items.include?(item) id = _add(item) @@ -51,17 +67,19 @@ # # 1. normalize +query+ # 2. check for an exact match and return, if present # 3. find matches based on Ngrams # 4. sort matches by their cosine similarity to +query+ + # + # @param query [String] search query def get(query) query = normalize(query) # check for exact match - return [@denormalize[query]] if @denormalize[query] + return [@denormalize[query]] if !@all_matches && @denormalize[query] - match_ids = query.ngram(NGRAM_SIZE).map { |ng| @index[ng] } + match_ids = matches_for(query) match_ids = match_ids.flatten.compact.uniq matches = match_ids.map { |id| @items[id] } # sort matches by their cosine distance to query matches.sort_by { |match| 1.0 - String::Similarity.cosine(query, match) } @@ -83,10 +101,18 @@ @items.empty? end private + def matches_for(query) + @ngram_size_max.downto(@ngram_size_min).each do |size| + match_ids = ngram(query, size).map { |ng| @index[ng] } + return match_ids if match_ids.any? + end + [] + end + # Normalize a string by removing all non-word characters # except spaces and then converting it to lowercase. def normalize(str) str.gsub(/[^\w ]/, '').downcase end @@ -96,11 +122,27 @@ normalized = normalize(item) @denormalize[normalized] = item @items.index(item) end + # calculate Ngrams and add them to the items def calculate_grams_for(string, id) - string.ngram(NGRAM_SIZE).each do |gram| - @index[gram] = (@index[gram] || []).push(id) + @ngram_size_max.downto(@ngram_size_min).each do |size| + ngram(string, size).each do |gram| + @index[gram] = (@index[gram] || []).push(id) + end + end + end + + # break apart the string into strings of length `n` + # + # @example + # 'foobar'.ngram(3) + # # => ["-fo", "foo", "oob", "oba", "bar", "ar-"] + def ngram(str, n) + fail ArgumentError, "n must be >= 1, is #{n}" if n < 1 + str = "-#{str}-" if n > 1 + (str.length - n + 1).times.map do |i| + str.slice(i, n) end end end