fuzzy_set.rb in fuzzy_set-1.1.0

- old
+ new

@@ -1,22 +1,37 @@
 require 'string/similarity'
 
 require 'fuzzy_set/version'
-require 'core_ext/string'
 
 # FuzzySet implements a fuzzy-searchable set of strings.
 #
 # As a set, it cannot contain duplicate elements.
 class FuzzySet
-  NGRAM_SIZE = 3
+  # default options for creating new instances
+  DEFAULT_OPTS = {
+    all_matches: false,
+    ngram_size_max: 3,
+    ngram_size_min: 2
+  }
 
-  def initialize(*items)
+  # @param items [#each,#to_s] item(s) to add
+  # @param opts [Hash] options, see {DEFAULT_OPTS}
+  # @option opts [Boolean] :all_matches
+  #   return all matches, even if an exact match is found
+  # @option opts [Fixnum] :ngram_size_max upper limit for ngram sizes
+  # @option opts [Fixnum] :ngram_size_min lower limit for ngram sizes
+  def initialize(*items, **opts)
+    opts = DEFAULT_OPTS.merge(opts)
+
     @items = []
     @denormalize = {}
     @index = {}
+    @all_matches = opts[:all_matches]
+    @ngram_size_max = opts[:ngram_size_max]
+    @ngram_size_min = opts[:ngram_size_min]
 
-    add(*items)
+    add(items)
   end
 
   # Normalizes +query+, and looks up an entry by its normalized value.
   #
   # @param query [String] search query
@@ -27,13 +42,14 @@
 
   # Add one or more +items+ to the set.
   #
   # Each item will be converted into a string and indexed upon adding.
   #
-  # @param items [#to_s] item(s) to add
+  # @param items [#each,#to_s] item(s) to add
   # @return [FuzzySet] +self+
   def add(*items)
+    items = [items].flatten
     items.each do |item|
       item = item.to_s
       return self if @items.include?(item)
 
       id = _add(item)
@@ -51,17 +67,19 @@
   #
   # 1. normalize +query+
   # 2. check for an exact match and return, if present
   # 3. find matches based on Ngrams
   # 4. sort matches by their cosine similarity to +query+
+  #
+  # @param query [String] search query
   def get(query)
     query = normalize(query)
 
     # check for exact match
-    return [@denormalize[query]] if @denormalize[query]
+    return [@denormalize[query]] if !@all_matches && @denormalize[query]
 
-    match_ids = query.ngram(NGRAM_SIZE).map { |ng| @index[ng] }
+    match_ids = matches_for(query)
     match_ids = match_ids.flatten.compact.uniq
     matches = match_ids.map { |id| @items[id] }
 
     # sort matches by their cosine distance to query
     matches.sort_by { |match| 1.0 - String::Similarity.cosine(query, match) }
@@ -83,10 +101,18 @@
     @items.empty?
   end
 
   private
 
+  def matches_for(query)
+    @ngram_size_max.downto(@ngram_size_min).each do |size|
+      match_ids = ngram(query, size).map { |ng| @index[ng] }
+      return match_ids if match_ids.any?
+    end
+    []
+  end
+
   # Normalize a string by removing all non-word characters
   # except spaces and then converting it to lowercase.
   def normalize(str)
     str.gsub(/[^\w ]/, '').downcase
   end
@@ -96,11 +122,27 @@
     normalized = normalize(item)
     @denormalize[normalized] = item
     @items.index(item)
   end
 
+  # calculate Ngrams and add them to the items
   def calculate_grams_for(string, id)
-    string.ngram(NGRAM_SIZE).each do |gram|
-      @index[gram] = (@index[gram] || []).push(id)
+    @ngram_size_max.downto(@ngram_size_min).each do |size|
+      ngram(string, size).each do |gram|
+        @index[gram] = (@index[gram] || []).push(id)
+      end
+    end
+  end
+
+  # break apart the string into strings of length `n`
+  #
+  # @example
+  #     'foobar'.ngram(3)
+  #     # => ["-fo", "foo", "oob", "oba", "bar", "ar-"]
+  def ngram(str, n)
+    fail ArgumentError, "n must be >= 1, is #{n}" if n < 1
+    str = "-#{str}-" if n > 1
+    (str.length - n + 1).times.map do |i|
+      str.slice(i, n)
     end
   end
 end