lib/fuzzy_set.rb in fuzzy_set-1.0.0 vs lib/fuzzy_set.rb in fuzzy_set-1.1.0
- old
+ new
@@ -1,22 +1,37 @@
require 'string/similarity'
require 'fuzzy_set/version'
-require 'core_ext/string'
# FuzzySet implements a fuzzy-searchable set of strings.
#
# As a set, it cannot contain duplicate elements.
class FuzzySet
- NGRAM_SIZE = 3
+ # default options for creating new instances
+ DEFAULT_OPTS = {
+ all_matches: false,
+ ngram_size_max: 3,
+ ngram_size_min: 2
+ }
- def initialize(*items)
+ # @param items [#each,#to_s] item(s) to add
+ # @param opts [Hash] options, see {DEFAULT_OPTS}
+ # @option opts [Boolean] :all_matches
+ # return all matches, even if an exact match is found
+ # @option opts [Fixnum] :ngram_size_max upper limit for ngram sizes
+ # @option opts [Fixnum] :ngram_size_min lower limit for ngram sizes
+ def initialize(*items, **opts)
+ opts = DEFAULT_OPTS.merge(opts)
+
@items = []
@denormalize = {}
@index = {}
+ @all_matches = opts[:all_matches]
+ @ngram_size_max = opts[:ngram_size_max]
+ @ngram_size_min = opts[:ngram_size_min]
- add(*items)
+ add(items)
end
# Normalizes +query+, and looks up an entry by its normalized value.
#
# @param query [String] search query
@@ -27,13 +42,14 @@
# Add one or more +items+ to the set.
#
# Each item will be converted into a string and indexed upon adding.
#
- # @param items [#to_s] item(s) to add
+ # @param items [#each,#to_s] item(s) to add
# @return [FuzzySet] +self+
def add(*items)
+ items = [items].flatten
items.each do |item|
item = item.to_s
return self if @items.include?(item)
id = _add(item)
@@ -51,17 +67,19 @@
#
# 1. normalize +query+
# 2. check for an exact match and return, if present
# 3. find matches based on Ngrams
# 4. sort matches by their cosine similarity to +query+
+ #
+ # @param query [String] search query
def get(query)
query = normalize(query)
# check for exact match
- return [@denormalize[query]] if @denormalize[query]
+ return [@denormalize[query]] if !@all_matches && @denormalize[query]
- match_ids = query.ngram(NGRAM_SIZE).map { |ng| @index[ng] }
+ match_ids = matches_for(query)
match_ids = match_ids.flatten.compact.uniq
matches = match_ids.map { |id| @items[id] }
# sort matches by their cosine distance to query
matches.sort_by { |match| 1.0 - String::Similarity.cosine(query, match) }
@@ -83,10 +101,18 @@
@items.empty?
end
private
+ def matches_for(query)
+ @ngram_size_max.downto(@ngram_size_min).each do |size|
+ match_ids = ngram(query, size).map { |ng| @index[ng] }
+ return match_ids if match_ids.any?
+ end
+ []
+ end
+
# Normalize a string by removing all non-word characters
# except spaces and then converting it to lowercase.
def normalize(str)
str.gsub(/[^\w ]/, '').downcase
end
@@ -96,11 +122,27 @@
normalized = normalize(item)
@denormalize[normalized] = item
@items.index(item)
end
+ # calculate Ngrams and add them to the items
def calculate_grams_for(string, id)
- string.ngram(NGRAM_SIZE).each do |gram|
- @index[gram] = (@index[gram] || []).push(id)
+ @ngram_size_max.downto(@ngram_size_min).each do |size|
+ ngram(string, size).each do |gram|
+ @index[gram] = (@index[gram] || []).push(id)
+ end
+ end
+ end
+
+ # break apart the string into strings of length `n`
+ #
+ # @example
+ # 'foobar'.ngram(3)
+ # # => ["-fo", "foo", "oob", "oba", "bar", "ar-"]
+ def ngram(str, n)
+ fail ArgumentError, "n must be >= 1, is #{n}" if n < 1
+ str = "-#{str}-" if n > 1
+ (str.length - n + 1).times.map do |i|
+ str.slice(i, n)
end
end
end