lib/eco/data/fuzzy_match.rb in eco-helpers-2.0.18 vs lib/eco/data/fuzzy_match.rb in eco-helpers-2.0.19
- old
+ new
@@ -25,42 +25,48 @@
include StringHelpers
include Pairing
include CharsPositionScore
include NGramsScore
# Computes the Jaro-Winkler score of two strings through `JaroWinkler.distance`.
# - Old version: always used the fixed defaults `ignore_case: true` and `weight: 0.25`.
# - New version: accepts `**options` and merges them over those defaults, so a caller
#   can override either value (or pass further keys on to `JaroWinkler.distance`).
# @param str1 first value to compare (presumably a `String`)
# @param str2 second value to compare (presumably a `String`)
# @param options [Hash] overrides for the defaults (new version only)
# @return the value returned by `JaroWinkler.distance`
- def jaro_winkler(str1, str2)
+ def jaro_winkler(str1, str2, **options)
options = {
ignore_case: true,
weight: 0.25
- }
+ }.merge(options)
JaroWinkler.distance(str1, str2, **options)
end
end
module InstanceMethods
# Option keys that `fuzzy_match_options` keeps (via `Hash#slice`) when it builds
# the options hash handed to `::FuzzyMatch.new`.
+ FUZZY_MATCH_OPTIONS = [
+ :identities, :groupings, :stop_words, :read,
+ :must_match_grouping, :must_match_at_least_one_word,
+ :gather_last_result, :threshold
+ ]
+
# Option keys sliced out of `fuzzy_options` by the private scoring helpers:
# `JARO_OPTIONS` by `jaro`, `NGRAMS_OPTIONS` by `ngram` and `words_ngram`,
# and `POSITION_OPTIONS` by `position`.
# NOTE(review): `RESULTS_OPTIONS` is not referenced in the code visible here
# (`:order` and `:threshold` are read directly from `fuzzy_options`) - confirm it is used elsewhere.
+ JARO_OPTIONS = [:ignore_case, :weight]
+ NGRAMS_OPTIONS = [:range]
+ POSITION_OPTIONS = [:max_distance]
+ RESULTS_OPTIONS = [:order, :threshold]
+
include StopWords
# Generates the `fuzzy_options` reader and writer; the reader is redefined right below.
attr_accessor :fuzzy_options
# Reader override that lazily defaults `@fuzzy_options` to an empty Hash,
# so callers never receive `nil`.
# @return [Hash]
def fuzzy_options
@fuzzy_options ||= {}
end
# Builds the `::FuzzyMatch` instance used for searching and memoizes it in `@fuzzy_match`.
# - Old version: returned the memoized matcher whenever it was already defined, forced the
#   default `stop_words` into `@fuzzy_options`, and validated the haystack elements inline.
# - New version: reuses the memoized matcher only when no `haystack_data` is given and the
#   filtered options (`fuzzy_match_options`) are unchanged; otherwise it stores `options` in
#   `@fuzzy_options` and builds a new matcher. Element validation now lives in `haystack`.
# @param haystack_data [Enumerable, nil] data to search in; when `nil`, `haystack` falls back
#   to `self` if it is an `Enumerable`
# @param options [Hash] fuzzy options; only the keys listed in `FUZZY_MATCH_OPTIONS` are passed
#   to `::FuzzyMatch.new`
# @return [::FuzzyMatch] the (possibly memoized) matcher
- def fuzzy_match(haystack = nil, **options)
- return @fuzzy_match if instance_variable_defined?(:@fuzzy_match)
- @fuzzy_options = options.merge({
- stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
- })
+ def fuzzy_match(haystack_data = nil, **options)
+ if instance_variable_defined?(:@fuzzy_match) && !haystack_data
+ return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
+ end
+ @fuzzy_options = options
# use the native C extension engine for better performance (~130 % increase)
::FuzzyMatch.engine = :amatch
- haystack = obtain_haystack(haystack).tap do |items|
- if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
- raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
- end
- end
- @fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
+ @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
end
# @note
# - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
# @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
@@ -69,40 +75,76 @@
results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results|
item, dice, lev = fuzzy_results
unless item == needle
needle_str = item_string(needle)
item_str = item_string(item)
- jaro_res = self.class.jaro_winkler(needle_str, item_str)
- ngram_res = self.class.ngrams_score(needle_str, item_str, range: 3..5).ratio
- wngram_res = self.class.words_ngrams_score(needle_str, item_str, range: 3..7).ratio
- pos_res = self.class.chars_position_score(needle_str, item_str).ratio
+ jaro_res = jaro(needle_str, item_str)
+ ngram_res = ngram(needle_str, item_str)
+ wngram_res = words_ngram(needle_str, item_str)
+ pos_res = position(needle_str, item_str)
+
results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
end
end
- Results.new(needle, item_string(needle), results)
+ Results.new(needle, item_string(needle), results).tap do |res|
+ res.order = fuzzy_options[:order] if fuzzy_options[:order]
+ res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
+ end
end
private
# Jaro-Winkler score of two strings, computed by the class-level `jaro_winkler`.
# Only the `JARO_OPTIONS` keys present in `fuzzy_options` are forwarded.
# @return the value returned by `self.class.jaro_winkler`
+ def jaro(str1, str2)
+ options = fuzzy_options.slice(*JARO_OPTIONS)
+ self.class.jaro_winkler(str1, str2, **options)
+ end
+
# N-grams score ratio of two strings, computed by the class-level `ngrams_score`.
# Uses `range: 3..5` unless `fuzzy_options` provides a `:range`.
# @return the `ratio` of the score object returned by `self.class.ngrams_score`
+ def ngram(str1, str2)
+ options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
+ self.class.ngrams_score(str1, str2, **options).ratio
+ end
+
# Words n-grams score ratio of two strings, computed by the class-level `words_ngrams_score`.
# Uses `range: 3..7` unless `fuzzy_options` provides a `:range`.
# @return the `ratio` of the score object returned by `self.class.words_ngrams_score`
+ def words_ngram(str1, str2)
+ options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
+ self.class.words_ngrams_score(str1, str2, **options).ratio
+ end
+
# Characters position score ratio of two strings, computed by the class-level
# `chars_position_score`. Only the `POSITION_OPTIONS` keys (`:max_distance`) present in
# `fuzzy_options` are forwarded.
# @return the `ratio` of the score object returned by `self.class.chars_position_score`
+ def position(str1, str2)
+ options = fuzzy_options.slice(*POSITION_OPTIONS)
+ self.class.chars_position_score(str1, str2, **options).ratio
+ end
+
# Prepares the list of elements to search in (renamed from `obtain_haystack` to `haystack`).
# @note
# - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
# - New version: also checks that every element is a `String` unless a `read:` option is set
#   (this check used to live in `fuzzy_match`)
# @param data [Enumerable, nil] when `nil` and `self` is an `Enumerable`, `self` is used
# @return [Array<Object>] the flattened, non-repeated, non-`nil` values
# @raise [RuntimeError] if `data` is not an `Enumerable`, or (new version) if a non `String`
#   element is found while no `read:` option is available
- def obtain_haystack(data = nil)
+ def haystack(data = nil)
data = self if self.is_a?(Enumerable) && !data
raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)
# NOTE(review): the next line rebuilds `data` from `self` (`self.values` / `to_a`), so an
# explicitly passed `data` is only type-checked and then discarded - confirm this is intended.
data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten
- data.uniq.compact
+ data.uniq.compact.tap do |items|
+ if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
+ raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
+ end
+ end
end
# Obtains the string of `item` that is handed to the scoring helpers.
# - Returns `item` as is when it is `nil`/`false`, already a `String`, or when `attr` is not set.
# - New version: when `attr` is a `Proc`, returns the result of calling it with `item`.
# - Otherwise sends `attr` (converted to a Symbol) to `item` if it responds to it;
#   implicitly returns `nil` when it does not.
# @param item [String, Object, nil]
# @param attr [Symbol, String, Proc, nil] defaults to `fuzzy_read_method`
def item_string(item, attr = fuzzy_read_method)
return item if !item || item.is_a?(String) || !attr
+ return attr.call(item) if attr.is_a?(Proc)
attr = attr.to_sym
return item.send(attr) if item.respond_to?(attr)
end
# Filters `options` (or `fuzzy_options` when none are given) down to the keys listed in
# `FUZZY_MATCH_OPTIONS` and sets `:stop_words` to `PREPOSITIONS + PRONOUNS + ARTICLES`.
# NOTE(review): the default `:stop_words` is merged last, so it replaces any `:stop_words`
# supplied by the caller - confirm this is intended.
# @param options [Hash, nil]
# @return [Hash] the options for `::FuzzyMatch.new`
+ def fuzzy_match_options(options = nil)
+ options = fuzzy_options unless options
+ options.slice(*FUZZY_MATCH_OPTIONS).merge({
+ stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
+ })
+ end
+
# Value of the `:read` option: read from `fuzzy_options` in the old version and from the
# filtered `fuzzy_match_options` in the new one. `item_string` uses it as its default `attr`.
def fuzzy_read_method
- fuzzy_options[:read]
+ fuzzy_match_options[:read]
end
end
class << self