lib/eco/data/fuzzy_match.rb in eco-helpers-2.0.18 vs lib/eco/data/fuzzy_match.rb in eco-helpers-2.0.19

- old
+ new

@@ -25,42 +25,48 @@ include StringHelpers include Pairing include CharsPositionScore include NGramsScore - def jaro_winkler(str1, str2) + def jaro_winkler(str1, str2, **options) options = { ignore_case: true, weight: 0.25 - } + }.merge(options) JaroWinkler.distance(str1, str2, **options) end end module InstanceMethods + FUZZY_MATCH_OPTIONS = [ + :identities, :groupings, :stop_words, :read, + :must_match_grouping, :must_match_at_least_one_word, + :gather_last_result, :threshold + ] + + JARO_OPTIONS = [:ignore_case, :weight] + NGRAMS_OPTIONS = [:range] + POSITION_OPTIONS = [:max_distance] + RESULTS_OPTIONS = [:order, :threshold] + include StopWords attr_accessor :fuzzy_options def fuzzy_options @fuzzy_options ||= {} end - def fuzzy_match(haystack = nil, **options) - return @fuzzy_match if instance_variable_defined?(:@fuzzy_match) - @fuzzy_options = options.merge({ - stop_words: PREPOSITIONS + PRONOUNS + ARTICLES - }) + def fuzzy_match(haystack_data = nil, **options) + if instance_variable_defined?(:@fuzzy_match) && !haystack_data + return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options) + end + @fuzzy_options = options # make it run with a native C extension (for better performance: ~130 % increase of performance) ::FuzzyMatch.engine = :amatch - haystack = obtain_haystack(haystack).tap do |items| - if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)} - raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}" - end - end - @fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options) + @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options) end # @note # - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key @@ -69,40 +75,76 @@ results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results| item, dice, lev = fuzzy_results unless item == needle needle_str = item_string(needle) item_str = item_string(item) - jaro_res = self.class.jaro_winkler(needle_str, item_str) - ngram_res = self.class.ngrams_score(needle_str, item_str, range: 3..5).ratio - wngram_res = self.class.words_ngrams_score(needle_str, item_str, range: 3..7).ratio - pos_res = self.class.chars_position_score(needle_str, item_str).ratio + jaro_res = jaro(needle_str, item_str) + ngram_res = ngram(needle_str, item_str) + wngram_res = words_ngram(needle_str, item_str) + pos_res = position(needle_str, item_str) + results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res) end end - Results.new(needle, item_string(needle), results) + Results.new(needle, item_string(needle), results).tap do |res| + res.order = fuzzy_options[:order] if fuzzy_options[:order] + res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold] + end end private + def jaro(str1, str2) + options = fuzzy_options.slice(*JARO_OPTIONS) + self.class.jaro_winkler(str1, str2, **options) + end + + def ngram(str1, str2) + options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS)) + self.class.ngrams_score(str1, str2, **options).ratio + end + + def words_ngram(str1, str2) + options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS)) + self.class.words_ngrams_score(str1, str2, **options).ratio + end + + def position(str1, str2) + options = fuzzy_options.slice(*POSITION_OPTIONS) + self.class.chars_position_score(str1, str2, **options).ratio + end + # @note # - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash` # @param data [Enumerable, nil] # @return [Array<Object>] the non-repeated values of `data` - def obtain_haystack(data = nil) + def haystack(data = nil) data = self if self.is_a?(Enumerable) && !data raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable) data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten - data.uniq.compact + data.uniq.compact.tap do |items| + if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)} + raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}" + end + end end def item_string(item, attr = fuzzy_read_method) return item if !item || item.is_a?(String) || !attr + return attr.call(item) if attr.is_a?(Proc) attr = attr.to_sym return item.send(attr) if item.respond_to?(attr) end + def fuzzy_match_options(options = nil) + options = fuzzy_options unless options + options.slice(*FUZZY_MATCH_OPTIONS).merge({ + stop_words: PREPOSITIONS + PRONOUNS + ARTICLES + }) + end + def fuzzy_read_method - fuzzy_options[:read] + fuzzy_match_options[:read] end end class << self