require_relative 'fuzzy_match/stop_words'
require_relative 'fuzzy_match/array_helpers'
require_relative 'fuzzy_match/string_helpers'
require_relative 'fuzzy_match/pairing'
require_relative 'fuzzy_match/chars_position_score'
require_relative 'fuzzy_match/ngrams_score'

module Eco
  module Data
    # Mixin that adds fuzzy string matching to the including class.
    # Including it mixes `InstanceMethods` into instances and extends the
    # class with `ClassMethods`; the module itself also exposes `ClassMethods`.
    module FuzzyMatch
      class << self
        # Hook run on `include`: wires both the instance and the class level API.
        # @param base [Module] the class or module that includes `FuzzyMatch`.
        def included(base)
          base.send(:include, InstanceMethods)
          base.extend(ClassMethods)
        end
      end

      # Scoring helpers available on the including class (and on `FuzzyMatch` itself).
      module ClassMethods
        include ArrayHelpers
        include StringHelpers
        include Pairing
        include CharsPositionScore
        include NGramsScore

        # Delegates to `JaroWinkler.distance` with the defaults
        # `ignore_case: true` and `weight: 0.25`.
        # @param str1 [String, nil]
        # @param str2 [String, nil]
        # @param options [Hash] overrides forwarded to `JaroWinkler.distance`.
        # @return [Numeric] `0` when either string is missing, else the value
        #   returned by `JaroWinkler.distance`.
        def jaro_winkler(str1, str2, **options)
          return 0 unless str1 && str2

          options = { ignore_case: true, weight: 0.25 }.merge(options)
          require 'jaro_winkler' # loaded lazily, on first use
          JaroWinkler.distance(str1, str2, **options)
        end
      end

      # Instance level API: builds the matcher and scores its results.
      module InstanceMethods
        FUZZY_MATCH_OPTIONS = %i[
          identities groupings stop_words read
          must_match_grouping must_match_at_least_one_word
          gather_last_result threshold
        ].freeze

        JARO_OPTIONS     = %i[ignore_case weight].freeze
        NGRAMS_OPTIONS   = %i[range].freeze
        POSITION_OPTIONS = %i[max_distance].freeze
        RESULTS_OPTIONS  = %i[order threshold].freeze

        include StopWords

        attr_writer :fuzzy_options

        # @return [Hash] the options of the last `fuzzy_match` call (`{}` by default).
        def fuzzy_options
          @fuzzy_options ||= {}
        end

        # Builds the underlying `::FuzzyMatch` matcher, or reuses the memoized one
        # when no `haystack_data` is given and the matcher options did not change.
        # @param haystack_data [Enumerable, nil] items to search among; when `nil`
        #   the receiver itself is used (it must then be an `Enumerable`).
        # @param options [Hash] stored as `fuzzy_options`; the keys listed in
        #   `FUZZY_MATCH_OPTIONS` are forwarded to `::FuzzyMatch.new`.
        # @return [::FuzzyMatch]
        def fuzzy_match(haystack_data = nil, **options)
          reusable = instance_variable_defined?(:@fuzzy_match) && !haystack_data
          return @fuzzy_match if reusable && fuzzy_match_options == fuzzy_match_options(options)

          @fuzzy_options = options

          # make it run with a native C extension (for better performance: ~130 % increase of performance)
          require 'fuzzy_match'
          require 'amatch'
          ::FuzzyMatch.engine = :amatch

          @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
        end

        # TODO: integration for options[:unique_words] => to ensure that repeated words
        #   do not bring down the score and get cut by the threshold
        # @note
        #   - Items that are equal (`==`) to `needle` are excluded from the results.
        #   - When either compared string is blank, all the scores of that item are `0`.
        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
        # @param needle_str [String, nil] the actual value of needle_str to be used.
        # @param haystack [Enumerable] the items to find `needle` among.
        # @param options [Hash] forwarded to `fuzzy_match`.
        # @return [Eco::Data::FuzzyMatch::Results]
        def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
          base_match    = fuzzy_match(haystack, **options)
          match_results = base_match.find_all_with_score(needle_str || needle)
          needle_str  ||= item_string(needle)

          scored = match_results.each_with_object([]) do |fuzzy_result, acc|
            item, dice, lev = fuzzy_result
            next if item == needle

            item_str = item_string(item)
            if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0
            end

            # `0` is truthy in Ruby, so the zeroed scores above are preserved here
            jaro_res   ||= jaro(needle_str, item_str)
            ngram_res  ||= ngram(needle_str, item_str)
            wngram_res ||= words_ngram(needle_str, item_str)
            pos_res    ||= position(needle_str, item_str)

            acc << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
          end

          Results.new(needle, needle_str, scored).tap do |res|
            res.order     = fuzzy_options[:order]     if fuzzy_options[:order]
            res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
          end.relevant_results
        end

        # Re-scores already obtained `results` against the strings returned by the block.
        # @note when the block returns a blank item string, all the scores of that result are `1`.
        # @param results [Eco::Data::FuzzyMatch::Results] the results to re-score.
        # @param needle_str [String, nil] needle string to use instead of `results.value`.
        # @param options [Hash] only `:order` and `:threshold` are used, on the returned results.
        # @yield [needle_str, item_str, needle, item] the current strings and objects.
        # @yieldreturn [Array<String>] the pair `[needle_str, item_str]` to score.
        # @raise [RuntimeError] when no block is given.
        # @return [Eco::Data::FuzzyMatch::Results]
        def recalculate_results(results, needle_str: nil, **options)
          raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?

          # loop-invariant: only needed once for `::FuzzyMatch.score_class`
          require 'fuzzy_match'
          require 'amatch'

          rescored = results.each_with_object([]) do |result, acc|
            nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)

            if istr.to_s.strip.empty?
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 1
            elsif nstr.to_s.strip.empty?
              # NOTE(review): the original condition was `unless istr = needle_str`, which
              #   ASSIGNS rather than compares; that behaviour is preserved here.
              #   Confirm whether a comparison (`==`) was intended.
              istr = needle_str
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0 unless istr
            end

            res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
            dice       ||= res&.dices_coefficient_similar || 0
            lev        ||= res&.levenshtein_similar || 0
            jaro_res   ||= jaro(nstr, istr)
            ngram_res  ||= ngram(nstr, istr)
            wngram_res ||= words_ngram(nstr, istr)
            pos_res    ||= position(nstr, istr)

            acc << Result.new(
              *result.values_at(:match, :value, :needle_str),
              dice, lev, jaro_res, ngram_res, wngram_res, pos_res
            )
          end

          Results.new(results.needle, results.value, rescored).tap do |res|
            res.order     = options[:order]     if options[:order]
            res.threshold = options[:threshold] if options[:threshold]
          end.relevant_results
        end

        private

        # @return [Numeric] `jaro_winkler` result, using the `JARO_OPTIONS` of `fuzzy_options`.
        def jaro(str1, str2)
          options = fuzzy_options.slice(*JARO_OPTIONS)
          self.class.jaro_winkler(str1, str2, **options)
        end

        # @return the `ratio` of `ngrams_score` (default `range: 3..5`).
        def ngram(str1, str2)
          options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
          self.class.ngrams_score(str1, str2, **options).ratio
        end

        # @return the `ratio` of `words_ngrams_score` (default `range: 3..7`).
        def words_ngram(str1, str2)
          options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
          self.class.words_ngrams_score(str1, str2, **options).ratio
        end

        # @return the `ratio` of `chars_position_score`, using the `POSITION_OPTIONS` of `fuzzy_options`.
        def position(str1, str2)
          options = fuzzy_options.slice(*POSITION_OPTIONS)
          self.class.chars_position_score(str1, str2, **options).ratio
        end

        # @note
        #   - When `data` is not given and the receiver is an `Enumerable`, the receiver is used.
        #   - It uses `values` if `data` is a `Hash`, and `to_a` otherwise.
        # @param data [Enumerable, nil]
        # @raise [RuntimeError] when there is no `Enumerable` to read from, or when a
        #   non `String` element is found and no `read:` option was provided.
        # @return [Array] the non-repeated values of `data`
        def haystack(data = nil)
          data = self if is_a?(Enumerable) && !data
          raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)

          # Read from `data` (not from `self`), so an explicitly given haystack is honoured
          items = data.is_a?(Hash) ? data.values.flatten : data.to_a.flatten

          items.uniq.compact.tap do |list|
            if !fuzzy_read_method && (found = list.find { |item| !item.is_a?(String) })
              raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
            end
          end
        end

        # Resolves the string that represents `item`.
        # @param item [String, Object, nil]
        # @param attr [Proc, Symbol, String, nil] how to read the string out of `item`.
        # @return [String, Object, nil] `item` itself when it is `nil`, a `String` or there
        #   is no `attr`; otherwise what `attr` reads from it (`nil` if `item` does not respond to it).
        def item_string(item, attr = fuzzy_read_method)
          return item if !item || item.is_a?(String) || !attr
          return attr.call(item) if attr.is_a?(Proc)

          attr = attr.to_sym
          return item.send(attr) if item.respond_to?(attr)
        end

        # @param options [Hash, nil] defaults to `fuzzy_options`.
        # @return [Hash] the subset of `options` accepted by `::FuzzyMatch.new`.
        def fuzzy_match_options(options = nil)
          options ||= fuzzy_options
          # NOTE(review): the built-in stop words always replace a caller-supplied
          #   `:stop_words` value; confirm this is intended.
          options.slice(*FUZZY_MATCH_OPTIONS).merge(stop_words: PREPOSITIONS + PRONOUNS + ARTICLES)
        end

        # @return [Proc, Symbol, String, nil] the `read:` option currently in use.
        def fuzzy_read_method
          fuzzy_match_options[:read]
        end
      end

      class << self
        include FuzzyMatch::ClassMethods
      end
    end
  end
end

require_relative 'fuzzy_match/score'
require_relative 'fuzzy_match/result'
require_relative 'fuzzy_match/results'