require 'fuzzy_match'
require 'amatch'
require 'jaro_winkler'
require_relative 'fuzzy_match/stop_words'
require_relative 'fuzzy_match/array_helpers'
require_relative 'fuzzy_match/string_helpers'
require_relative 'fuzzy_match/pairing'
require_relative 'fuzzy_match/chars_position_score'
require_relative 'fuzzy_match/ngrams_score'

module Eco
  module Data
    # Mixin that adds fuzzy string matching on top of the `fuzzy_match` gem,
    # complemented with Jaro-Winkler, n-grams and character-position scores.
    #
    # Including it adds `InstanceMethods` to the host and extends the host
    # with `ClassMethods`. The scoring helpers of `ClassMethods` are also
    # available directly on `Eco::Data::FuzzyMatch`.
    module FuzzyMatch
      class << self
        # Hook run on `include`: wires instance and class level helpers.
        # @param base [Module] the class/module including `FuzzyMatch`.
        def included(base)
          base.include(InstanceMethods)
          base.extend(ClassMethods)
        end
      end

      # Class-level scoring helpers.
      module ClassMethods
        include ArrayHelpers
        include StringHelpers
        include Pairing
        include CharsPositionScore
        include NGramsScore

        # Delegates to `JaroWinkler.distance` with this module's defaults.
        # @param str1 [String]
        # @param str2 [String]
        # @param options [Hash] overrides for the defaults
        #   (`ignore_case: true`, `weight: 0.25`).
        # @return the value returned by `JaroWinkler.distance`.
        def jaro_winkler(str1, str2, **options)
          defaults = { ignore_case: true, weight: 0.25 }
          JaroWinkler.distance(str1, str2, **defaults.merge(options))
        end
      end

      # Instance-level API: builds the matcher and scores its results.
      module InstanceMethods
        # Option keys forwarded to `::FuzzyMatch.new`.
        FUZZY_MATCH_OPTIONS = %i[
          identities groupings stop_words read
          must_match_grouping must_match_at_least_one_word
          gather_last_result threshold
        ].freeze
        # Option keys forwarded to `jaro_winkler`.
        JARO_OPTIONS     = %i[ignore_case weight].freeze
        # Option keys forwarded to the n-grams scores.
        NGRAMS_OPTIONS   = %i[range].freeze
        # Option keys forwarded to the chars position score.
        POSITION_OPTIONS = %i[max_distance].freeze
        RESULTS_OPTIONS  = %i[order threshold].freeze

        include StopWords

        # Only the writer is generated: the reader below lazily defaults to `{}`
        # (declaring `attr_accessor` and then redefining the reader triggers a
        # "method redefined" warning).
        attr_writer :fuzzy_options

        # @return [Hash] the options of the last matcher build (`{}` if none).
        def fuzzy_options
          @fuzzy_options ||= {}
        end

        # Builds the underlying `::FuzzyMatch` matcher, or reuses the memoized
        # one when no `haystack_data` is given and the matcher options derived
        # from `options` equal the current ones.
        # @param haystack_data [Enumerable, nil] the items to search in.
        #   When `nil`, the receiver itself is used (it must be `Enumerable`).
        # @param options [Hash] stored as `fuzzy_options`
        #   (see `FUZZY_MATCH_OPTIONS` for the keys passed to the matcher).
        # @return [::FuzzyMatch]
        def fuzzy_match(haystack_data = nil, **options)
          if instance_variable_defined?(:@fuzzy_match) && !haystack_data
            return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
          end
          @fuzzy_options = options

          # make it run with a native C extension (for better performance: ~130 % increase of performance)
          ::FuzzyMatch.engine = :amatch
          @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
        end

        # @note
        #   - Any matched item that is `==` to `needle` is excluded from the results
        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
        # @param options [Hash] see `#fuzzy_match`.
        # @return [Eco::Data::FuzzyMatch::Results]
        def find_all_with_score(needle, **options)
          matcher = fuzzy_match(**options)
          # Resolved after `fuzzy_match` so the `read:` option in force is the one just set.
          needle_str = item_string(needle)

          scored = matcher.find_all_with_score(needle).each_with_object([]) do |fuzzy_result, acc|
            item, dice, lev = fuzzy_result
            next if item == needle

            item_str = item_string(item)
            acc << Result.new(
              item, item_str, dice, lev,
              jaro(needle_str, item_str),
              ngram(needle_str, item_str),
              words_ngram(needle_str, item_str),
              position(needle_str, item_str)
            )
          end

          Results.new(needle, needle_str, scored).tap do |res|
            res.order     = fuzzy_options[:order]     if fuzzy_options[:order]
            res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
          end
        end

        private

        # Jaro-Winkler score using the `JARO_OPTIONS` found in `fuzzy_options`.
        def jaro(str1, str2)
          options = fuzzy_options.slice(*JARO_OPTIONS)
          self.class.jaro_winkler(str1, str2, **options)
        end

        # N-grams score ratio (default `range: 3..5`).
        def ngram(str1, str2)
          options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
          self.class.ngrams_score(str1, str2, **options).ratio
        end

        # Words n-grams score ratio (default `range: 3..7`).
        def words_ngram(str1, str2)
          options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
          self.class.words_ngrams_score(str1, str2, **options).ratio
        end

        # Chars position score ratio using the `POSITION_OPTIONS` found in `fuzzy_options`.
        def position(str1, str2)
          options = fuzzy_options.slice(*POSITION_OPTIONS)
          self.class.chars_position_score(str1, str2, **options).ratio
        end

        # @note
        #   - It uses `to_a` on `data`, or `values` if it's a `Hash`
        #   - When `data` is `nil` and the receiver is `Enumerable`, the receiver is used
        # @param data [Enumerable, nil]
        # @raise [RuntimeError] when there is no `Enumerable` to read from, or when
        #   there are non `String` items and no `read:` option has been provided.
        # @return [Array] the non-repeated values of `data`
        def haystack(data = nil)
          data = self if is_a?(Enumerable) && !data
          raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)

          # Normalize the validated `data` itself (not `self`), so an explicitly
          # given haystack is honoured even when the receiver is not Enumerable.
          data = data.is_a?(Hash) ? data.values.flatten : data.to_a.flatten
          data.uniq.compact.tap do |items|
            next if fuzzy_read_method

            found = items.find { |item| !item.is_a?(String) }
            next unless found

            raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. " \
                  "Given element: #{found.class}"
          end
        end

        # Resolves the string that represents `item`.
        # @param item [String, Object, nil]
        # @param attr [Proc, Symbol, String, nil] how to read the string out of `item`
        #   (defaults to the `read:` option).
        # @return [String, Object, nil] `item` itself when it is falsey, a `String`,
        #   or there is no `attr`; the result of calling/sending `attr` otherwise;
        #   `nil` when `item` does not respond to `attr`.
        def item_string(item, attr = fuzzy_read_method)
          return item if !item || item.is_a?(String) || !attr
          return attr.call(item) if attr.is_a?(Proc)

          attr = attr.to_sym
          item.send(attr) if item.respond_to?(attr)
        end

        # @param options [Hash, nil] defaults to `fuzzy_options`.
        # @return [Hash] the subset of `options` accepted by `::FuzzyMatch.new`,
        #   with `stop_words:` set to the built-in lists.
        def fuzzy_match_options(options = nil)
          options ||= fuzzy_options
          # NOTE(review): this merge order always overrides a caller-supplied
          # `stop_words:` with the built-in lists -- confirm this is intended.
          options.slice(*FUZZY_MATCH_OPTIONS).merge(
            stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
          )
        end

        # @return [Proc, Symbol, String, nil] the `read:` option currently in force.
        def fuzzy_read_method
          fuzzy_match_options[:read]
        end
      end

      # Expose the class-level scoring helpers on `Eco::Data::FuzzyMatch` itself.
      class << self
        include FuzzyMatch::ClassMethods
      end
    end
  end
end

require_relative 'fuzzy_match/score'
require_relative 'fuzzy_match/result'
require_relative 'fuzzy_match/results'