require 'fuzzy_match'
require 'amatch'
require 'jaro_winkler'
require_relative 'fuzzy_match/stop_words'
require_relative 'fuzzy_match/array_helpers'
require_relative 'fuzzy_match/string_helpers'
require_relative 'fuzzy_match/pairing'
require_relative 'fuzzy_match/chars_position_score'
require_relative 'fuzzy_match/ngrams_score'

module Eco
  module Data
    # Mixin that adds fuzzy-matching helpers to the including class.
    #
    # Including this module mixes `InstanceMethods` into the instances of the
    # base class and extends the base class itself with `ClassMethods`.
    # The `ClassMethods` are also available directly on `Eco::Data::FuzzyMatch`.
    module FuzzyMatch
      class << self
        # Hook invoked by Ruby when this module is included.
        # @param base [Module] the class or module that is including `FuzzyMatch`.
        def included(base)
          base.send(:include, InstanceMethods)
          base.extend(ClassMethods)
        end
      end

      # Scoring helpers exposed at class level.
      module ClassMethods
        include ArrayHelpers
        include StringHelpers
        include Pairing
        include CharsPositionScore
        include NGramsScore

        # Compares two strings by delegating to `JaroWinkler.distance`,
        # ignoring case and using a weight of `0.25`.
        # @param str1 [String]
        # @param str2 [String]
        # @return [Float] whatever `JaroWinkler.distance` returns for the pair.
        def jaro_winkler(str1, str2)
          JaroWinkler.distance(str1, str2, ignore_case: true, weight: 0.25)
        end
      end

      # Helpers available to the instances of the including class.
      module InstanceMethods
        include StopWords

        # A writer only: the reader is defined explicitly right below
        # (declaring `attr_accessor` and then redefining the reader warns).
        attr_writer :fuzzy_options

        # @return [Hash] the options in use for the matcher (empty `Hash` until set).
        def fuzzy_options
          @fuzzy_options ||= {}
        end

        # Builds the `::FuzzyMatch` matcher, or returns the one built before.
        # @note
        #   - The matcher is memoized: once it has been built, later calls return it as is,
        #     and the `haystack` and `options` received by those calls are not used.
        #   - The `:stop_words` option is always set to `PREPOSITIONS + PRONOUNS + ARTICLES`.
        # @param haystack [Enumerable, nil] the elements to search in. When `nil`,
        #   the object itself is used, provided that it is an `Enumerable`.
        # @param options [Hash] options handed over to `::FuzzyMatch.new`;
        #   `read:` is required when the `haystack` holds **non** `String` objects.
        # @raise [RuntimeError] when the haystack is not an `Enumerable`, or when it
        #   holds a **non** `String` element and no `read:` option was provided.
        # @return [::FuzzyMatch] the matcher.
        def fuzzy_match(haystack = nil, **options)
          return @fuzzy_match if instance_variable_defined?(:@fuzzy_match)

          @fuzzy_options = options.merge(stop_words: PREPOSITIONS + PRONOUNS + ARTICLES)
          # make it run with a native C extension (for better performance: ~130 % increase of performance)
          ::FuzzyMatch.engine = :amatch

          haystack = obtain_haystack(haystack)
          unless fuzzy_read_method
            found = haystack.find { |item| !item.is_a?(String) }
            if found
              raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. \n" \
                    "Given element: #{found.class}"
            end
          end

          @fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
        end

        # Scores the `needle` against the candidates offered by the matcher.
        # @note
        #   - Any candidate that is `==` to the `needle` is left out of the results.
        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
        # @param options [Hash] forwarded to `#fuzzy_match`.
        # @return [Eco::Data::FuzzyMatch::Results]
        def find_all_with_score(needle, **options)
          candidates = fuzzy_match(**options).find_all_with_score(needle)
          # Read only after `fuzzy_match`, which is what sets the `read:` option.
          needle_str = item_string(needle)

          scored = candidates.each_with_object([]) do |(item, dice, lev), memo|
            next if item == needle

            item_str   = item_string(item)
            jaro_res   = self.class.jaro_winkler(needle_str, item_str)
            ngram_res  = self.class.ngrams_score(needle_str, item_str, range: 3..5).ratio
            wngram_res = self.class.words_ngrams_score(needle_str, item_str, range: 3..7).ratio
            pos_res    = self.class.chars_position_score(needle_str, item_str).ratio

            memo << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
          end

          Results.new(needle, needle_str, scored)
        end

        private

        # @note
        #   - It uses `values` when `data` is a `Hash`, and `to_a` otherwise.
        #   - When `data` is `nil` it falls back to the object itself, if it is an `Enumerable`.
        # @param data [Enumerable, nil]
        # @raise [RuntimeError] when there is no `Enumerable` to work with.
        # @return [Array] the flattened, non-repeated and non-`nil` values of `data`
        def obtain_haystack(data = nil)
          data = self if !data && is_a?(Enumerable)
          raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)

          # Build from `data` (not from `self`), so an explicit haystack is honoured.
          values = data.is_a?(Hash) ? data.values : data.to_a
          values.flatten.uniq.compact
        end

        # Obtains the string that represents `item` in the comparisons.
        # @param item [String, Object, nil]
        # @param attr [String, Symbol, nil] the name of the method to read the string from.
        # @return [String, Object, nil] `item` itself when it is falsey, a `String`, or
        #   there is no `attr`; otherwise the result of calling `attr` on `item`,
        #   or `nil` when `item` does not respond to it.
        def item_string(item, attr = fuzzy_read_method)
          return item if !item || item.is_a?(String) || !attr

          attr = attr.to_sym
          item.send(attr) if item.respond_to?(attr)
        end

        # @return [Object, nil] the value of the `:read` key of `fuzzy_options`.
        def fuzzy_read_method
          fuzzy_options[:read]
        end
      end

      class << self
        include FuzzyMatch::ClassMethods
      end
    end
  end
end

require_relative 'fuzzy_match/score'
require_relative 'fuzzy_match/result'
require_relative 'fuzzy_match/results'