lib/eco/data/fuzzy_match.rb in eco-helpers-2.0.19 vs lib/eco/data/fuzzy_match.rb in eco-helpers-2.0.21

- old
+ new

@@ -26,10 +26,11 @@ include Pairing include CharsPositionScore include NGramsScore def jaro_winkler(str1, str2, **options) + return 0 if !str1 || !str2 options = { ignore_case: true, weight: 0.25 }.merge(options) JaroWinkler.distance(str1, str2, **options) @@ -65,31 +66,70 @@ # make it run with a native C extension (for better performance: ~130 % increase of performance) ::FuzzyMatch.engine = :amatch @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options) end + # TODO: integration for options[:unique_words] => to ensure repeated words do not bring down the score are cut by threshold # @note # - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results - # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key + # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key. + # @param needle_str [String, nil] the actual value of needle_str to be used. + # @param haystack [Enumerable] the items to find `needle` among. # @return [Eco::Data::FuzzyMatch::Results] - def find_all_with_score(needle, **options) - results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results| + def find_all_with_score(needle, needle_str: nil, haystack: nil, **options) + base_match = fuzzy_match(haystack, **options) + match_results = base_match.find_all_with_score(needle_str || needle) + needle_str ||= item_string(needle) + results = match_results.each_with_object([]) do |fuzzy_results, results| item, dice, lev = fuzzy_results unless item == needle - needle_str = item_string(needle) - item_str = item_string(item) - jaro_res = jaro(needle_str, item_str) - ngram_res = ngram(needle_str, item_str) - wngram_res = words_ngram(needle_str, item_str) - pos_res = position(needle_str, item_str) + item_str = item_string(item) - results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res) + if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty? + dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0 + end + + jaro_res ||= jaro(needle_str, item_str) + ngram_res ||= ngram(needle_str, item_str) + wngram_res ||= words_ngram(needle_str, item_str) + pos_res ||= position(needle_str, item_str) + + results << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res) end end - Results.new(needle, item_string(needle), results).tap do |res| + Results.new(needle, needle_str, results).tap do |res| res.order = fuzzy_options[:order] if fuzzy_options[:order] res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold] + end.relevant_results + end + + def recalculate_results(results, needle_str: nil, **options) + raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given? + new_results = results.each_with_object([]) do |result, new_results| + nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match) + + if istr.to_s.strip.empty? + dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 1 + elsif nstr.to_s.strip.empty? + unless istr = needle_str + dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0 + end + end + + res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev + dice ||= res&.dices_coefficient_similar || 0 + lev ||= res&.levenshtein_similar || 0 + jaro_res ||= jaro(nstr, istr) + ngram_res ||= ngram(nstr, istr) + wngram_res ||= words_ngram(nstr, istr) + pos_res ||= position(nstr, istr) + + new_results << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res) end + Results.new(results.needle, results.value, new_results).tap do |res| + res.order = options[:order] if options[:order] + res.threshold = options[:threshold] if options[:threshold] + end.relevant_results end private def jaro(str1, str2)