lib/eco/data/fuzzy_match.rb in eco-helpers-2.0.19 vs lib/eco/data/fuzzy_match.rb in eco-helpers-2.0.21
- old
+ new
@@ -26,10 +26,11 @@
include Pairing
include CharsPositionScore
include NGramsScore
def jaro_winkler(str1, str2, **options)
+ return 0 if !str1 || !str2
options = {
ignore_case: true,
weight: 0.25
}.merge(options)
JaroWinkler.distance(str1, str2, **options)
@@ -65,31 +66,70 @@
# make it run with a native C extension (for better performance: ~130 % increase of performance)
::FuzzyMatch.engine = :amatch
@fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
end
+ # TODO: integrate options[:unique_words] => ensure that repeated words do not bring the score down and cause results to be cut by the threshold
# @note
# - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
- # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
+ # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
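+ # @param needle_str [String, nil] the string to search and score with instead of `needle`; when `nil`, `needle` itself is searched and `item_string(needle)` is used for scoring.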
+ # @param haystack [Enumerable] the items to find `needle` among.
# @return [Eco::Data::FuzzyMatch::Results]
- def find_all_with_score(needle, **options)
- results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results|
+ def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
+ base_match = fuzzy_match(haystack, **options)
+ match_results = base_match.find_all_with_score(needle_str || needle) # an explicit needle_str takes precedence over needle for the search
+ needle_str ||= item_string(needle) # no explicit needle_str: derive the scoring string from needle
+ results = match_results.each_with_object([]) do |fuzzy_results, results| # NOTE(review): the block param `results` shadows the outer local being assigned
item, dice, lev = fuzzy_results
unless item == needle # the needle itself is left out of the results
- needle_str = item_string(needle)
- item_str = item_string(item)
- jaro_res = jaro(needle_str, item_str)
- ngram_res = ngram(needle_str, item_str)
- wngram_res = words_ngram(needle_str, item_str)
- pos_res = position(needle_str, item_str)
+ item_str = item_string(item)
- results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
+ if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty? # a blank string on either side: every score is forced to 0
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0 # NOTE(review): `ngram_res` appears twice in the chain; redundant but harmless
+ end
+
+ jaro_res ||= jaro(needle_str, item_str) # `||=` keeps a forced 0 (0 is truthy in Ruby); computed only when still nil
+ ngram_res ||= ngram(needle_str, item_str)
+ wngram_res ||= words_ngram(needle_str, item_str)
+ pos_res ||= position(needle_str, item_str)
+
+ results << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res) # Result now also receives needle_str
end
end
- Results.new(needle, item_string(needle), results).tap do |res|
+ Results.new(needle, needle_str, results).tap do |res|
res.order = fuzzy_options[:order] if fuzzy_options[:order]
res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
+ end.relevant_results # NOTE(review): now returns whatever `relevant_results` yields, not the Results built above — confirm it still matches the documented @return
+ end
+
+ # Re-scores each entry of `results` using the pair of strings returned by the given block.
+ # @param results [Eco::Data::FuzzyMatch::Results] results to re-score — NOTE(review): type assumed from the calls to `needle`, `value` and `each_with_object`; confirm.
+ # @param needle_str [String, nil] when given, it is passed to the block instead of `results.value`.
+ # @param options [Hash] `:order` and `:threshold`, when present, are set on the rebuilt results.
+ # @yield [needle_str, item_str, needle, item] called once per result.
+ # @yieldreturn [Array] the needle string and the item string to score against each other.
+ # @raise [RuntimeError] when no block is given.
+ # @return the outcome of `relevant_results` on the rebuilt `Results`.
+ def recalculate_results(results, needle_str: nil, **options)
+ raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?
+ new_results = results.each_with_object([]) do |result, new_results| # NOTE(review): the block param `new_results` shadows the outer local being assigned
+ nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)
+
+ if istr.to_s.strip.empty? # blank item string: every score is forced to 1 — NOTE(review): confirm a blank item is meant to count as a full match
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 1 # NOTE(review): `ngram_res` appears twice in the chain; redundant but harmless
+ elsif nstr.to_s.strip.empty?
+ unless istr = needle_str # NOTE(review): this is an assignment, not `==`; it overwrites istr with needle_str and zeroes the scores only when needle_str is nil — confirm intent
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0
+ end
+ end
+
+ res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev # the score object is built only when dice/lev were not forced above
+ dice ||= res&.dices_coefficient_similar || 0
+ lev ||= res&.levenshtein_similar || 0
+ jaro_res ||= jaro(nstr, istr) # `||=` keeps forced values (0 and 1 are both truthy in Ruby)
+ ngram_res ||= ngram(nstr, istr)
+ wngram_res ||= words_ngram(nstr, istr)
+ pos_res ||= position(nstr, istr)
+
+ new_results << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res) # match, value and needle_str are carried over from the old result; only the scores are replaced
+ end
+ Results.new(results.needle, results.value, new_results).tap do |res| # the rebuilt Results keeps results.value, even when needle_str was given
+ res.order = options[:order] if options[:order] # read from the call's own options, not from fuzzy_options
+ res.threshold = options[:threshold] if options[:threshold]
+ end.relevant_results
+ end
private
def jaro(str1, str2)