module Eco
  module Data
    module FuzzyMatch
      # N-gram based scoring helpers.
      #
      # The methods here rely on `normalize_string`, `paired_words`, `word_ngrams`
      # and the `Score` class, none of which are defined in this module; they are
      # expected to be available in the context this module is mixed into.
      module NGramsScore
        # Scores `str2` against `str1` word by word. It does the following:
        #   1. It splits both strings into words
        #   2. Pairs all words by best `ngrams_score` match
        #   3. Gives `0` score to those words of `str2` that lost their pair
        #      (a word of `str1` cannot be paired twice)
        #   4. Merges the `ngrams_score` of all the paired words of `str2` against
        #      their `str1` word pair
        #
        # Steps 1 to 3 are delegated to `paired_words`; this method supplies the
        # per-pair scoring block and merges the resulting scores.
        #
        # @param str1 [String, nil] the reference string.
        # @param str2 [String, nil] the string to be scored against `str1`.
        # @param range [Integer, Range] determines the length of the generated
        #   ngrams for each `word`.
        # @param normalized [Boolean] pass `true` when both strings are already
        #   normalized, to avoid normalizing twice.
        # @return [Score] the score object with the result. It is left untouched
        #   (created as `Score.new(0, 0)`) when either string is missing or is
        #   shorter than 2 characters.
        def words_ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized

          Score.new(0, 0).tap do |score|
            next unless str1 && str2
            # NOTE(review): the score is created with a total of 0, so this increase
            # adds 0 for identical strings -- confirm this is the intended result.
            next score.increase(score.total) if str1 == str2
            # Both strings need at least 2 characters to be worth pairing.
            next if str1.length < 2 || str2.length < 2

            pairs = paired_words(str1, str2, normalized: true) do |needle, item|
              ngrams_score(needle, item, range: range, normalized: true)
            end

            pairs.each do |_sub_str1, (_item, iscore)|
              score.merge!(iscore)
            end
          end
        end

        # A score is kept of matching ngram combinations of `str2`.
        #
        # The ngrams of `str2` are generated via `word_ngrams`, and each one that is
        # included in `str1` adds its weight to the score. The maximum score (total)
        # is the length of `str1`.
        #
        #   * When `range` is an `Integer`, every ngram has the same weight
        #     (`total / number of ngrams`).
        #   * Otherwise the ngrams are grouped by length: each group gets an equal
        #     share of the total (rounded to 3 decimals), which is evenly split
        #     among the ngrams of that group.
        #
        # @note This algorithm is best suited for matching sentences, or
        #   'firstname lastname' compared with 'lastname firstname' combinations.
        # @param str1 [String, nil] the reference string.
        # @param str2 [String, nil] the string the ngrams are generated from.
        # @param range [Integer, Range] determines the length of the generated ngrams.
        # @param normalized [Boolean] pass `true` when both strings are already
        #   normalized, to avoid normalizing twice.
        # @return [Score] the score object with the result. The score stays at `0`
        #   when either string is missing or shorter than 2 characters, or when no
        #   ngrams could be generated; identical strings get the full score.
        def ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          len1 = str1 ? str1.length : 0

          Score.new(0, len1).tap do |score|
            next unless str1 && str2
            next score.increase(score.total) if str1 == str2
            next if str1.length < 2 || str2.length < 2

            grams = word_ngrams(str2, range, normalized: true)
            next if grams.empty?

            if range.is_a?(Integer)
              # Single ngram length: all the ngrams weigh the same.
              item_weight = score.total.to_f / grams.length
              matches = grams.count { |gram| str1.include?(gram) }
              score.increase(matches * item_weight)
            else
              # Several ngram lengths: split the total evenly between lengths first,
              # so the more numerous short ngrams do not dominate the result.
              groups = grams.group_by(&:length)
              group_weight = (1.0 / groups.length).round(3)

              groups.each_value do |len_grams|
                len_max_score = score.total * group_weight
                item_weight = len_max_score / len_grams.length
                matches = len_grams.count { |gram| str1.include?(gram) }
                score.increase(matches * item_weight)
              end
            end
          end
        end
      end
    end
  end
end