lib/fuzzy_match.rb in fuzzy_match-1.3.1 vs lib/fuzzy_match.rb in fuzzy_match-1.3.2

- old
+ new

@@ -3,14 +3,15 @@ if ::ActiveSupport::VERSION::MAJOR >= 3 require 'active_support/core_ext' end require 'to_regexp' -require 'fuzzy_match/normalizer' -require 'fuzzy_match/stop_word' -require 'fuzzy_match/blocking' -require 'fuzzy_match/identity' +require 'fuzzy_match/rule' +require 'fuzzy_match/rule/normalizer' +require 'fuzzy_match/rule/stop_word' +require 'fuzzy_match/rule/grouping' +require 'fuzzy_match/rule/identity' require 'fuzzy_match/result' require 'fuzzy_match/wrapper' require 'fuzzy_match/similarity' require 'fuzzy_match/score' @@ -42,19 +43,19 @@ end DEFAULT_ENGINE = :pure_ruby DEFAULT_OPTIONS = { - :first_blocking_decides => false, - :must_match_blocking => false, + :first_grouping_decides => false, + :must_match_grouping => false, :must_match_at_least_one_word => false, :gather_last_result => false, :find_all => false } attr_reader :haystack - attr_reader :blockings + attr_reader :groupings attr_reader :identities attr_reader :normalizers attr_reader :stop_words attr_reader :read attr_reader :default_options @@ -62,50 +63,56 @@ # haystack - a bunch of records that will compete to see who best matches the needle # # Rules (can only be specified at initialization or by using a setter) # * :<tt>normalizers</tt> - regexps (see README) # * :<tt>identities</tt> - regexps - # * :<tt>blockings</tt> - regexps + # * :<tt>groupings</tt> - regexps # * :<tt>stop_words</tt> - regexps # # Options (can be specified at initialization or when calling #find) # * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol - # * :<tt>must_match_blocking</tt> - don't return a match unless the needle fits into one of the blockings you specified + # * :<tt>must_match_grouping</tt> - don't return a match unless the needle fits into one of the groupings you specified # * :<tt>must_match_at_least_one_word</tt> - don't return a match unless the needle shares at least one word with the match - # * :<tt>first_blocking_decides</tt> - force records into the first blocking they match, rather than choosing a blocking that will give them a higher score + # * :<tt>first_grouping_decides</tt> - force records into the first grouping they match, rather than choosing a grouping that will give them a higher score # * :<tt>gather_last_result</tt> - enable <tt>last_result</tt> def initialize(competitors, options_and_rules = {}) options_and_rules = options_and_rules.symbolize_keys # rules - self.blockings = options_and_rules.delete(:blockings) || [] + self.groupings = options_and_rules.delete(:groupings) || options_and_rules.delete(:blockings) || [] self.identities = options_and_rules.delete(:identities) || [] self.normalizers = options_and_rules.delete(:normalizers) || options_and_rules.delete(:tighteners) || [] self.stop_words = options_and_rules.delete(:stop_words) || [] @read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader) # options + if deprecated = options_and_rules.delete(:first_blocking_decides) + options_and_rules[:first_grouping_decides] = deprecated + end + if deprecated = options_and_rules.delete(:must_match_blocking) + options_and_rules[:must_match_grouping] = deprecated + end @default_options = options_and_rules.reverse_merge(DEFAULT_OPTIONS).freeze # do this last self.haystack = competitors end - def blockings=(ary) - @blockings = ary.map { |regexp_or_str| Blocking.new regexp_or_str } + def groupings=(ary) + @groupings = ary.map { |regexp_or_str| Rule::Grouping.new regexp_or_str } end def identities=(ary) - @identities = ary.map { |regexp_or_str| Identity.new regexp_or_str } + @identities = ary.map { |regexp_or_str| Rule::Identity.new regexp_or_str } end def normalizers=(ary) - @normalizers = ary.map { |regexp_or_str| Normalizer.new regexp_or_str } + @normalizers = ary.map { |regexp_or_str| Rule::Normalizer.new regexp_or_str } end def stop_words=(ary) - @stop_words = ary.map { |regexp_or_str| StopWord.new regexp_or_str } + @stop_words = ary.map { |regexp_or_str| Rule::StopWord.new regexp_or_str } end def haystack=(ary) @haystack = ary.map { |competitor| Wrapper.new self, competitor } end @@ -122,12 +129,12 @@ def find(needle, options = {}) options = options.symbolize_keys.reverse_merge default_options gather_last_result = options[:gather_last_result] is_find_all = options[:find_all] - first_blocking_decides = options[:first_blocking_decides] - must_match_blocking = options[:must_match_blocking] + first_grouping_decides = options[:first_grouping_decides] + must_match_grouping = options[:must_match_grouping] must_match_at_least_one_word = options[:must_match_at_least_one_word] if gather_last_result @last_result = Result.new last_result.read = read @@ -140,11 +147,11 @@ end if gather_last_result last_result.normalizers = normalizers last_result.identities = identities - last_result.blockings = blockings + last_result.groupings = groupings last_result.stop_words = stop_words end needle = Wrapper.new self, needle, true @@ -154,15 +161,15 @@ The needle's #{needle.variants.length} variants were enumerated. \tVariants: #{needle.variants.map(&:inspect).join(', ')} EOS end - if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle } + if must_match_grouping and groupings.any? and groupings.none? { |grouping| grouping.match? needle } if gather_last_result last_result.timeline << <<-EOS -The needle didn't match any of the #{blockings.length} blocking, which was a requirement. -\tBlockings (first 3): #{blockings[0,3].map(&:inspect).join(', ')} +The needle didn't match any of the #{groupings.length} grouping, which was a requirement. +\tGroupings (first 3): #{groupings[0,3].map(&:inspect).join(', ')} EOS end if is_find_all return [] @@ -185,35 +192,35 @@ end else passed_word_requirement = haystack end - if blockings.any? + if groupings.any? joint = passed_word_requirement.select do |straw| - if first_blocking_decides - blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw + if first_grouping_decides + groupings.detect { |grouping| grouping.match? needle }.try :join?, needle, straw else - blockings.any? { |blocking| blocking.join? needle, straw } + groupings.any? { |grouping| grouping.join? needle, straw } end end if gather_last_result last_result.timeline << <<-EOS -Since there were blockings, the competition was reduced to records in the same block as the needle. -\tBlockings (first 3): #{blockings[0,3].map(&:inspect).join(', ')} +Since there were groupings, the competition was reduced to records in the same group as the needle. +\tGroupings (first 3): #{groupings[0,3].map(&:inspect).join(', ')} \tPassed (first 3): #{joint[0,3].map(&:render).map(&:inspect).join(', ')} \tFailed (first 3): #{(passed_word_requirement-joint)[0,3].map(&:render).map(&:inspect).join(', ')} EOS end else joint = passed_word_requirement.dup end if joint.none? - if must_match_blocking + if must_match_grouping if gather_last_result last_result.timeline << <<-EOS -Since :must_match_at_least_one_word => true and none of the competition was in the same block as the needle, the search stopped. +Since :must_match_at_least_one_word => true and none of the competition was in the same group as the needle, the search stopped. EOS end if is_find_all return [] else @@ -254,23 +261,24 @@ if is_find_all return similarities.map { |similarity| similarity.wrapper2.record } end + best_similarity = similarities.first winner = nil - if best_similarity = similarities.first and best_similarity.best_score.dices_coefficient_similar > 0 + if best_similarity and (best_similarity.best_score.dices_coefficient_similar > 0 or (needle.words & best_similarity.wrapper2.words).any?) winner = best_similarity.wrapper2.record if gather_last_result last_result.winner = winner last_result.score = best_similarity.best_score.dices_coefficient_similar last_result.timeline << <<-EOS -A winner was determined because the similarity score #{best_similarity.best_score.dices_coefficient_similar} is greater than zero. +A winner was determined because the Dice's Coefficient similarity (#{best_similarity.best_score.dices_coefficient_similar}) is greater than zero or because it shared a word with the needle. EOS end elsif gather_last_result last_result.timeline << <<-EOS -No winner assigned because similarity score was zero. +No winner assigned because the score of the best similarity (#{best_similarity.try(:wrapper2).try(:record).try(:inspect)}) was zero and it didn't match any words with the needle (#{needle.inspect}). EOS end winner end