lib/fuzzy_match.rb in fuzzy_match-1.0.5 vs lib/fuzzy_match.rb in fuzzy_match-1.1.0

- old
+ new

@@ -15,39 +15,78 @@ autoload :Wrapper, 'fuzzy_match/wrapper' autoload :Similarity, 'fuzzy_match/similarity' autoload :Score, 'fuzzy_match/score' autoload :CachedResult, 'fuzzy_match/cached_result' + DEFAULT_OPTIONS = { + :first_blocking_decides => false, + :must_match_blocking => false, + :must_match_at_least_one_word => false, + :gather_last_result => false, + :find_all => false + } + attr_reader :haystack attr_reader :blockings attr_reader :identities attr_reader :tighteners attr_reader :stop_words - attr_reader :default_first_blocking_decides - attr_reader :default_must_match_blocking - attr_reader :default_must_match_at_least_one_word + attr_reader :read + attr_reader :default_options - # haystack - a bunch of records - # options + # haystack - a bunch of records that will compete to see who best matches the needle + # + # rules (can only be specified at initialization or by using a setter) # * tighteners: regexps (see readme) # * identities: regexps # * blockings: regexps # * stop_words: regexps # * read: how to interpret each entry in the 'haystack', either a Proc or a symbol - def initialize(records, options = {}) - options = options.symbolize_keys - @default_first_blocking_decides = options[:first_blocking_decides] - @default_must_match_blocking = options[:must_match_blocking] - @default_must_match_at_least_one_word = options[:must_match_at_least_one_word] - @blockings = options.fetch(:blockings, []).map { |regexp_or_str| Blocking.new regexp_or_str } - @identities = options.fetch(:identities, []).map { |regexp_or_str| Identity.new regexp_or_str } - @tighteners = options.fetch(:tighteners, []).map { |regexp_or_str| Tightener.new regexp_or_str } - @stop_words = options.fetch(:stop_words, []).map { |regexp_or_str| StopWord.new regexp_or_str } - read = options[:read] || options[:haystack_reader] - @haystack = records.map { |record| Wrapper.new self, record, read } + # + # options (can be specified at initialization or when calling #find) + # * first_blocking_decides + # * must_match_blocking + # * must_match_at_least_one_word + # * gather_last_result + # * find_all + def initialize(competitors, options_and_rules = {}) + options_and_rules = options_and_rules.symbolize_keys + + # rules + self.blockings = options_and_rules.delete(:blockings) || [] + self.identities = options_and_rules.delete(:identities) || [] + self.tighteners = options_and_rules.delete(:tighteners) || [] + self.stop_words = options_and_rules.delete(:stop_words) || [] + @read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader) + + # options + @default_options = options_and_rules.reverse_merge(DEFAULT_OPTIONS).freeze + + # do this last + self.haystack = competitors end + def blockings=(ary) + @blockings = ary.map { |regexp_or_str| Blocking.new regexp_or_str } + end + + def identities=(ary) + @identities = ary.map { |regexp_or_str| Identity.new regexp_or_str } + end + + def tighteners=(ary) + @tighteners = ary.map { |regexp_or_str| Tightener.new regexp_or_str } + end + + def stop_words=(ary) + @stop_words = ary.map { |regexp_or_str| StopWord.new regexp_or_str } + end + + def haystack=(ary) + @haystack = ary.map { |competitor| Wrapper.new self, competitor } + end + def last_result @last_result || raise(::RuntimeError, "[fuzzy_match] You can't access the last result until you've run a find with :gather_last_result => true") end def find_all(needle, options = {}) @@ -56,183 +95,175 @@ end def find(needle, options = {}) raise ::RuntimeError, "[fuzzy_match] Dictionary has already been freed, can't perform more finds" if freed? - options = options.symbolize_keys - gather_last_result = options.fetch(:gather_last_result, false) - is_find_all = options.fetch(:find_all, false) - first_blocking_decides = options.fetch(:first_blocking_decides, default_first_blocking_decides) - must_match_blocking = options.fetch(:must_match_blocking, default_must_match_blocking) - must_match_at_least_one_word = options.fetch(:must_match_at_least_one_word, default_must_match_at_least_one_word) + options = options.symbolize_keys.reverse_merge default_options + gather_last_result = options[:gather_last_result] + is_find_all = options[:find_all] + first_blocking_decides = options[:first_blocking_decides] + must_match_blocking = options[:must_match_blocking] + must_match_at_least_one_word = options[:must_match_at_least_one_word] + if gather_last_result free_last_result @last_result = Result.new + last_result.read = read + last_result.haystack = haystack + last_result.options = options + last_result.timeline << <<-EOS +Options were set, either by you or by falling back to defaults. +\tOptions: #{options.inspect} +EOS end if gather_last_result last_result.tighteners = tighteners last_result.identities = identities last_result.blockings = blockings last_result.stop_words = stop_words end - needle = Wrapper.new self, needle + needle = Wrapper.new self, needle, true if gather_last_result last_result.needle = needle + last_result.timeline << <<-EOS +The needle's #{needle.variants.length} variants were enumerated. +\tVariants: #{needle.variants.map(&:inspect).join(', ')} +EOS end if must_match_blocking and blockings.any? and blockings.none? { |blocking| blocking.match? needle } + if gather_last_result + last_result.timeline << <<-EOS +The needle didn't match any of the #{blockings.length} blocking, which was a requirement. +\tBlockings (first 3): #{blockings[0,3].map(&:inspect).join(', ')} +EOS + end + if is_find_all return [] else return nil end end - candidates = if must_match_at_least_one_word - haystack.select do |straw| + if must_match_at_least_one_word + passed_word_requirement = haystack.select do |straw| (needle.words & straw.words).any? end + if gather_last_result + last_result.timeline << <<-EOS +Since :must_match_at_least_one_word => true, the competition was reduced to records sharing at least one word with the needle. +\tNeedle words: #{needle.words.map(&:inspect).join(', ')} +\tPassed (first 3): #{passed_word_requirement[0,3].map(&:render).map(&:inspect).join(', ')} +\tFailed (first 3): #{(haystack-passed_word_requirement)[0,3].map(&:render).map(&:inspect).join(', ')} +EOS + end else - haystack + passed_word_requirement = haystack end - if gather_last_result - last_result.candidates = candidates - end - - joint, disjoint = if blockings.any? - candidates.partition do |straw| + if blockings.any? + joint = passed_word_requirement.select do |straw| if first_blocking_decides blockings.detect { |blocking| blocking.match? needle }.try :join?, needle, straw else blockings.any? { |blocking| blocking.join? needle, straw } end end + if gather_last_result + last_result.timeline << <<-EOS +Since there were blockings, the competition was reduced to records in the same block as the needle. +\tBlockings (first 3): #{blockings[0,3].map(&:inspect).join(', ')} +\tPassed (first 3): #{joint[0,3].map(&:render).map(&:inspect).join(', ')} +\tFailed (first 3): #{(passed_word_requirement-joint)[0,3].map(&:render).map(&:inspect).join(', ')} +EOS + end else - [ candidates.dup, [] ] + joint = passed_word_requirement.dup end if joint.none? if must_match_blocking + if gather_last_result + last_result.timeline << <<-EOS +Since :must_match_at_least_one_word => true and none of the competition was in the same block as the needle, the search stopped. +EOS + end if is_find_all return [] else return nil end else - # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything - joint = disjoint - disjoint = [] + joint = passed_word_requirement.dup end end - - if gather_last_result - last_result.joint = joint - last_result.disjoint = disjoint - end - - possibly_identical, certainly_different = if identities.any? - joint.partition do |straw| + + if identities.any? + possibly_identical = joint.select do |straw| identities.all? do |identity| answer = identity.identical? needle, straw answer.nil? or answer == true end end + if gather_last_result + last_result.timeline << <<-EOS +Since there were identities, the competition was reduced to records that might be identical to the needle (in other words, are not certainly different) +\Identities (first 3): #{identities[0,3].map(&:inspect).join(', ')} +\tPassed (first 3): #{possibly_identical[0,3].map(&:render).map(&:inspect).join(', ')} +\tFailed (first 3): #{(joint-possibly_identical)[0,3].map(&:render).map(&:inspect).join(', ')} +EOS + end else - [ joint.dup, [] ] + possibly_identical = joint.dup end - + + similarities = possibly_identical.map { |straw| needle.similarity straw }.sort.reverse + if gather_last_result - last_result.possibly_identical = possibly_identical - last_result.certainly_different = certainly_different + last_result.timeline << <<-EOS +The competition was sorted in order of similarity to the needle. +\tSimilar (first 3): #{(similarities)[0,3].map(&:wrapper2).map(&:render).map(&:inspect).join(', ')} +EOS end if is_find_all - return possibly_identical.map { |straw| straw.record } + return similarities.map { |similarity| similarity.wrapper2.record } end - similarities = possibly_identical.map { |straw| needle.similarity straw }.sort - - if gather_last_result - last_result.similarities = similarities - end - - if best_similarity = similarities[-1] and best_similarity.best_score.dices_coefficient > 0 - record = best_similarity.wrapper2.record + winner = nil + + if best_similarity = similarities.first and best_similarity.best_score.dices_coefficient_similar > 0 + winner = best_similarity.wrapper2.record if gather_last_result - last_result.record = record - last_result.score = best_similarity.best_score.dices_coefficient + last_result.winner = winner + last_result.score = best_similarity.best_score.dices_coefficient_similar + last_result.timeline << <<-EOS +A winner was determined because the similarity score #{best_similarity.best_score.dices_coefficient_similar} is greater than zero. +EOS end - record + elsif gather_last_result + last_result.timeline << <<-EOS +No winner assigned because similarity score was zero. +EOS end + + winner end # Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack. # # d = FuzzyMatch.new ['737', '747', '757' ] # d.explain 'boeing 737-100' def explain(needle, options = {}) - record = find needle, options.merge(:gather_last_result => true) - log "#" * 150 - log "# Match #{needle.inspect} => #{record.inspect}" - log "#" * 150 - log - log "Needle" - log "-" * 150 - log last_result.needle.render - log - log "Stop words" - log last_result.stop_words.blank? ? '(none)' : last_result.stop_words.map { |stop_word| stop_word.inspect }.join("\n") - log - log "Candidates" - log "-" * 150 - log last_result.candidates.map { |record| record.render }.join("\n") - log - log "Tighteners" - log "-" * 150 - log last_result.tighteners.blank? ? '(none)' : last_result.tighteners.map { |tightener| tightener.inspect }.join("\n") - log - log "Blockings" - log "-" * 150 - log last_result.blockings.blank? ? '(none)' : last_result.blockings.map { |blocking| blocking.inspect }.join("\n") - log - log "Identities" - log "-" * 150 - log last_result.identities.blank? ? '(none)' : last_result.identities.map { |blocking| blocking.inspect }.join("\n") - log - log "Joint" - log "-" * 150 - log last_result.joint.blank? ? '(none)' : last_result.joint.map { |joint| joint.render }.join("\n") - log - log "Disjoint" - log "-" * 150 - log last_result.disjoint.blank? ? '(none)' : last_result.disjoint.map { |disjoint| disjoint.render }.join("\n") - log - log "Possibly identical" - log "-" * 150 - log last_result.possibly_identical.blank? ? '(none)' : last_result.possibly_identical.map { |possibly_identical| possibly_identical.render }.join("\n") - log - log "Certainly different" - log "-" * 150 - log last_result.certainly_different.blank? ? '(none)' : last_result.certainly_different.map { |certainly_different| certainly_different.render }.join("\n") - log - log "Similarities" - log "-" * 150 - log last_result.similarities.blank? ? '(none)' : last_result.similarities.reverse[0..9].map { |similarity| similarity.inspect }.join("\n") - log - log "Match" - log "-" * 150 - log record.inspect + find needle, options.merge(:gather_last_result => true) + last_result.explain end - - def log(str = '') #:nodoc: - $stderr.puts str - end - + def freed? @freed == true end def free