require 'to_regexp'
require 'fuzzy_match/rule'
require 'fuzzy_match/rule/normalizer'
require 'fuzzy_match/rule/stop_word'
require 'fuzzy_match/rule/grouping'
require 'fuzzy_match/rule/identity'
require 'fuzzy_match/result'
require 'fuzzy_match/wrapper'
require 'fuzzy_match/similarity'
require 'fuzzy_match/score'
# See the README for more information.
class FuzzyMatch
  class << self
    # The symbol naming the scoring engine in use (see .score_class).
    attr_accessor :engine

    # Resolve the configured engine symbol to its score class.
    #
    # Returns Score::PureRuby for :pure_ruby and Score::Amatch for :amatch.
    # Raises ArgumentError for any other engine value.
    def score_class
      case engine
      when :pure_ruby
        Score::PureRuby
      when :amatch
        Score::Amatch
      else
        raise ::ArgumentError, "[fuzzy_match] #{engine.inspect} is not a recognized engine."
      end
    end
  end

  DEFAULT_ENGINE = :pure_ruby

  # Fallback values for every option #find understands.
  DEFAULT_OPTIONS = {
    :first_grouping_decides => false,
    :must_match_grouping => false,
    :must_match_at_least_one_word => false,
    :gather_last_result => false,
    :find_all => false
  }

  self.engine = DEFAULT_ENGINE

  attr_reader :haystack
  attr_reader :groupings
  attr_reader :identities
  attr_reader :normalizers
  attr_reader :stop_words
  attr_reader :read
  attr_reader :default_options

  # competitors - a bunch of records (the haystack) that will compete to see who best matches the needle
  #
  # Rules (can only be specified at initialization or by using a setter)
  # * :normalizers - regexps (see README); :tighteners is accepted as a deprecated alias
  # * :identities - regexps
  # * :groupings - regexps; :blockings is accepted as a deprecated alias
  # * :stop_words - regexps
  #
  # Options (can be specified at initialization or when calling #find)
  # * :read - how to interpret each record in the 'haystack', either a Proc or a symbol
  #   (:haystack_reader is accepted as a deprecated alias; only read at initialization)
  # * :must_match_grouping - don't return a match unless the needle fits into one of the groupings you specified
  #   (:must_match_blocking is accepted as a deprecated alias)
  # * :must_match_at_least_one_word - don't return a match unless the needle shares at least one word with the match
  # * :first_grouping_decides - force records into the first grouping they match, rather than choosing a grouping that will give them a higher score
  #   (:first_blocking_decides is accepted as a deprecated alias)
  # * :gather_last_result - enable last_result
  # * :find_all - return every surviving record, ordered by similarity, instead of only the winner
  def initialize(competitors, options_and_rules = {})
    # Work on a copy so the caller's hash is not emptied by the deletes below.
    options_and_rules = options_and_rules.dup

    # rules
    self.groupings = options_and_rules.delete(:groupings) || options_and_rules.delete(:blockings) || []
    self.identities = options_and_rules.delete(:identities) || []
    self.normalizers = options_and_rules.delete(:normalizers) || options_and_rules.delete(:tighteners) || []
    self.stop_words = options_and_rules.delete(:stop_words) || []
    @read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader)

    # options: translate deprecated names to their current equivalents
    if deprecated = options_and_rules.delete(:first_blocking_decides)
      options_and_rules[:first_grouping_decides] = deprecated
    end
    if deprecated = options_and_rules.delete(:must_match_blocking)
      options_and_rules[:must_match_grouping] = deprecated
    end
    @default_options = DEFAULT_OPTIONS.merge(options_and_rules).freeze

    # do this last: each record is wrapped with a reference to this matcher
    self.haystack = competitors
  end

  # Wrap each element of ary in a Rule::Grouping.
  def groupings=(ary)
    @groupings = ary.map { |regexp_or_str| Rule::Grouping.new regexp_or_str }
  end

  # Wrap each element of ary in a Rule::Identity.
  def identities=(ary)
    @identities = ary.map { |regexp_or_str| Rule::Identity.new regexp_or_str }
  end

  # Wrap each element of ary in a Rule::Normalizer.
  def normalizers=(ary)
    @normalizers = ary.map { |regexp_or_str| Rule::Normalizer.new regexp_or_str }
  end

  # Wrap each element of ary in a Rule::StopWord.
  def stop_words=(ary)
    @stop_words = ary.map { |regexp_or_str| Rule::StopWord.new regexp_or_str }
  end

  # Wrap each record of ary in a Wrapper bound to this matcher.
  def haystack=(ary)
    @haystack = ary.map { |competitor| Wrapper.new self, competitor }
  end

  # The Result gathered by the most recent #find run with :gather_last_result => true.
  #
  # Raises RuntimeError if no such run has happened yet.
  def last_result
    @last_result || raise(::RuntimeError, "[fuzzy_match] You can't access the last result until you've run a find with :gather_last_result => true")
  end

  # Like #find, but with :find_all forced on, so an Array of records is returned.
  def find_all(needle, options = {})
    options = options.merge(:find_all => true)
    find needle, options
  end

  # Find the record in the haystack that best matches needle.
  #
  # needle  - the thing to look for; it is wrapped in a Wrapper before matching
  # options - per-call overrides merged over #default_options
  #
  # The haystack is narrowed in stages (shared words, groupings, identities),
  # and the survivors are sorted by similarity to the needle.
  #
  # Returns the winning record or nil. With :find_all => true, returns an Array
  # of all surviving records, most similar first (possibly empty).
  def find(needle, options = {})
    options = default_options.merge options
    gather_last_result = options[:gather_last_result]
    is_find_all = options[:find_all]
    first_grouping_decides = options[:first_grouping_decides]
    must_match_grouping = options[:must_match_grouping]
    must_match_at_least_one_word = options[:must_match_at_least_one_word]

    if gather_last_result
      @last_result = Result.new
      last_result.read = read
      last_result.haystack = haystack
      last_result.options = options
      last_result.timeline << <<-EOS
Options were set, either by you or by falling back to defaults.
\tOptions: #{options.inspect}
EOS
      last_result.normalizers = normalizers
      last_result.identities = identities
      last_result.groupings = groupings
      last_result.stop_words = stop_words
    end

    needle = Wrapper.new self, needle, true

    if gather_last_result
      last_result.needle = needle
      last_result.timeline << <<-EOS
The needle's #{needle.variants.length} variants were enumerated.
\tVariants: #{needle.variants.map(&:inspect).join(', ')}
EOS
    end

    # Stage 0: bail out early if the needle itself must fit a grouping and fits none.
    if must_match_grouping && groupings.any? && groupings.none? { |grouping| grouping.match? needle }
      if gather_last_result
        last_result.timeline << <<-EOS
The needle didn't match any of the #{groupings.length} groupings, which was a requirement.
\tGroupings (first 3): #{groupings[0,3].map(&:inspect).join(', ')}
EOS
      end
      return(is_find_all ? [] : nil)
    end

    # Stage 1: optionally keep only records sharing at least one word with the needle.
    if must_match_at_least_one_word
      passed_word_requirement = haystack.select do |straw|
        (needle.words & straw.words).any?
      end
      if gather_last_result
        last_result.timeline << <<-EOS
Since :must_match_at_least_one_word => true, the competition was reduced to records sharing at least one word with the needle.
\tNeedle words: #{needle.words.map(&:inspect).join(', ')}
\tPassed (first 3): #{passed_word_requirement[0,3].map(&:render).map(&:inspect).join(', ')}
\tFailed (first 3): #{(haystack-passed_word_requirement)[0,3].map(&:render).map(&:inspect).join(', ')}
EOS
      end
    else
      passed_word_requirement = haystack
    end

    # Stage 2: keep only records that some grouping joins with the needle.
    if groupings.any?
      if first_grouping_decides
        # The first grouping the needle matches does not depend on the straw,
        # so it is looked up once instead of once per straw.
        first_grouping = groupings.detect { |grouping| grouping.match? needle }
        joint = if first_grouping
          passed_word_requirement.select { |straw| first_grouping.join? needle, straw }
        else
          []
        end
      else
        joint = passed_word_requirement.select do |straw|
          groupings.any? { |grouping| grouping.join? needle, straw }
        end
      end
      if gather_last_result
        last_result.timeline << <<-EOS
Since there were groupings, the competition was reduced to records in the same group as the needle.
\tGroupings (first 3): #{groupings[0,3].map(&:inspect).join(', ')}
\tPassed (first 3): #{joint[0,3].map(&:render).map(&:inspect).join(', ')}
\tFailed (first 3): #{(passed_word_requirement-joint)[0,3].map(&:render).map(&:inspect).join(', ')}
EOS
      end
    else
      joint = passed_word_requirement.dup
    end

    if joint.none?
      if must_match_grouping
        if gather_last_result
          last_result.timeline << <<-EOS
Since :must_match_grouping => true and none of the competition was in the same group as the needle, the search stopped.
EOS
        end
        return(is_find_all ? [] : nil)
      else
        # Nothing survived the grouping stage and grouping is not mandatory:
        # fall back to everything that passed the word requirement.
        joint = passed_word_requirement.dup
      end
    end

    # Stage 3: drop records that an identity rule says are certainly different.
    if identities.any?
      possibly_identical = joint.select do |straw|
        identities.all? do |identity|
          answer = identity.identical? needle, straw
          # nil means the rule has no verdict for this pair, so the straw stays in.
          answer.nil? || answer == true
        end
      end
      if gather_last_result
        certainly_different = joint - possibly_identical
        last_result.timeline << <<-EOS
Since there were identities, the competition was reduced to records that might be identical to the needle (in other words, are not certainly different)
\tIdentities (first 10 of #{identities.length}): #{identities.first(10).map(&:inspect).join(', ')}
\tPassed (first 10 of #{possibly_identical.length}): #{possibly_identical.first(10).map(&:render).map(&:inspect).join(', ')}
\tFailed (first 10 of #{certainly_different.length}): #{certainly_different.first(10).map(&:render).map(&:inspect).join(', ')}
EOS
      end
    else
      possibly_identical = joint.dup
    end

    # Stage 4: rank the survivors, most similar first.
    similarities = possibly_identical.map { |straw| needle.similarity straw }.sort.reverse

    if gather_last_result
      similar_summary = similarities.first(10).map do |s|
        scores = [s.best_score.dices_coefficient_similar, s.best_score.levenshtein_similar].map { |v| '%0.5f' % v }.join('/')
        "#{s.wrapper2.render.inspect} (#{scores})"
      end.join(', ')
      last_result.timeline << <<-EOS
The competition was sorted in order of similarity to the needle.
\tSimilar (first 10 of #{similarities.length}): #{similar_summary}
EOS
    end

    if is_find_all
      return similarities.map { |similarity| similarity.wrapper2.record }
    end

    best_similarity = similarities.first
    winner = nil

    # The best candidate only wins with a positive Dice's coefficient or a shared word.
    if best_similarity && (best_similarity.best_score.dices_coefficient_similar > 0 || (needle.words & best_similarity.wrapper2.words).any?)
      winner = best_similarity.wrapper2.record
      if gather_last_result
        last_result.winner = winner
        last_result.score = best_similarity.best_score.dices_coefficient_similar
        last_result.timeline << <<-EOS
A winner was determined because the Dice's Coefficient similarity (#{best_similarity.best_score.dices_coefficient_similar}) is greater than zero or because it shared a word with the needle.
EOS
      end
    elsif gather_last_result
      best_similarity_record = if best_similarity && best_similarity.wrapper2
        best_similarity.wrapper2.record
      end
      last_result.timeline << <<-EOS
No winner assigned because the score of the best similarity (#{best_similarity_record.inspect}) was zero and it didn't match any words with the needle (#{needle.inspect}).
EOS
    end

    winner
  end

  # Explain is like mysql's EXPLAIN command. You give it a needle and it tells you about how it was located (successfully or not) in the haystack.
  #
  # It runs #find with :gather_last_result => true and then calls #explain on the gathered result.
  #
  #     d = FuzzyMatch.new ['737', '747', '757' ]
  #     d.explain 'boeing 737-100'
  def explain(needle, options = {})
    find needle, options.merge(:gather_last_result => true)
    last_result.explain
  end

  # DEPRECATED: does nothing and returns nil.
  def free
  end
end