lib/fuzzy_match.rb in fuzzy_match-1.5.0 vs lib/fuzzy_match.rb in fuzzy_match-2.0.0
- old
+ new
@@ -1,12 +1,10 @@
require 'fuzzy_match/rule'
-require 'fuzzy_match/rule/normalizer'
-require 'fuzzy_match/rule/stop_word'
require 'fuzzy_match/rule/grouping'
require 'fuzzy_match/rule/identity'
require 'fuzzy_match/result'
-require 'fuzzy_match/wrapper'
+require 'fuzzy_match/record'
require 'fuzzy_match/similarity'
require 'fuzzy_match/score'
# See the README for more information.
class FuzzyMatch
@@ -31,195 +29,179 @@
end
end
DEFAULT_ENGINE = :pure_ruby
+ #TODO refactor at least all the :find_X things
DEFAULT_OPTIONS = {
- :first_grouping_decides => false,
:must_match_grouping => false,
:must_match_at_least_one_word => false,
:gather_last_result => false,
:find_all => false,
:find_all_with_score => false,
:threshold => 0,
:find_best => false,
+ :find_with_score => false,
}
self.engine = DEFAULT_ENGINE
attr_reader :haystack
attr_reader :groupings
attr_reader :identities
- attr_reader :normalizers
attr_reader :stop_words
attr_accessor :read
attr_reader :default_options
# haystack - a bunch of records that will compete to see who best matches the needle
#
# Rules (can only be specified at initialization or by using a setter)
- # * :<tt>normalizers</tt> - regexps (see README)
# * :<tt>identities</tt> - regexps
# * :<tt>groupings</tt> - regexps
# * :<tt>stop_words</tt> - regexps
# * :<tt>read</tt> - how to interpret each record in the 'haystack', either a Proc or a symbol
#
# Options (can be specified at initialization or when calling #find)
# * :<tt>must_match_grouping</tt> - don't return a match unless the needle fits into one of the groupings you specified
# * :<tt>must_match_at_least_one_word</tt> - don't return a match unless the needle shares at least one word with the match
- # * :<tt>first_grouping_decides</tt> - force records into the first grouping they match, rather than choosing a grouping that will give them a higher score
# * :<tt>gather_last_result</tt> - enable <tt>last_result</tt>
# * :<tt>threshold</tt> - set a score threshold below which not to return results (not generally recommended - please test the results of setting a threshold thoroughly - one set of results and their scores probably won't be enough to determine the appropriate number). Only checked against the Pair Distance score and ignored when one string or the other is of length 1.
- def initialize(competitors, options_and_rules = {})
- options_and_rules = options_and_rules.dup
+ def initialize(haystack, options_and_rules = {})
+ o = options_and_rules.dup
# rules
- self.groupings = options_and_rules.delete(:groupings) || options_and_rules.delete(:blockings) || []
- self.identities = options_and_rules.delete(:identities) || []
- self.normalizers = options_and_rules.delete(:normalizers) || options_and_rules.delete(:tighteners) || []
- self.stop_words = options_and_rules.delete(:stop_words) || []
- @read = options_and_rules.delete(:read) || options_and_rules.delete(:haystack_reader)
-
+ @read = o.delete(:read) || o.delete(:haystack_reader)
+ @groupings = (o.delete(:groupings) || o.delete(:blockings) || []).map { |regexp| Rule::Grouping.make(regexp) }.flatten
+ @identities = (o.delete(:identities) || []).map { |regexp| Rule::Identity.new(regexp) }
+ @stop_words = o.delete(:stop_words) || []
+
# options
- if deprecated = options_and_rules.delete(:first_blocking_decides)
- options_and_rules[:first_grouping_decides] = deprecated
+ if deprecated = o.delete(:must_match_blocking)
+ o[:must_match_grouping] = deprecated
end
- if deprecated = options_and_rules.delete(:must_match_blocking)
- options_and_rules[:must_match_grouping] = deprecated
- end
- @default_options = DEFAULT_OPTIONS.merge(options_and_rules).freeze
+ @default_options = DEFAULT_OPTIONS.merge(o).freeze
- # do this last
- self.haystack = competitors
+ @haystack = haystack.map { |original| Record.new original, stop_words: @stop_words, read: @read }
end
-
- def groupings=(ary)
- @groupings = ary.map { |regexp| Rule::Grouping.new regexp }
- end
-
- def identities=(ary)
- @identities = ary.map { |regexp| Rule::Identity.new regexp }
- end
-
- def normalizers=(ary)
- @normalizers = ary.map { |regexp| Rule::Normalizer.new regexp }
- end
-
- def stop_words=(ary)
- @stop_words = ary.map { |regexp| Rule::StopWord.new regexp }
- end
-
- def haystack=(ary)
- @haystack = ary.map { |competitor| Wrapper.new self, competitor }
- end
-
+
def last_result
- @last_result || raise(::RuntimeError, "[fuzzy_match] You can't access the last result until you've run a find with :gather_last_result => true")
+ @last_result or raise("You can't access the last result until you've run a find with :gather_last_result => true")
end
+ # Return everything in sorted order
def find_all(needle, options = {})
  # Delegate to #find with the :find_all flag forced on. Hash#merge returns a
  # new hash, so the caller's options are left unmodified; a caller-supplied
  # :find_all value is overridden.
  find(needle, options.merge(:find_all => true))
end
+ # Return the top results with the same score
def find_best(needle, options = {})
  # The flag this entry point always switches on, regardless of what the
  # caller passed for it.
  forced = { :find_best => true }
  # Merge onto a copy of the caller's options and hand off to #find.
  find needle, options.merge(forced)
end
+ # Return everything in sorted order with score
def find_all_with_score(needle, options = {})
  # Build a fresh options hash with :find_all_with_score set to true (the
  # caller's hash is not touched), then let #find do the actual work.
  with_flag = options.merge(:find_all_with_score => true)
  find(needle, with_flag)
end
+
+ # Return one with score
+ def find_with_score(needle, options = {})
+ options = options.merge(:find_with_score => true)
+ find needle, options
+ end
def find(needle, options = {})
options = default_options.merge options
threshold = options[:threshold]
gather_last_result = options[:gather_last_result]
is_find_all_with_score = options[:find_all_with_score]
+ is_find_with_score = options[:find_with_score]
is_find_best = options[:find_best]
is_find_all = options[:find_all] || is_find_all_with_score || is_find_best
- first_grouping_decides = options[:first_grouping_decides]
must_match_grouping = options[:must_match_grouping]
must_match_at_least_one_word = options[:must_match_at_least_one_word]
if gather_last_result
@last_result = Result.new
last_result.read = read
last_result.haystack = haystack
last_result.options = options
- last_result.timeline << <<-EOS
-Options were set, either by you or by falling back to defaults.
-\tOptions: #{options.inspect}
-EOS
end
if gather_last_result
- last_result.normalizers = normalizers
last_result.identities = identities
last_result.groupings = groupings
last_result.stop_words = stop_words
end
- needle = Wrapper.new self, needle, true
+ needle = Record.new needle
if gather_last_result
last_result.needle = needle
- last_result.timeline << <<-EOS
-The needle's #{needle.variants.length} variants were enumerated.
-\tVariants: #{needle.variants.map(&:inspect).join(', ')}
-EOS
end
-
- if must_match_grouping and groupings.any? and groupings.none? { |grouping| grouping.match? needle }
+
+ if groupings.any?
+ first_grouping = groupings.detect { |grouping| grouping.xmatch? needle }
if gather_last_result
+ if first_grouping
+ last_result.timeline << "Grouping: #{first_grouping.inspect}"
+ else
+ last_result.timeline << "No grouping."
+ end
+ end
+ end
+
+ if must_match_grouping and not first_grouping
+ if gather_last_result
last_result.timeline << <<-EOS
-The needle didn't match any of the #{groupings.length} grouping, which was a requirement.
-\tGroupings (first 3): #{groupings[0,3].map(&:inspect).join(', ')}
+The needle didn't match any of the #{groupings.length} groupings, which was a requirement.
+\t#{groupings.map(&:inspect).join("\n\t")}
EOS
end
-
if is_find_all
return []
else
return nil
end
end
+ if groupings.any? and not first_grouping
+ passed_grouping_requirement = haystack.reject do |straw|
+ groupings.any? { |grouping| grouping.xmatch? straw }
+ end
+ else
+ passed_grouping_requirement = haystack
+ end
+
if must_match_at_least_one_word
- passed_word_requirement = haystack.select do |straw|
+ passed_word_requirement = passed_grouping_requirement.select do |straw|
(needle.words & straw.words).any?
end
if gather_last_result
last_result.timeline << <<-EOS
Since :must_match_at_least_one_word => true, the competition was reduced to records sharing at least one word with the needle.
\tNeedle words: #{needle.words.map(&:inspect).join(', ')}
-\tPassed (first 3): #{passed_word_requirement[0,3].map(&:render).map(&:inspect).join(', ')}
-\tFailed (first 3): #{(haystack-passed_word_requirement)[0,3].map(&:render).map(&:inspect).join(', ')}
+\tPassed (first 3): #{passed_word_requirement[0,3].map(&:inspect).join(', ')}
+\tFailed (first 3): #{(passed_grouping_requirement-passed_word_requirement)[0,3].map(&:inspect).join(', ')}
EOS
end
else
- passed_word_requirement = haystack
+ passed_word_requirement = passed_grouping_requirement
end
-
- if groupings.any?
+
+ if first_grouping
joint = passed_word_requirement.select do |straw|
- if first_grouping_decides
- if first_grouping = groupings.detect { |grouping| grouping.match? needle }
- first_grouping.join? needle, straw
- end
- else
- groupings.any? { |grouping| grouping.join? needle, straw }
- end
+ first_grouping.xjoin? needle, straw
end
+ # binding.pry
if gather_last_result
last_result.timeline << <<-EOS
-Since there were groupings, the competition was reduced to records in the same group as the needle.
-\tGroupings (first 3): #{groupings[0,3].map(&:inspect).join(', ')}
-\tPassed (first 3): #{joint[0,3].map(&:render).map(&:inspect).join(', ')}
-\tFailed (first 3): #{(passed_word_requirement-joint)[0,3].map(&:render).map(&:inspect).join(', ')}
+Since there were groupings, the competition was reduced to #{joint.length} records in the same group as the needle.
+\t#{joint.map(&:inspect).join("\n\t")}
EOS
end
else
joint = passed_word_requirement.dup
end
@@ -250,12 +232,12 @@
end
if gather_last_result
last_result.timeline << <<-EOS
Since there were identities, the competition was reduced to records that might be identical to the needle (in other words, are not certainly different)
\tIdentities (first 10 of #{identities.length}): #{identities[0,9].map(&:inspect).join(', ')}
-\tPassed (first 10 of #{possibly_identical.length}): #{possibly_identical[0,9].map(&:render).map(&:inspect).join(', ')}
-\tFailed (first 10 of #{(joint-possibly_identical).length}): #{(joint-possibly_identical)[0,9].map(&:render).map(&:inspect).join(', ')}
+\tPassed (first 10 of #{possibly_identical.length}): #{possibly_identical[0,9].map(&:inspect).join(', ')}
+\tFailed (first 10 of #{(joint-possibly_identical).length}): #{(joint-possibly_identical)[0,9].map(&:inspect).join(', ')}
EOS
end
else
possibly_identical = joint.dup
end
@@ -263,20 +245,20 @@
similarities = possibly_identical.map { |straw| needle.similarity straw }.sort.reverse
if gather_last_result
last_result.timeline << <<-EOS
The competition was sorted in order of similarity to the needle.
-\tSimilar (first 10 of #{similarities.length}): #{similarities[0,9].map { |s| "#{s.wrapper2.render.inspect} (#{[s.best_score.dices_coefficient_similar, s.best_score.levenshtein_similar].map { |v| '%0.5f' % v }.join('/')})" }.join(', ')}
+\t#{similarities[0,9].map { |s| "#{s.record2.similarity(needle).inspect}" }.join("\n\t")}
EOS
end
if is_find_all_with_score
memo = []
similarities.each do |similarity|
if similarity.satisfy?(needle, threshold)
bs = similarity.best_score
- memo << [similarity.wrapper2.record, bs.dices_coefficient_similar, bs.levenshtein_similar]
+ memo << [similarity.record2.original, bs.dices_coefficient_similar, bs.levenshtein_similar]
end
end
return memo
end
@@ -286,11 +268,11 @@
similarities.each do |similarity|
if similarity.satisfy?(needle, threshold)
bs = similarity.best_score
best_bs ||= bs
if bs >= best_bs
- memo << similarity.wrapper2.record
+ memo << similarity.record2.original
else
break
end
end
end
@@ -299,48 +281,50 @@
if is_find_all
memo = []
similarities.each do |similarity|
if similarity.satisfy?(needle, threshold)
- memo << similarity.wrapper2.record
+ memo << similarity.record2.original
end
end
return memo
end
best_similarity = similarities.first
winner = nil
if best_similarity and best_similarity.satisfy?(needle, threshold)
- winner = best_similarity.wrapper2.record
+ winner = best_similarity.record2.original
if gather_last_result
last_result.winner = winner
last_result.score = best_similarity.best_score.dices_coefficient_similar
last_result.timeline << <<-EOS
-A winner was determined because the Dice's Coefficient similarity (#{best_similarity.best_score.dices_coefficient_similar}) is greater than zero or because it shared a word with the needle.
+A winner was determined because the Dice's Coefficient similarity (#{'%0.5f' % best_similarity.best_score.dices_coefficient_similar}) is greater than zero or because it shared a word with the needle.
EOS
end
+ if is_find_with_score
+ bs = best_similarity.best_score
+ return [winner, bs.dices_coefficient_similar, bs.levenshtein_similar]
+ else
+ return winner
+ end
elsif gather_last_result
- best_similarity_record = if best_similarity and best_similarity.wrapper2
- best_similarity.wrapper2.record
+ best_similarity_record = if best_similarity and best_similarity.record2
+ best_similarity.record2.original
end
last_result.timeline << <<-EOS
No winner assigned because the score of the best similarity (#{best_similarity_record.inspect}) was zero and it didn't match any words with the needle (#{needle.inspect}).
EOS
end
-
- winner
+
+ nil # ugly
end
# Explain is like MySQL's EXPLAIN command. You give it a needle and it tells you how it was located (successfully or not) in the haystack.
#
# d = FuzzyMatch.new ['737', '747', '757' ]
# d.explain 'boeing 737-100'
def explain(needle, options = {})
find needle, options.merge(:gather_last_result => true)
last_result.explain
- end
-
- # DEPRECATED
- def free
end
end