lib/perobs/FuzzyStringMatcher.rb in perobs-4.2.0 vs lib/perobs/FuzzyStringMatcher.rb in perobs-4.3.0
- old
+ new
@@ -24,44 +24,46 @@
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
require 'perobs/Log'
-require 'perobs/ObjectBase'
+require 'perobs/Object'
module PEROBS
# The fuzzy string matcher can be used to perform a fuzzy string search
# against a known set of strings. The dictionary of known strings does not
- # store the actual strings but references to arbitrary objects. These could
- # be the string, but can be something else related to the learned strings.
- # To use this class a list of strings with their references must be learned.
- # Once the dictionary has been established, fuzzy matches can be done.
- class FuzzyStringMatcher
+ # store the actual strings but references to String or PEROBS objects.
+ # Once the dictionary has been established, fuzzy matches can be done. Since
+ # the actual input strings are not directly stored, you cannot remove or
+ # modified already stored strings. To remove strings, you have to clear the
+ # matcher and add the strings again that you want to keep.
+ class FuzzyStringMatcher < PEROBS::Object
+ attr_persist :case_sensitive, :n, :dict
+
# Create a new FuzzyStringMatcher.
- # @param store [PEROBS::Store] place to store the dictionary
- # @param name [String] Unique name of the string matcher
+ # @param p [PEROBS::Store] place to store the dictionary
# @param case_sensitive [Boolean] True if case matters for matching
# @param n [Integer] Determines what kind of n-gramm is used to store the
# references in the dictionary. It also determines the minimum word
- # length that can be used for fuzzy matches.
- def initialize(store, name, case_sensitive = false, n = 4)
- @store = store
- @dict_name = "FuzzyStringMatcher::#{name}"
+ # length that can be used for fuzzy matches. Values between 2 and
+ # 10 are supported. The default is 4.
+ def initialize(p, case_sensitive = false, n = 4)
+ super(p)
if n < 2 || n > 10
raise ArgumentError, 'n must be between 2 and 10'
end
- @case_sensitive = case_sensitive
- @n = n
+ self.case_sensitive = case_sensitive
+ self.n = n
- clear unless (@dict = @store[@dict_name])
+ clear unless @dict
end
# Wipe the dictionary.
def clear
- @store[@dict_name] = @dict = @store.new(BigHash)
+ self.dict = @store.new(BigHash)
end
# Add a string with its reference to the dictionary.
# @param string [String] The string to store
# @param reference [Object] Any object that is associated with the string
@@ -77,15 +79,12 @@
each_n_gramm(string) do |n_gramm|
unless (ng_list = @dict[n_gramm])
@dict[n_gramm] = ng_list = @store.new(Hash)
end
- if ng_list.include?(reference)
- ng_list[reference] += 1
- else
- ng_list[reference] = 0
- end
+ # We use the Hash as a Set. The value doesn't matter.
+ ng_list[reference] = true unless ng_list.include?(reference)
end
nil
end
@@ -107,47 +106,41 @@
# Enclose string in 'start of text' and 'end of text' ASCII values.
string = "\002" + string + "\003"
matches = {}
- # This will be the best possible score for a perfect match.
- best_possible_score = 0
each_n_gramm(string) do |n_gramm|
- best_possible_score += 1
if (ng_list = @dict[n_gramm])
- ng_list.each do |reference, count|
+ ng_list.each do |reference, dummy|
if matches.include?(reference)
matches[reference] += 1
else
- # We use internally a 10 times larger list so that we don't
- # throw away good matches too early. If the max_count value is
- # chosen too small there is a risk of not finding the best
- # matches!
- if matches.size > 10 * max_count
- matches = discard_worst_match(matches)
- end
matches[reference] = 1
end
end
end
end
return [] if matches.empty?
- # Sort in the order of occurance count downwards.
- match_list = matches.to_a.sort do |a, b|
- b[1] <=> a[1]
- end
+ match_list = matches.to_a
# Set occurance counters to scores relative to the best possible score.
+ # This will be the best possible score for a perfect match.
+ best_possible_score = string.length - @n + 1
match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
- # Delete all matches that occured less than half as often than the
- # top match.
+ # Delete all matches that don't have the required minimum match score.
match_list.delete_if { |a| a[1] < min_score }
- match_list[0..max_count]
+ # Sort the list best to worst match
+ match_list.sort! do |a, b|
+ b[1] <=> a[1]
+ end
+
+ # Return the top max_count matches.
+ match_list[0..max_count - 1]
end
# Returns some internal stats about the dictionary.
def stats
s = {}
@@ -172,19 +165,9 @@
0.upto(string.length - @n) do |i|
n_gramm = string[i, @n]
yield(n_gramm)
end
- end
-
- def discard_worst_match(matches)
- # Sort in the order of occurance count downwards.
- match_list = matches.to_a.sort do |a, b|
- b[1] <=> a[1]
- end
- # Discard the lowest half of the matches
- match_list = match_list[0..match_list.length / 2]
- match_list.to_h
end
end
end