# encoding: UTF-8 # # = FuzzyStringMatcher.rb -- Persistent Ruby Object Store # # Copyright (c) 2020 by Chris Schlaeger # # MIT License # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. require 'perobs/Log' require 'perobs/ObjectBase' module PEROBS # The fuzzy string matcher can be used to perform a fuzzy string search # against a known set of strings. The dictionary of known strings does not # store the actual strings but references to arbitrary objects. These could # be the string, but can be something else related to the learned strings. # To use this class a list of strings with their references must be learned. # Once the dictionary has been established, fuzzy matches can be done. class FuzzyStringMatcher # Create a new FuzzyStringMatcher. # @param store [PEROBS::Store] place to store the dictionary # @param name [String] Unique name of the string matcher # @param case_sensitive [Boolean] True if case matters for matching # @param n [Integer] Determines what kind of n-gramm is used to store the # references in the dictionary. It also determines the minimum word # length that can be used for fuzzy matches. def initialize(store, name, case_sensitive = false, n = 4) @store = store @dict_name = "FuzzyStringMatcher::#{name}" if n < 2 || n > 10 raise ArgumentError, 'n must be between 2 and 10' end @case_sensitive = case_sensitive @n = n clear unless (@dict = @store[@dict_name]) end # Wipe the dictionary. def clear @store[@dict_name] = @dict = @store.new(BigHash) end # Add a string with its reference to the dictionary. # @param string [String] The string to store # @param reference [Object] Any object that is associated with the string def learn(string, reference = string) reference = string if reference.nil? unless @case_sensitive string = string.downcase end # Enclose string in 'start of text' and 'end of text' ASCII values. string = "\002" + string + "\003" each_n_gramm(string) do |n_gramm| unless (ng_list = @dict[n_gramm]) @dict[n_gramm] = ng_list = @store.new(Hash) end if ng_list.include?(reference) ng_list[reference] += 1 else ng_list[reference] = 0 end end nil end # Find the references who's string best matches the given string. # @param string [String] string to search for # @param min_score [Float] Value 0.01 and 1.0 that specifies how strict # the matching should be done. The larger the value the more closer # the given string needs to be. # @param max_count [Integer] The maximum number of matches that should be # returned. # @return [Array] The result is an Array of Arrays. The nested Arrays only # have 2 entries. The reference and a Float value between 0 and # 1.0 that describes how good the match is. The matches are sorted # in descending order by the match score. def best_matches(string, min_score = 0.5, max_count = 100) unless @case_sensitive string = string.downcase end # Enclose string in 'start of text' and 'end of text' ASCII values. string = "\002" + string + "\003" matches = {} # This will be the best possible score for a perfect match. best_possible_score = 0 each_n_gramm(string) do |n_gramm| best_possible_score += 1 if (ng_list = @dict[n_gramm]) ng_list.each do |reference, count| if matches.include?(reference) matches[reference] += 1 else # We use internally a 10 times larger list so that we don't # throw away good matches too early. If the max_count value is # chosen too small there is a risk of not finding the best # matches! if matches.size > 10 * max_count matches = discard_worst_match(matches) end matches[reference] = 1 end end end end return [] if matches.empty? # Sort in the order of occurance count downwards. match_list = matches.to_a.sort do |a, b| b[1] <=> a[1] end # Set occurance counters to scores relative to the best possible score. match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] } # Delete all matches that occured less than half as often than the # top match. match_list.delete_if { |a| a[1] < min_score } match_list[0..max_count] end # Returns some internal stats about the dictionary. def stats s = {} s['dictionary_size'] = @dict.size max = total = 0 @dict.each do |n_gramm, ng_list| size = ng_list.length max = size if size > max total += size end s['max_list_size'] = max s['avg_list_size'] = total > 0 ? total.to_f / s['dictionary_size'] : 0 s end private def each_n_gramm(string, &block) return if string.length < @n 0.upto(string.length - @n) do |i| n_gramm = string[i, @n] yield(n_gramm) end end def discard_worst_match(matches) # Sort in the order of occurance count downwards. match_list = matches.to_a.sort do |a, b| b[1] <=> a[1] end # Discard the lowest half of the matches match_list = match_list[0..match_list.length / 2] match_list.to_h end end end