lib/perobs/FuzzyStringMatcher.rb in perobs-4.2.0 vs lib/perobs/FuzzyStringMatcher.rb in perobs-4.3.0

- old
+ new

@@ -24,44 +24,46 @@ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. require 'perobs/Log' -require 'perobs/ObjectBase' +require 'perobs/Object' module PEROBS # The fuzzy string matcher can be used to perform a fuzzy string search # against a known set of strings. The dictionary of known strings does not - # store the actual strings but references to arbitrary objects. These could - # be the string, but can be something else related to the learned strings. - # To use this class a list of strings with their references must be learned. - # Once the dictionary has been established, fuzzy matches can be done. - class FuzzyStringMatcher + # store the actual strings but references to String or PEROBS objects. + # Once the dictionary has been established, fuzzy matches can be done. Since + # the actual input strings are not directly stored, you cannot remove or + # modified already stored strings. To remove strings, you have to clear the + # matcher and add the strings again that you want to keep. + class FuzzyStringMatcher < PEROBS::Object + attr_persist :case_sensitive, :n, :dict + # Create a new FuzzyStringMatcher. - # @param store [PEROBS::Store] place to store the dictionary - # @param name [String] Unique name of the string matcher + # @param p [PEROBS::Store] place to store the dictionary # @param case_sensitive [Boolean] True if case matters for matching # @param n [Integer] Determines what kind of n-gramm is used to store the # references in the dictionary. It also determines the minimum word - # length that can be used for fuzzy matches. - def initialize(store, name, case_sensitive = false, n = 4) - @store = store - @dict_name = "FuzzyStringMatcher::#{name}" + # length that can be used for fuzzy matches. Values between 2 and + # 10 are supported. The default is 4. + def initialize(p, case_sensitive = false, n = 4) + super(p) if n < 2 || n > 10 raise ArgumentError, 'n must be between 2 and 10' end - @case_sensitive = case_sensitive - @n = n + self.case_sensitive = case_sensitive + self.n = n - clear unless (@dict = @store[@dict_name]) + clear unless @dict end # Wipe the dictionary. def clear - @store[@dict_name] = @dict = @store.new(BigHash) + self.dict = @store.new(BigHash) end # Add a string with its reference to the dictionary. # @param string [String] The string to store # @param reference [Object] Any object that is associated with the string @@ -77,15 +79,12 @@ each_n_gramm(string) do |n_gramm| unless (ng_list = @dict[n_gramm]) @dict[n_gramm] = ng_list = @store.new(Hash) end - if ng_list.include?(reference) - ng_list[reference] += 1 - else - ng_list[reference] = 0 - end + # We use the Hash as a Set. The value doesn't matter. + ng_list[reference] = true unless ng_list.include?(reference) end nil end @@ -107,47 +106,41 @@ # Enclose string in 'start of text' and 'end of text' ASCII values. string = "\002" + string + "\003" matches = {} - # This will be the best possible score for a perfect match. - best_possible_score = 0 each_n_gramm(string) do |n_gramm| - best_possible_score += 1 if (ng_list = @dict[n_gramm]) - ng_list.each do |reference, count| + ng_list.each do |reference, dummy| if matches.include?(reference) matches[reference] += 1 else - # We use internally a 10 times larger list so that we don't - # throw away good matches too early. If the max_count value is - # chosen too small there is a risk of not finding the best - # matches! - if matches.size > 10 * max_count - matches = discard_worst_match(matches) - end matches[reference] = 1 end end end end return [] if matches.empty? - # Sort in the order of occurance count downwards. - match_list = matches.to_a.sort do |a, b| - b[1] <=> a[1] - end + match_list = matches.to_a # Set occurance counters to scores relative to the best possible score. + # This will be the best possible score for a perfect match. + best_possible_score = string.length - @n + 1 match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] } - # Delete all matches that occured less than half as often than the - # top match. + # Delete all matches that don't have the required minimum match score. match_list.delete_if { |a| a[1] < min_score } - match_list[0..max_count] + # Sort the list best to worst match + match_list.sort! do |a, b| + b[1] <=> a[1] + end + + # Return the top max_count matches. + match_list[0..max_count - 1] end # Returns some internal stats about the dictionary. def stats s = {} @@ -172,19 +165,9 @@ 0.upto(string.length - @n) do |i| n_gramm = string[i, @n] yield(n_gramm) end - end - - def discard_worst_match(matches) - # Sort in the order of occurance count downwards. - match_list = matches.to_a.sort do |a, b| - b[1] <=> a[1] - end - # Discard the lowest half of the matches - match_list = match_list[0..match_list.length / 2] - match_list.to_h end end end