FuzzyStringMatcher.rb in perobs-4.3.0

- old
+ new

@@ -24,44 +24,46 @@
 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 require 'perobs/Log'
-require 'perobs/ObjectBase'
+require 'perobs/Object'
 
 module PEROBS
 
   # The fuzzy string matcher can be used to perform a fuzzy string search
   # against a known set of strings. The dictionary of known strings does not
-  # store the actual strings but references to arbitrary objects. These could
-  # be the string, but can be something else related to the learned strings.
-  # To use this class a list of strings with their references must be learned.
-  # Once the dictionary has been established, fuzzy matches can be done.
-  class FuzzyStringMatcher
+  # store the actual strings but references to String or PEROBS objects.
+  # Once the dictionary has been established, fuzzy matches can be done. Since
+  # the actual input strings are not directly stored, you cannot remove or
+  # modified already stored strings. To remove strings, you have to clear the
+  # matcher and add the strings again that you want to keep.
+  class FuzzyStringMatcher < PEROBS::Object
 
+    attr_persist :case_sensitive, :n, :dict
+
     # Create a new FuzzyStringMatcher.
-    # @param store [PEROBS::Store] place to store the dictionary
-    # @param name [String] Unique name of the string matcher
+    # @param p [PEROBS::Store] place to store the dictionary
     # @param case_sensitive [Boolean] True if case matters for matching
     # @param n [Integer] Determines what kind of n-gramm is used to store the
     #        references in the dictionary. It also determines the minimum word
-    #        length that can be used for fuzzy matches.
-    def initialize(store, name, case_sensitive = false, n = 4)
-      @store = store
-      @dict_name = "FuzzyStringMatcher::#{name}"
+    #        length that can be used for fuzzy matches. Values between 2 and
+    #        10 are supported. The default is 4.
+    def initialize(p, case_sensitive = false, n = 4)
+      super(p)
       if n < 2 || n > 10
         raise ArgumentError, 'n must be between 2 and 10'
       end
-      @case_sensitive = case_sensitive
-      @n = n
+      self.case_sensitive = case_sensitive
+      self.n = n
 
-      clear unless (@dict = @store[@dict_name])
+      clear unless @dict
     end
 
     # Wipe the dictionary.
     def clear
-      @store[@dict_name] = @dict = @store.new(BigHash)
+      self.dict = @store.new(BigHash)
     end
 
     # Add a string with its reference to the dictionary.
     # @param string [String] The string to store
     # @param reference [Object] Any object that is associated with the string
@@ -77,15 +79,12 @@
       each_n_gramm(string) do |n_gramm|
         unless (ng_list = @dict[n_gramm])
           @dict[n_gramm] = ng_list = @store.new(Hash)
         end
 
-        if ng_list.include?(reference)
-          ng_list[reference] += 1
-        else
-          ng_list[reference] = 0
-        end
+        # We use the Hash as a Set. The value doesn't matter.
+        ng_list[reference] = true unless ng_list.include?(reference)
       end
 
       nil
     end
 
@@ -107,47 +106,41 @@
       # Enclose string in 'start of text' and 'end of text' ASCII values.
       string = "\002" + string + "\003"
 
       matches = {}
 
-      # This will be the best possible score for a perfect match.
-      best_possible_score = 0
       each_n_gramm(string) do |n_gramm|
-        best_possible_score += 1
         if (ng_list = @dict[n_gramm])
-          ng_list.each do |reference, count|
+          ng_list.each do |reference, dummy|
             if matches.include?(reference)
               matches[reference] += 1
             else
-              # We use internally a 10 times larger list so that we don't
-              # throw away good matches too early. If the max_count value is
-              # chosen too small there is a risk of not finding the best
-              # matches!
-              if matches.size > 10 * max_count
-                matches = discard_worst_match(matches)
-              end
               matches[reference] = 1
             end
           end
         end
       end
 
       return [] if matches.empty?
 
-      # Sort in the order of occurance count downwards.
-      match_list = matches.to_a.sort do |a, b|
-        b[1] <=> a[1]
-      end
+      match_list = matches.to_a
 
       # Set occurance counters to scores relative to the best possible score.
+      # This will be the best possible score for a perfect match.
+      best_possible_score = string.length - @n + 1
       match_list.map! { |a, b| [ a, b.to_f / best_possible_score ] }
 
-      # Delete all matches that occured less than half as often than the
-      # top match.
+      # Delete all matches that don't have the required minimum match score.
       match_list.delete_if { |a| a[1] < min_score }
 
-      match_list[0..max_count]
+      # Sort the list best to worst match
+      match_list.sort! do |a, b|
+        b[1] <=> a[1]
+      end
+
+      # Return the top max_count matches.
+      match_list[0..max_count - 1]
     end
 
     # Returns some internal stats about the dictionary.
     def stats
       s = {}
@@ -172,19 +165,9 @@
       0.upto(string.length - @n) do |i|
         n_gramm = string[i, @n]
 
         yield(n_gramm)
       end
-    end
-
-    def discard_worst_match(matches)
-      # Sort in the order of occurance count downwards.
-      match_list = matches.to_a.sort do |a, b|
-        b[1] <=> a[1]
-      end
-      # Discard the lowest half of the matches
-      match_list = match_list[0..match_list.length / 2]
-      match_list.to_h
     end
 
   end
 
 end