levenshtein.rb in text-1.3.0

- old
+ new
@@ -14,32 +14,110 @@
 module Text # :nodoc:
 module Levenshtein
 
   # Calculate the Levenshtein distance between two strings +str1+ and +str2+.
   #
+  # The optional argument max_distance can reduce the number of iterations by
+  # stopping if the Levenshtein distance exceeds this value. This increases
+  # performance where it is only necessary to compare the distance with a
+  # reference value instead of calculating the exact distance.
   #
-  # In Ruby 1.8, +str1+ and +str2+ should be ASCII, UTF-8, or a one-byte-per
-  # character encoding such as ISO-8859-*. They will be treated as UTF-8 if
-  # $KCODE is set appropriately (i.e. 'u').  Otherwise, the comparison will be
-  # performed byte-by-byte. There is no specific support for Shift-JIS or EUC
-  # strings.
+  # The distance is calculated in terms of Unicode codepoints. Be aware that
+  # this algorithm does not perform normalisation: if there is a possibility
+  # of different normalised forms being used, normalisation should be performed
+  # beforehand.
   #
-  # In Ruby 1.9+, the strings will be processed as UTF-8.
-  #
-  # When using Unicode text, be aware that this algorithm does not perform
-  # normalisation.  If there is a possibility of different normalised forms
-  # being used, normalisation should be performed beforehand.
-  #
-  def distance(str1, str2)
-    prepare =
-      if "ruby".respond_to?(:encoding)
-        lambda { |str| str.encode(Encoding::UTF_8).unpack("U*") }
+  def distance(str1, str2, max_distance = nil)
+    if max_distance
+      distance_with_maximum(str1, str2, max_distance)
+    else
+      distance_without_maximum(str1, str2)
+    end
+  end
+
+private
+  def distance_with_maximum(str1, str2, max_distance) # :nodoc:
+    s, t = [str1, str2].sort_by(&:length).
+                        map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
+    n = s.length
+    m = t.length
+    big_int = n * m
+    return m if n.zero?
+    return n if m.zero?
+    return 0 if s == t
+
+    # If the length difference is already greater than the max_distance, then
+    # there is nothing else to check
+    if (n - m).abs >= max_distance
+      return max_distance
+    end
+
+    # The values necessary for our threshold are written; the ones after must
+    # be filled with large integers since the tailing member of the threshold
+    # window in the bottom array will run min across them
+    d = (m + 1).times.map { |i|
+      if i < m || i < max_distance + 1
+        i
       else
-        rule = $KCODE.match(/^U/i) ? "U*" : "C*"
-        lambda { |str| str.unpack(rule) }
+        big_int
       end
+    }
+    x = nil
+    e = nil
 
-    s, t = [str1, str2].map(&prepare)
+    n.times do |i|
+      # Since we're reusing arrays, we need to be sure to wipe the value left
+      # of the starting index; we don't have to worry about the value above the
+      # ending index as the arrays were initially filled with large integers
+      # and we progress to the right
+      if e.nil?
+        e = i + 1
+      else
+        e = big_int
+      end
+
+      diag_index = t.length - s.length + i
+
+      # If max_distance was specified, we can reduce second loop. So we set
+      # up our threshold window.
+      # See:
+      # Gusfield, Dan (1997). Algorithms on strings, trees, and sequences:
+      # computer science and computational biology.
+      # Cambridge, UK: Cambridge University Press. ISBN 0-521-58519-8.
+      # pp. 263–264.
+      min = [0, i - max_distance - 1].max
+      max = [m - 1, i + max_distance].min
+
+      (min .. max).each do |j|
+        # If the diagonal value is already greater than the max_distance
+        # then we can safety return: the diagonal will never go lower again.
+        # See: http://www.levenshtein.net/
+        if j == diag_index && d[j] >= max_distance
+          return max_distance
+        end
+
+        cost = s[i] == t[j] ? 0 : 1
+        x = [
+          d[j+1] + 1, # insertion
+          e + 1,      # deletion
+          d[j] + cost # substitution
+        ].min
+
+        d[j] = e
+        e = x
+      end
+      d[m] = x
+    end
+
+    if x > max_distance
+      return max_distance
+    else
+      return x
+    end
+  end
+
+  def distance_without_maximum(str1, str2) # :nodoc:
+    s, t = [str1, str2].map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
     n = s.length
     m = t.length
     return m if n.zero?
     return n if m.zero?