levenshtein.rb in levenshtein-0.2.0

- old
+ new

@@ -1,32 +1,36 @@
 begin
-  require "levenshtein/levenshtein_c"
+  require "levenshtein/levenshtein_fast"	# If compiled by RubyGems.
 rescue LoadError
   begin
-    require "levenshtein_c"
+    require "levenshtein_fast"			# If compiled by the build script.
   rescue LoadError
-    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead."
+    $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
   end
 end
 
-# The Levenshtein distance is a metric for measuring the amount of difference
-# between two sequences (i.e., the so called edit distance). The Levenshtein
-# distance between two strings is given by the minimum number of operations
-# needed to transform one string into the other, where an operation is an
-# insertion, deletion, or substitution of a single character.
+# The Levenshtein distance is a metric for measuring the amount
+# of difference between two sequences (i.e., the so called edit
+# distance). The Levenshtein distance between two sequences is
+# given by the minimum number of operations needed to transform
+# one sequence into the other, where an operation is an
+# insertion, deletion, or substitution of a single element.
 #
 # More information about the Levenshtein distance algorithm:
 # http://en.wikipedia.org/wiki/Levenshtein_distance .
 
 module Levenshtein
-  # Returns the Levenshtein distance as a number between 0.0 and 1.0.
-  # It's basically the Levenshtein distance divided by the length of the longest string.
+  VERSION	= "0.2.0"
 
+  # Returns the Levenshtein distance as a number between 0.0 and
+  # 1.0. It's basically the Levenshtein distance divided by the
+  # length of the longest sequence.
+
   def self.normalized_distance(s1, s2, threshold=nil)
     s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
 
-    if s2.empty?
+    if s2.length == 0
       0.0	# Since s1.length < s2.length, s1 must be empty as well.
     else
       if threshold
         if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
           d.to_f/s2.length
@@ -37,61 +41,66 @@
         self.distance(s1, s2).to_f/s2.length
       end
     end
   end
 
-  # Returns the Levenshtein distance between two byte strings.
+  # Returns the Levenshtein distance between two sequences.
+  #
+  # The two sequences can be two strings, two arrays, or two other
+  # objects. Strings, arrays and arrays of strings are handled with
+  # optimized (very fast) C code. All other sequences are handled
+  # with generic (fast) C code.
+  #
+  # The sequences should respond to :length and :[] and all objects
+  # in the sequences (as returned by []) should response to :==.
 
   def self.distance(s1, s2, threshold=nil)
     s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.
 
     # Handle some basic circumstances.
 
-    return 0.0		if s1 == s2
-    return s2.length	if s1.empty?
-    return nil		if threshold and (s2.length-s1.length) >= threshold
-    return nil		if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
-    return nil		if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold
+    return 0		if s1 == s2
+    return s2.length	if s1.length == 0
 
-    # Do the expensive calculation on a subset of the strings only, if possible.
+    if threshold
+      return nil	if (s2.length-s1.length) >= threshold
 
-    b	= 0
-    e1	= s1.length-1
-    e2	= s2.length-1
+      a1, a2	= nil, nil
+      a1, a2	= s1, s2			if s1.respond_to?(:-) and s2.respond_to?(:-)
+      a1, a2	= s1.scan(/./), s2.scan(/./)	if s1.respond_to?(:scan) and s2.respond_to?(:scan)
 
-    while s1[b, 1] == s2[b, 1]
-      b += 1
+      if a1 and a2
+        return nil	if (a1-a2).length >= threshold
+        return nil	if (a2-a1).length >= threshold
+      end
     end
 
-    while s1[e1, 1] == s2[e2, 1] and e1 > b and e2 > b
-      e1 -= 1
-      e2 -= 1
-    end
-
-    distance_part2(s1[b..e1], s2[b..e2], threshold)
+    distance_fast_or_slow(s1, s2, threshold)
   end
 
-  def self.distance_part2(s1, s2, threshold)	# :nodoc:
-    if respond_to?(:distance_part2_fast)
-      distance_part2_fast(s1, s2, threshold)	# Implemented in C.
+  def self.distance_fast_or_slow(s1, s2, threshold)	# :nodoc:
+    if respond_to?(:levenshtein_distance_fast)
+      levenshtein_distance_fast(s1, s2, threshold)	# Implemented in C.
     else
-      distance_part2_slow(s1, s2, threshold)	# Implemented in Ruby.
+      levenshtein_distance_slow(s1, s2, threshold)	# Implemented in Ruby.
     end
   end
 
-  def self.distance_part2_slow(s1, s2, threshold)	# :nodoc:
+  def self.levenshtein_distance_slow(s1, s2, threshold)	# :nodoc:
     row	= (0..s1.length).to_a
 
     1.upto(s2.length) do |y|
       prow	= row
       row	= [y]
 
       1.upto(s1.length) do |x|
         row[x]	= [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
       end
 
-      # Stop analysing this string as soon as the best possible result for this string is bigger than the best result so far.
-      # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)
+      # Stop analysing this sequence as soon as the best possible
+      # result for this sequence is bigger than the best result so far.
+      # (The minimum value in the next row will be equal to or greater
+      # than the minimum value in this row.)
 
       return nil	if threshold and row.min >= threshold
     end
 
     row[-1]