lib/levenshtein.rb in levenshtein-0.1.1 vs lib/levenshtein.rb in levenshtein-0.2.0
- old
+ new
@@ -1,32 +1,36 @@
begin
- require "levenshtein/levenshtein_c"
+ require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
rescue LoadError
begin
- require "levenshtein_c"
+ require "levenshtein_fast" # If compiled by the build script.
rescue LoadError
- $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead."
+ $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
end
end
-# The Levenshtein distance is a metric for measuring the amount of difference
-# between two sequences (i.e., the so called edit distance). The Levenshtein
-# distance between two strings is given by the minimum number of operations
-# needed to transform one string into the other, where an operation is an
-# insertion, deletion, or substitution of a single character.
+# The Levenshtein distance is a metric for measuring the amount
+# of difference between two sequences (i.e., the so called edit
+# distance). The Levenshtein distance between two sequences is
+# given by the minimum number of operations needed to transform
+# one sequence into the other, where an operation is an
+# insertion, deletion, or substitution of a single element.
#
# More information about the Levenshtein distance algorithm:
# http://en.wikipedia.org/wiki/Levenshtein_distance .
module Levenshtein
- # Returns the Levenshtein distance as a number between 0.0 and 1.0.
- # It's basically the Levenshtein distance divided by the length of the longest string.
+ VERSION = "0.2.0"
+ # Returns the Levenshtein distance as a number between 0.0 and
+ # 1.0. It's basically the Levenshtein distance divided by the
+ # length of the longest sequence.
+
def self.normalized_distance(s1, s2, threshold=nil)
s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
- if s2.empty?
+ if s2.length == 0
0.0 # Since s1.length < s2.length, s1 must be empty as well.
else
if threshold
if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
d.to_f/s2.length
@@ -37,61 +41,66 @@
self.distance(s1, s2).to_f/s2.length
end
end
end
- # Returns the Levenshtein distance between two byte strings.
+ # Returns the Levenshtein distance between two sequences.
+ #
+ # The two sequences can be two strings, two arrays, or two other
+ # objects. Strings, arrays and arrays of strings are handled with
+ # optimized (very fast) C code. All other sequences are handled
+ # with generic (fast) C code.
+ #
+ # The sequences should respond to :length and :[] and all objects
+ # in the sequences (as returned by []) should response to :==.
def self.distance(s1, s2, threshold=nil)
s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
# Handle some basic circumstances.
- return 0.0 if s1 == s2
- return s2.length if s1.empty?
- return nil if threshold and (s2.length-s1.length) >= threshold
- return nil if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
- return nil if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold
+ return 0 if s1 == s2
+ return s2.length if s1.length == 0
- # Do the expensive calculation on a subset of the strings only, if possible.
+ if threshold
+ return nil if (s2.length-s1.length) >= threshold
- b = 0
- e1 = s1.length-1
- e2 = s2.length-1
+ a1, a2 = nil, nil
+ a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
+ a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
- while s1[b, 1] == s2[b, 1]
- b += 1
+ if a1 and a2
+ return nil if (a1-a2).length >= threshold
+ return nil if (a2-a1).length >= threshold
+ end
end
- while s1[e1, 1] == s2[e2, 1] and e1 > b and e2 > b
- e1 -= 1
- e2 -= 1
- end
-
- distance_part2(s1[b..e1], s2[b..e2], threshold)
+ distance_fast_or_slow(s1, s2, threshold)
end
- def self.distance_part2(s1, s2, threshold) # :nodoc:
- if respond_to?(:distance_part2_fast)
- distance_part2_fast(s1, s2, threshold) # Implemented in C.
+ def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
+ if respond_to?(:levenshtein_distance_fast)
+ levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
else
- distance_part2_slow(s1, s2, threshold) # Implemented in Ruby.
+ levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
end
end
- def self.distance_part2_slow(s1, s2, threshold) # :nodoc:
+ def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
row = (0..s1.length).to_a
1.upto(s2.length) do |y|
prow = row
row = [y]
1.upto(s1.length) do |x|
row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
end
- # Stop analysing this string as soon as the best possible result for this string is bigger than the best result so far.
- # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)
+ # Stop analysing this sequence as soon as the best possible
+ # result for this sequence is bigger than the best result so far.
+ # (The minimum value in the next row will be equal to or greater
+ # than the minimum value in this row.)
return nil if threshold and row.min >= threshold
end
row[-1]