lib/levenshtein.rb in levenshtein-0.2.0 vs lib/levenshtein.rb in levenshtein-0.2.1
- old
+ new
@@ -1,46 +1,27 @@
-begin
- require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
-rescue LoadError
- begin
- require "levenshtein_fast" # If compiled by the build script.
- rescue LoadError
- $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
- end
-end
+require "levenshtein/exception"
+require "levenshtein/version"
-# The Levenshtein distance is a metric for measuring the amount
-# of difference between two sequences (i.e., the so called edit
-# distance). The Levenshtein distance between two sequences is
-# given by the minimum number of operations needed to transform
-# one sequence into the other, where an operation is an
-# insertion, deletion, or substitution of a single element.
-#
-# More information about the Levenshtein distance algorithm:
-# http://en.wikipedia.org/wiki/Levenshtein_distance .
-
module Levenshtein
- VERSION = "0.2.0"
-
# Returns the Levenshtein distance as a number between 0.0 and
# 1.0. It's basically the Levenshtein distance divided by the
# length of the longest sequence.
- def self.normalized_distance(s1, s2, threshold=nil)
- s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
+ def self.normalized_distance(a1, a2, threshold=nil)
+ a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
- if s2.length == 0
- 0.0 # Since s1.length < s2.length, s1 must be empty as well.
+ if a2.length == 0
+ 0.0 # Since a1.length < a2.length, a1 must be empty as well.
else
if threshold
- if d = self.distance(s1, s2, (threshold*s2.length+1).to_i)
- d.to_f/s2.length
+ if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
+ d.to_f/a2.length
else
nil
end
else
- self.distance(s1, s2).to_f/s2.length
+ self.distance(a1, a2).to_f/a2.length
end
end
end
# Returns the Levenshtein distance between two sequences.
@@ -51,59 +32,86 @@
# with generic (fast) C code.
#
# The sequences should respond to :length and :[] and all objects
# in the sequences (as returned by []) should response to :==.
- def self.distance(s1, s2, threshold=nil)
- s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
+ def self.distance(a1, a2, threshold=nil)
+ a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
# Handle some basic circumstances.
- return 0 if s1 == s2
- return s2.length if s1.length == 0
+ return 0 if a1 == a2
+ return a2.length if a1.length == 0
if threshold
- return nil if (s2.length-s1.length) >= threshold
+ return nil if (a2.length-a1.length) >= threshold
- a1, a2 = nil, nil
- a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
- a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
+ a3, a4 = nil, nil
+ a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-)
+ a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
- if a1 and a2
- return nil if (a1-a2).length >= threshold
- return nil if (a2-a1).length >= threshold
+ if a3 and a4
+ return nil if (a3-a4).length >= threshold
+ return nil if (a4-a3).length >= threshold
end
end
- distance_fast_or_slow(s1, s2, threshold)
+ distance_fast_or_slow(a1, a2, threshold)
end
- def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
- if respond_to?(:levenshtein_distance_fast)
- levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
+ def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
+ if respond_to?(:distance_fast)
+ distance_fast(a1, a2, threshold) # Implemented in C.
else
- levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
+ distance_slow(a1, a2, threshold) # Implemented in Ruby.
end
end
- def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
- row = (0..s1.length).to_a
+ def self.distance_slow(a1, a2, threshold) # :nodoc:
+ l1 = a1.length
+ l2 = a2.length
- 1.upto(s2.length) do |y|
- prow = row
- row = [y]
+ offset = 0
+
+ while offset < l1 and offset < l2 and a1[offset] == a2[offset]
+ offset += 1
+ end
+
+ while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
+ l1 -= 1
+ l2 -= 1
+ end
+
+ l1 -= offset
+ l2 -= offset
- 1.upto(s1.length) do |x|
- row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
+ crow = (0..l1).to_a
+
+ 1.upto(l2) do |y|
+ prow = crow
+ crow = [y]
+
+ 1.upto(l1) do |x|
+ crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
end
# Stop analysing this sequence as soon as the best possible
# result for this sequence is bigger than the best result so far.
# (The minimum value in the next row will be equal to or greater
# than the minimum value in this row.)
- return nil if threshold and row.min >= threshold
+ return nil if threshold and crow.min >= threshold
end
- row[-1]
+ crow[-1]
+ end
+end
+
+begin
+ require "levenshtein/levenshtein_fast" # Compiled by RubyGems.
+rescue LoadError
+ begin
+ require "levenshtein_fast" # Compiled by the build script.
+ rescue LoadError
+ $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
end
end