lib/levenshtein.rb in levenshtein-0.2.0 vs lib/levenshtein.rb in levenshtein-0.2.1

- old
+ new

@@ -1,46 +1,27 @@ -begin - require "levenshtein/levenshtein_fast" # If compiled by RubyGems. -rescue LoadError - begin - require "levenshtein_fast" # If compiled by the build script. - rescue LoadError - $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead." - end -end +require "levenshtein/exception" +require "levenshtein/version" -# The Levenshtein distance is a metric for measuring the amount -# of difference between two sequences (i.e., the so called edit -# distance). The Levenshtein distance between two sequences is -# given by the minimum number of operations needed to transform -# one sequence into the other, where an operation is an -# insertion, deletion, or substitution of a single element. -# -# More information about the Levenshtein distance algorithm: -# http://en.wikipedia.org/wiki/Levenshtein_distance . - module Levenshtein - VERSION = "0.2.0" - # Returns the Levenshtein distance as a number between 0.0 and # 1.0. It's basically the Levenshtein distance divided by the # length of the longest sequence. - def self.normalized_distance(s1, s2, threshold=nil) - s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one. + def self.normalized_distance(a1, a2, threshold=nil) + a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one. - if s2.length == 0 - 0.0 # Since s1.length < s2.length, s1 must be empty as well. + if a2.length == 0 + 0.0 # Since a1.length < a2.length, a1 must be empty as well. else if threshold - if d = self.distance(s1, s2, (threshold*s2.length+1).to_i) - d.to_f/s2.length + if d = self.distance(a1, a2, (threshold*a2.length+1).to_i) + d.to_f/a2.length else nil end else - self.distance(s1, s2).to_f/s2.length + self.distance(a1, a2).to_f/a2.length end end end # Returns the Levenshtein distance between two sequences. @@ -51,59 +32,86 @@ # with generic (fast) C code. # # The sequences should respond to :length and :[] and all objects # in the sequences (as returned by []) should response to :==. - def self.distance(s1, s2, threshold=nil) - s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one. + def self.distance(a1, a2, threshold=nil) + a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one. # Handle some basic circumstances. - return 0 if s1 == s2 - return s2.length if s1.length == 0 + return 0 if a1 == a2 + return a2.length if a1.length == 0 if threshold - return nil if (s2.length-s1.length) >= threshold + return nil if (a2.length-a1.length) >= threshold - a1, a2 = nil, nil - a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-) - a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan) + a3, a4 = nil, nil + a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-) + a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan) - if a1 and a2 - return nil if (a1-a2).length >= threshold - return nil if (a2-a1).length >= threshold + if a3 and a4 + return nil if (a3-a4).length >= threshold + return nil if (a4-a3).length >= threshold end end - distance_fast_or_slow(s1, s2, threshold) + distance_fast_or_slow(a1, a2, threshold) end - def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc: - if respond_to?(:levenshtein_distance_fast) - levenshtein_distance_fast(s1, s2, threshold) # Implemented in C. + def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc: + if respond_to?(:distance_fast) + distance_fast(a1, a2, threshold) # Implemented in C. else - levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby. + distance_slow(a1, a2, threshold) # Implemented in Ruby. end end - def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc: - row = (0..s1.length).to_a + def self.distance_slow(a1, a2, threshold) # :nodoc: + l1 = a1.length + l2 = a2.length - 1.upto(s2.length) do |y| - prow = row - row = [y] + offset = 0 + + while offset < l1 and offset < l2 and a1[offset] == a2[offset] + offset += 1 + end + + while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1] + l1 -= 1 + l2 -= 1 + end + + l1 -= offset + l2 -= offset - 1.upto(s1.length) do |x| - row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min + crow = (0..l1).to_a + + 1.upto(l2) do |y| + prow = crow + crow = [y] + + 1.upto(l1) do |x| + crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min end # Stop analysing this sequence as soon as the best possible # result for this sequence is bigger than the best result so far. # (The minimum value in the next row will be equal to or greater # than the minimum value in this row.) - return nil if threshold and row.min >= threshold + return nil if threshold and crow.min >= threshold end - row[-1] + crow[-1] + end +end + +begin + require "levenshtein/levenshtein_fast" # Compiled by RubyGems. +rescue LoadError + begin + require "levenshtein_fast" # Compiled by the build script. + rescue LoadError + $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead." end end