require "levenshtein/exception" require "levenshtein/version" module Levenshtein # Returns the Levenshtein distance as a number between 0.0 and # 1.0. It's basically the Levenshtein distance divided by the # length of the longest sequence. def self.normalized_distance(a1, a2, threshold=nil) a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one. if a2.length == 0 0.0 # Since a1.length < a2.length, a1 must be empty as well. else if threshold if d = self.distance(a1, a2, (threshold*a2.length+1).to_i) d.to_f/a2.length else nil end else self.distance(a1, a2).to_f/a2.length end end end # Returns the Levenshtein distance between two sequences. # # The two sequences can be two strings, two arrays, or two other # objects. Strings, arrays and arrays of strings are handled with # optimized (very fast) C code. All other sequences are handled # with generic (fast) C code. # # The sequences should respond to :length and :[] and all objects # in the sequences (as returned by []) should response to :==. def self.distance(a1, a2, threshold=nil) a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one. # Handle some basic circumstances. return 0 if a1 == a2 return a2.length if a1.length == 0 if threshold return nil if (a2.length-a1.length) >= threshold a3, a4 = nil, nil a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-) a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan) if a3 and a4 return nil if (a3-a4).length >= threshold return nil if (a4-a3).length >= threshold end end distance_fast_or_slow(a1, a2, threshold) end def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc: if respond_to?(:distance_fast) distance_fast(a1, a2, threshold) # Implemented in C. else distance_slow(a1, a2, threshold) # Implemented in Ruby. end end def self.distance_slow(a1, a2, threshold) # :nodoc: l1 = a1.length l2 = a2.length offset = 0 while offset < l1 and offset < l2 and a1[offset] == a2[offset] offset += 1 end while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1] l1 -= 1 l2 -= 1 end l1 -= offset l2 -= offset crow = (0..l1).to_a 1.upto(l2) do |y| prow = crow crow = [y] 1.upto(l1) do |x| crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min end # Stop analysing this sequence as soon as the best possible # result for this sequence is bigger than the best result so far. # (The minimum value in the next row will be equal to or greater # than the minimum value in this row.) return nil if threshold and crow.min >= threshold end crow[-1] end end begin require "levenshtein/levenshtein_fast" # Compiled by RubyGems. rescue LoadError begin require "levenshtein_fast" # Compiled by the build script. rescue LoadError $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead." end end