begin require "levenshtein/levenshtein_c" rescue LoadError begin require "levenshtein_c" rescue LoadError $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead." end end # The Levenshtein distance is a metric for measuring the amount of difference # between two sequences (i.e., the so called edit distance). The Levenshtein # distance between two strings is given by the minimum number of operations # needed to transform one string into the other, where an operation is an # insertion, deletion, or substitution of a single character. # # More information about the Levenshtein distance algorithm: # http://en.wikipedia.org/wiki/Levenshtein_distance . module Levenshtein # Returns the Levenshtein distance as a number bestween 0.0 and 1.0. # It's basically the Levenshtein distance divided by the length of the longest string. def self.normalized_distance(s1, s2, threshold=nil) s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one. if s2.empty? 0.0 # Since s1.length < s2.length, s1 must be empty as well. else if threshold if d = self.distance(s1, s2, (threshold*s2.length+1).to_i) d.to_f/s2.length else nil end else self.distance(s1, s2).to_f/s2.length end end end # Returns the Levenshtein distance between two byte strings. def self.distance(s1, s2, threshold=nil) s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one. # Handle some basic circumstances. return 0.0 if s1 == s2 return s2.length if s1.empty? return nil if threshold and (s2.length-s1.length) >= threshold return nil if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold return nil if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold # Do the expensive calculation on a subset of the strings only, if possible. b = 0 e1 = s1.length-1 e2 = s2.length-1 while s1[b, 1] == s2[b, 1] b += 1 end while s1[e1, 1] == s2[e2, 1] e1 -= 1 e2 -= 1 end distance_part2(s1[b..e1], s2[b..e2], threshold) end def self.distance_part2(s1, s2, threshold) # :nodoc: if respond_to?(:distance_part2_fast) distance_part2_fast(s1, s2, threshold) # Implemented in C. else distance_part2_slow(s1, s2, threshold) # Implemented in Ruby. end end def self.distance_part2_slow(s1, s2, threshold) # :nodoc: row = (0..s1.length).to_a 1.upto(s2.length) do |y| prow = row row = [y] 1.upto(s1.length) do |x| row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min end # Stop analysing this string as soon as the best possible result for this string is bigger than the best result so far. # (The minimum value in the next row will be equal to or greater than the minimum value in this row.) return nil if threshold and row.min >= threshold end row[-1] end end