lib/levenshtein.rb in levenshtein-0.2.1 vs lib/levenshtein.rb in levenshtein-0.2.2

- old
+ new

@@ -1,99 +1,112 @@ -require "levenshtein/exception" +# encoding: UTF-8 + require "levenshtein/version" module Levenshtein # Returns the Levenshtein distance as a number between 0.0 and # 1.0. It's basically the Levenshtein distance divided by the - # length of the longest sequence. + # size of the longest sequence. - def self.normalized_distance(a1, a2, threshold=nil) - a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one. + def self.normalized_distance(a1, a2, threshold=nil, options={}) + size = [a1.size, a2.size].max - if a2.length == 0 - 0.0 # Since a1.length < a2.length, a1 must be empty as well. + if a1.size == 0 and a2.size == 0 + 0.0 + elsif a1.size == 0 + a2.size.to_f/size + elsif a2.size == 0 + a1.size.to_f/size else if threshold - if d = self.distance(a1, a2, (threshold*a2.length+1).to_i) - d.to_f/a2.length + if d = self.distance(a1, a2, (threshold*size).to_i+1) + d.to_f/size else nil end else - self.distance(a1, a2).to_f/a2.length + self.distance(a1, a2).to_f/size end end end # Returns the Levenshtein distance between two sequences. # # The two sequences can be two strings, two arrays, or two other - # objects. Strings, arrays and arrays of strings are handled with - # optimized (very fast) C code. All other sequences are handled - # with generic (fast) C code. + # objects responding to :each. All sequences are by generic + # (fast) C code. # - # The sequences should respond to :length and :[] and all objects - # in the sequences (as returned by []) should response to :==. + # All objects in the sequences should respond to :hash and :eql?. - def self.distance(a1, a2, threshold=nil) - a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one. + def self.distance(a1, a2, threshold=nil, options={}) + a1, a2 = a1.scan(/./), a2.scan(/./) if String === a1 and String === a2 + a1, a2 = Util.pool(a1, a2) # Handle some basic circumstances. return 0 if a1 == a2 - return a2.length if a1.length == 0 + return a2.size if a1.empty? + return a1.size if a2.empty? if threshold - return nil if (a2.length-a1.length) >= threshold - - a3, a4 = nil, nil - a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-) - a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan) - - if a3 and a4 - return nil if (a3-a4).length >= threshold - return nil if (a4-a3).length >= threshold - end + return nil if (a1.size-a2.size) >= threshold + return nil if (a2.size-a1.size) >= threshold + return nil if (a1-a2).size >= threshold + return nil if (a2-a1).size >= threshold end - distance_fast_or_slow(a1, a2, threshold) - end + # Remove the common prefix and the common postfix. - def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc: - if respond_to?(:distance_fast) - distance_fast(a1, a2, threshold) # Implemented in C. - else - distance_slow(a1, a2, threshold) # Implemented in Ruby. - end - end + l1 = a1.size + l2 = a2.size - def self.distance_slow(a1, a2, threshold) # :nodoc: - l1 = a1.length - l2 = a2.length - - offset = 0 + offset = 0 + no_more_optimizations = true - while offset < l1 and offset < l2 and a1[offset] == a2[offset] + while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset]) offset += 1 + + no_more_optimizations = false end - while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1] + while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1]) l1 -= 1 l2 -= 1 + + no_more_optimizations = false end - l1 -= offset - l2 -= offset + if no_more_optimizations + distance_fast_or_slow(a1, a2, threshold, options) + else + l1 -= offset + l2 -= offset - crow = (0..l1).to_a + a1 = a1[offset, l1] + a2 = a2[offset, l2] - 1.upto(l2) do |y| + distance(a1, a2, threshold, options) + end + end + + def self.distance_fast_or_slow(a1, a2, threshold, options) # :nodoc: + if respond_to?(:distance_fast) and options[:force_slow] + distance_fast(a1, a2, threshold) # Implemented in C. + else + distance_slow(a1, a2, threshold) # Implemented in Ruby. + end + end + + def self.distance_slow(a1, a2, threshold) # :nodoc: + crow = (0..a1.size).to_a + + 1.upto(a2.size) do |y| prow = crow crow = [y] - 1.upto(l1) do |x| - crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min + 1.upto(a1.size) do |x| + crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min end # Stop analysing this sequence as soon as the best possible # result for this sequence is bigger than the best result so far. # (The minimum value in the next row will be equal to or greater @@ -101,9 +114,27 @@ return nil if threshold and crow.min >= threshold end crow[-1] + end + + module Util # :nodoc: + def self.pool(*args) + # So we can compare pointers instead of objects (equal?() instead of ==()). + + pool = {} + + args.collect do |arg| + a = [] + + arg.each do |o| + a << pool[o] ||= o + end + + a + end + end end end begin require "levenshtein/levenshtein_fast" # Compiled by RubyGems.