lib/levenshtein.rb in levenshtein-0.2.1 vs lib/levenshtein.rb in levenshtein-0.2.2
- old
+ new
@@ -1,99 +1,112 @@
-require "levenshtein/exception"
+# encoding: UTF-8
+
require "levenshtein/version"
module Levenshtein
# Returns the Levenshtein distance as a number between 0.0 and
# 1.0. It's basically the Levenshtein distance divided by the
- # length of the longest sequence.
+ # size of the longest sequence.
- def self.normalized_distance(a1, a2, threshold=nil)
- a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
+ def self.normalized_distance(a1, a2, threshold=nil, options={})
+ size = [a1.size, a2.size].max
- if a2.length == 0
- 0.0 # Since a1.length < a2.length, a1 must be empty as well.
+ if a1.size == 0 and a2.size == 0
+ 0.0
+ elsif a1.size == 0
+ a2.size.to_f/size
+ elsif a2.size == 0
+ a1.size.to_f/size
else
if threshold
- if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
- d.to_f/a2.length
+ if d = self.distance(a1, a2, (threshold*size).to_i+1)
+ d.to_f/size
else
nil
end
else
- self.distance(a1, a2).to_f/a2.length
+ self.distance(a1, a2).to_f/size
end
end
end
# Returns the Levenshtein distance between two sequences.
#
# The two sequences can be two strings, two arrays, or two other
- # objects. Strings, arrays and arrays of strings are handled with
- # optimized (very fast) C code. All other sequences are handled
- # with generic (fast) C code.
+ # objects responding to :each. All sequences are by generic
+ # (fast) C code.
#
- # The sequences should respond to :length and :[] and all objects
- # in the sequences (as returned by []) should response to :==.
+ # All objects in the sequences should respond to :hash and :eql?.
- def self.distance(a1, a2, threshold=nil)
- a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
+ def self.distance(a1, a2, threshold=nil, options={})
+ a1, a2 = a1.scan(/./), a2.scan(/./) if String === a1 and String === a2
+ a1, a2 = Util.pool(a1, a2)
# Handle some basic circumstances.
return 0 if a1 == a2
- return a2.length if a1.length == 0
+ return a2.size if a1.empty?
+ return a1.size if a2.empty?
if threshold
- return nil if (a2.length-a1.length) >= threshold
-
- a3, a4 = nil, nil
- a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-)
- a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
-
- if a3 and a4
- return nil if (a3-a4).length >= threshold
- return nil if (a4-a3).length >= threshold
- end
+ return nil if (a1.size-a2.size) >= threshold
+ return nil if (a2.size-a1.size) >= threshold
+ return nil if (a1-a2).size >= threshold
+ return nil if (a2-a1).size >= threshold
end
- distance_fast_or_slow(a1, a2, threshold)
- end
+ # Remove the common prefix and the common postfix.
- def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
- if respond_to?(:distance_fast)
- distance_fast(a1, a2, threshold) # Implemented in C.
- else
- distance_slow(a1, a2, threshold) # Implemented in Ruby.
- end
- end
+ l1 = a1.size
+ l2 = a2.size
- def self.distance_slow(a1, a2, threshold) # :nodoc:
- l1 = a1.length
- l2 = a2.length
-
- offset = 0
+ offset = 0
+ no_more_optimizations = true
- while offset < l1 and offset < l2 and a1[offset] == a2[offset]
+ while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
offset += 1
+
+ no_more_optimizations = false
end
- while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
+ while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
l1 -= 1
l2 -= 1
+
+ no_more_optimizations = false
end
- l1 -= offset
- l2 -= offset
+ if no_more_optimizations
+ distance_fast_or_slow(a1, a2, threshold, options)
+ else
+ l1 -= offset
+ l2 -= offset
- crow = (0..l1).to_a
+ a1 = a1[offset, l1]
+ a2 = a2[offset, l2]
- 1.upto(l2) do |y|
+ distance(a1, a2, threshold, options)
+ end
+ end
+
+ def self.distance_fast_or_slow(a1, a2, threshold, options) # :nodoc:
+ if respond_to?(:distance_fast) and options[:force_slow]
+ distance_fast(a1, a2, threshold) # Implemented in C.
+ else
+ distance_slow(a1, a2, threshold) # Implemented in Ruby.
+ end
+ end
+
+ def self.distance_slow(a1, a2, threshold) # :nodoc:
+ crow = (0..a1.size).to_a
+
+ 1.upto(a2.size) do |y|
prow = crow
crow = [y]
- 1.upto(l1) do |x|
- crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
+ 1.upto(a1.size) do |x|
+ crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
end
# Stop analysing this sequence as soon as the best possible
# result for this sequence is bigger than the best result so far.
# (The minimum value in the next row will be equal to or greater
@@ -101,9 +114,27 @@
return nil if threshold and crow.min >= threshold
end
crow[-1]
+ end
+
+ module Util # :nodoc:
+ def self.pool(*args)
+ # So we can compare pointers instead of objects (equal?() instead of ==()).
+
+ pool = {}
+
+ args.collect do |arg|
+ a = []
+
+ arg.each do |o|
+ a << pool[o] ||= o
+ end
+
+ a
+ end
+ end
end
end
begin
require "levenshtein/levenshtein_fast" # Compiled by RubyGems.