lib/text/levenshtein.rb in text-1.2.3 vs lib/text/levenshtein.rb in text-1.3.0
- old
+ new
@@ -14,32 +14,110 @@
module Text # :nodoc:
module Levenshtein
# Calculate the Levenshtein distance between two strings +str1+ and +str2+.
#
+ # The optional argument max_distance can reduce the number of iterations by
+ # stopping if the Levenshtein distance exceeds this value. This increases
+ # performance where it is only necessary to compare the distance with a
+ # reference value instead of calculating the exact distance.
#
- # In Ruby 1.8, +str1+ and +str2+ should be ASCII, UTF-8, or a one-byte-per
- # character encoding such as ISO-8859-*. They will be treated as UTF-8 if
- # $KCODE is set appropriately (i.e. 'u'). Otherwise, the comparison will be
- # performed byte-by-byte. There is no specific support for Shift-JIS or EUC
- # strings.
+ # The distance is calculated in terms of Unicode codepoints. Be aware that
+ # this algorithm does not perform normalisation: if there is a possibility
+ # of different normalised forms being used, normalisation should be performed
+ # beforehand.
#
- # In Ruby 1.9+, the strings will be processed as UTF-8.
- #
- # When using Unicode text, be aware that this algorithm does not perform
- # normalisation. If there is a possibility of different normalised forms
- # being used, normalisation should be performed beforehand.
- #
- def distance(str1, str2)
- prepare =
- if "ruby".respond_to?(:encoding)
- lambda { |str| str.encode(Encoding::UTF_8).unpack("U*") }
+ def distance(str1, str2, max_distance = nil)
+ if max_distance
+ distance_with_maximum(str1, str2, max_distance)
+ else
+ distance_without_maximum(str1, str2)
+ end
+ end
+
+private
+ def distance_with_maximum(str1, str2, max_distance) # :nodoc:
+ s, t = [str1, str2].sort_by(&:length).
+ map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
+ n = s.length
+ m = t.length
+ big_int = n * m
+ return m if n.zero?
+ return n if m.zero?
+ return 0 if s == t
+
+ # If the length difference is already greater than the max_distance, then
+ # there is nothing else to check
+ if (n - m).abs >= max_distance
+ return max_distance
+ end
+
+ # The values necessary for our threshold are written; the ones after must
+ # be filled with large integers since the tailing member of the threshold
+ # window in the bottom array will run min across them
+ d = (m + 1).times.map { |i|
+ if i < m || i < max_distance + 1
+ i
else
- rule = $KCODE.match(/^U/i) ? "U*" : "C*"
- lambda { |str| str.unpack(rule) }
+ big_int
end
+ }
+ x = nil
+ e = nil
- s, t = [str1, str2].map(&prepare)
+ n.times do |i|
+ # Since we're reusing arrays, we need to be sure to wipe the value left
+ # of the starting index; we don't have to worry about the value above the
+ # ending index as the arrays were initially filled with large integers
+ # and we progress to the right
+ if e.nil?
+ e = i + 1
+ else
+ e = big_int
+ end
+
+ diag_index = t.length - s.length + i
+
+ # If max_distance was specified, we can reduce second loop. So we set
+ # up our threshold window.
+ # See:
+ # Gusfield, Dan (1997). Algorithms on strings, trees, and sequences:
+ # computer science and computational biology.
+ # Cambridge, UK: Cambridge University Press. ISBN 0-521-58519-8.
+ # pp. 263–264.
+ min = [0, i - max_distance - 1].max
+ max = [m - 1, i + max_distance].min
+
+ (min .. max).each do |j|
+ # If the diagonal value is already greater than the max_distance
+ # then we can safety return: the diagonal will never go lower again.
+ # See: http://www.levenshtein.net/
+ if j == diag_index && d[j] >= max_distance
+ return max_distance
+ end
+
+ cost = s[i] == t[j] ? 0 : 1
+ x = [
+ d[j+1] + 1, # insertion
+ e + 1, # deletion
+ d[j] + cost # substitution
+ ].min
+
+ d[j] = e
+ e = x
+ end
+ d[m] = x
+ end
+
+ if x > max_distance
+ return max_distance
+ else
+ return x
+ end
+ end
+
+ def distance_without_maximum(str1, str2) # :nodoc:
+ s, t = [str1, str2].map{ |str| str.encode(Encoding::UTF_8).unpack("U*") }
n = s.length
m = t.length
return m if n.zero?
return n if m.zero?