lib/text/levenshtein.rb in text-1.0.4 vs lib/text/levenshtein.rb in text-1.0.5
- old
+ new
@@ -9,48 +9,48 @@
# easier-to-pronounce-and-spell 'edit distance'.
#
# Author: Paul Battley (pbattley@gmail.com)
#
-require "text/util"
-
module Text # :nodoc:
module Levenshtein
# Calculate the Levenshtein distance between two strings +str1+ and +str2+.
- # +str1+ and +str2+ should be ASCII, UTF-8, or a one-byte-per character encoding such
- # as ISO-8859-*.
#
- # The strings will be treated as UTF-8 if $KCODE is set appropriately (i.e. 'u').
- # Otherwise, the comparison will be performed byte-by-byte. There is no specific support
- # for Shift-JIS or EUC strings.
#
- # When using Unicode text, be aware that this algorithm does not perform normalisation.
- # If there is a possibility of different normalised forms being used, normalisation
- # should be performed beforehand.
+ # In Ruby 1.8, +str1+ and +str2+ should be ASCII, UTF-8, or a one-byte-per
+ # character encoding such as ISO-8859-*. They will be treated as UTF-8 if
+ # $KCODE is set appropriately (i.e. 'u'). Otherwise, the comparison will be
+ # performed byte-by-byte. There is no specific support for Shift-JIS or EUC
+ # strings.
#
+ # In Ruby 1.9+, the strings will be processed as UTF-8.
+ #
+ # When using Unicode text, be aware that this algorithm does not perform
+ # normalisation. If there is a possibility of different normalised forms
+ # being used, normalisation should be performed beforehand.
+ #
def distance(str1, str2)
- encoding = defined?(Encoding) ? str1.encoding.to_s : $KCODE
+ prepare =
+ if "ruby".respond_to?(:encoding)
+ lambda { |str| str.encode(Encoding::UTF_8).unpack("U*") }
+ else
+ rule = $KCODE.match(/^U/i) ? "U*" : "C*"
+ lambda { |str| str.unpack(rule) }
+ end
- if Text.encoding_of(str1) =~ /^U/i
- unpack_rule = 'U*'
- else
- unpack_rule = 'C*'
- end
-
- s = str1.unpack(unpack_rule)
- t = str2.unpack(unpack_rule)
+ s, t = [str1, str2].map(&prepare)
n = s.length
m = t.length
- return m if (0 == n)
- return n if (0 == m)
+ return m if n.zero?
+ return n if m.zero?
d = (0..m).to_a
x = nil
- (0...n).each do |i|
- e = i+1
- (0...m).each do |j|
+ n.times do |i|
+ e = i + 1
+ m.times do |j|
cost = (s[i] == t[j]) ? 0 : 1
x = [
d[j+1] + 1, # insertion
e + 1, # deletion
d[j] + cost # substitution