lib/active_support/multibyte/chars.rb in activesupport-3.0.0.beta2 vs lib/active_support/multibyte/chars.rb in activesupport-3.0.0.beta3
- old
+ new
@@ -17,11 +17,11 @@
# If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
#
# bad.explicit_checking_method "T".mb_chars.downcase.to_s
#
# The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
- # encodings you can write your own multibyte string handler and configure it through
+ # encodings you can write your own multibyte string handler and configure it through
# ActiveSupport::Multibyte.proxy_class.
#
# class CharsForUTF32
# def size
# @wrapped_string.size / 4
@@ -456,12 +456,14 @@
def g_length
self.class.g_unpack(@wrapped_string).length
end
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
- def tidy_bytes
- chars(self.class.tidy_bytes(@wrapped_string))
+ #
+ # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
+ def tidy_bytes(force = false)
+ chars(self.class.tidy_bytes(@wrapped_string, force))
end
%w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
define_method("#{method}!") do |*args|
unless args.nil?
@@ -526,11 +528,11 @@
)
else
unpacked << codepoints[marker..pos-1]
marker = pos
end
- end
+ end
unpacked
end
# Reverse operation of g_unpack.
#
@@ -642,36 +644,83 @@
end
end
codepoints
end
+ def tidy_byte(byte)
+ if byte < 160
+ [UCD.cp1252[byte] || byte].pack("U").unpack("C*")
+ elsif byte < 192
+ [194, byte]
+ else
+ [195, byte - 64]
+ end
+ end
+ private :tidy_byte
+
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
- def tidy_bytes(string)
- string.split(//u).map do |c|
- c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
+ #
+ # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1.
+ def tidy_bytes(string, force = false)
+ if force
+ return string.unpack("C*").map do |b|
+ tidy_byte(b)
+ end.flatten.compact.pack("C*").unpack("U*").pack("U*")
+ end
- if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
- n = c.unpack('C')[0]
- n < 128 ? n.chr :
- n < 160 ? [UCD.cp1252[n] || n].pack('U') :
- n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
+ bytes = string.unpack("C*")
+ conts_expected = 0
+ last_lead = 0
+
+ bytes.each_index do |i|
+
+ byte = bytes[i]
+ is_ascii = byte < 128
+ is_cont = byte > 127 && byte < 192
+ is_lead = byte > 191 && byte < 245
+ is_unused = byte > 240
+ is_restricted = byte > 244
+
+ # Impossible or highly unlikely byte? Clean it.
+ if is_unused || is_restricted
+ bytes[i] = tidy_byte(byte)
+ elsif is_cont
+ # Not expecting contination byte? Clean up. Otherwise, now expect one less.
+ conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
else
- c
+ if conts_expected > 0
+ # Expected continuation, but got ASCII or leading? Clean backwards up to
+ # the leading byte.
+ (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
+ conts_expected = 0
+ end
+ if is_lead
+ # Final byte is leading? Clean it.
+ if i == bytes.length - 1
+ bytes[i] = tidy_byte(bytes.last)
+ else
+ # Valid leading byte? Expect continuations determined by position of
+ # first zero bit, with max of 3.
+ conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
+ last_lead = i
+ end
+ end
end
- end.join
+ end
+ bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end
end
protected
-
+
def translate_offset(byte_offset) #:nodoc:
return nil if byte_offset.nil?
return 0 if @wrapped_string == ''
-
+
if @wrapped_string.respond_to?(:force_encoding)
@wrapped_string = @wrapped_string.dup.force_encoding(Encoding::ASCII_8BIT)
end
-
+
begin
@wrapped_string[0...byte_offset].unpack('U*').length
rescue ArgumentError => e
byte_offset -= 1
retry