lib/active_support/multibyte/chars.rb in activesupport-3.0.0.beta2 vs lib/active_support/multibyte/chars.rb in activesupport-3.0.0.beta3

- old
+ new

@@ -17,11 +17,11 @@ # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them. # # bad.explicit_checking_method "T".mb_chars.downcase.to_s # # The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different - # encodings you can write your own multibyte string handler and configure it through + # encodings you can write your own multibyte string handler and configure it through # ActiveSupport::Multibyte.proxy_class. # # class CharsForUTF32 # def size # @wrapped_string.size / 4 @@ -456,12 +456,14 @@ def g_length self.class.g_unpack(@wrapped_string).length end # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. - def tidy_bytes - chars(self.class.tidy_bytes(@wrapped_string)) + # + # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. + def tidy_bytes(force = false) + chars(self.class.tidy_bytes(@wrapped_string, force)) end %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method| define_method("#{method}!") do |*args| unless args.nil? @@ -526,11 +528,11 @@ ) else unpacked << codepoints[marker..pos-1] marker = pos end - end + end unpacked end # Reverse operation of g_unpack. # @@ -642,36 +644,83 @@ end end codepoints end + def tidy_byte(byte) + if byte < 160 + [UCD.cp1252[byte] || byte].pack("U").unpack("C*") + elsif byte < 192 + [194, byte] + else + [195, byte - 64] + end + end + private :tidy_byte + # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. - def tidy_bytes(string) - string.split(//u).map do |c| - c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding) + # + # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1. + def tidy_bytes(string, force = false) + if force + return string.unpack("C*").map do |b| + tidy_byte(b) + end.flatten.compact.pack("C*").unpack("U*").pack("U*") + end - if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c) - n = c.unpack('C')[0] - n < 128 ? n.chr : - n < 160 ? [UCD.cp1252[n] || n].pack('U') : - n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr + bytes = string.unpack("C*") + conts_expected = 0 + last_lead = 0 + + bytes.each_index do |i| + + byte = bytes[i] + is_ascii = byte < 128 + is_cont = byte > 127 && byte < 192 + is_lead = byte > 191 && byte < 245 + is_unused = byte > 240 + is_restricted = byte > 244 + + # Impossible or highly unlikely byte? Clean it. + if is_unused || is_restricted + bytes[i] = tidy_byte(byte) + elsif is_cont + # Not expecting contination byte? Clean up. Otherwise, now expect one less. + conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1 else - c + if conts_expected > 0 + # Expected continuation, but got ASCII or leading? Clean backwards up to + # the leading byte. + (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])} + conts_expected = 0 + end + if is_lead + # Final byte is leading? Clean it. + if i == bytes.length - 1 + bytes[i] = tidy_byte(bytes.last) + else + # Valid leading byte? Expect continuations determined by position of + # first zero bit, with max of 3. + conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3 + last_lead = i + end + end end - end.join + end + bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") end end protected - + def translate_offset(byte_offset) #:nodoc: return nil if byte_offset.nil? return 0 if @wrapped_string == '' - + if @wrapped_string.respond_to?(:force_encoding) @wrapped_string = @wrapped_string.dup.force_encoding(Encoding::ASCII_8BIT) end - + begin @wrapped_string[0...byte_offset].unpack('U*').length rescue ArgumentError => e byte_offset -= 1 retry