chars.rb in activesupport-2.3.2

- old
+ new

@@ -71,11 +71,20 @@
         array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
       end
       # Matches a run of one or more leader/trailer codepoints at the very end of the string
       # (\Z also matches just before a final newline).
       UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
       # Matches a run of one or more leader/trailer codepoints at the very start of the string.
       UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
 
-      UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
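+      # Borrowed from the Kconv library by Shinji KONO (also as seen on the W3C site).
+      # NOTE(review): the [\xe1-\xef] branch also accepts \xed followed by \xa0-\xbf, i.e. encoded
+      # UTF-16 surrogates, which the W3C pattern appears to exclude - confirm this is intended.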
+      UTF8_PAT = /\A(?:
+                     [\x00-\x7f]                                     |
+                     [\xc2-\xdf] [\x80-\xbf]                         |
+                     \xe0        [\xa0-\xbf] [\x80-\xbf]             |
+                     [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
+                     \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
+                     [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
+                     \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
+                    )*\z/xn
 
       attr_reader :wrapped_string
       alias to_s wrapped_string
       alias to_str wrapped_string
 
@@ -281,35 +290,35 @@
 
       # Strips entire range of Unicode whitespace from the right of the string.
       def rstrip
         # Drop whatever UNICODE_TRAILERS_PAT matches at the end, then wrap the plain result again.
         without_trailers = @wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')
         chars(without_trailers)
       end
-
+      
       # Strips entire range of Unicode whitespace from the left of the string.
       def lstrip
         # Drop whatever UNICODE_LEADERS_PAT matches at the start, then wrap the plain result again.
         without_leaders = @wrapped_string.gsub(UNICODE_LEADERS_PAT, '')
         chars(without_leaders)
       end
-
+      
       # Strips entire range of Unicode whitespace from the right and left of the string.
       def strip
         # Trim the right-hand side first, then the left-hand side of that result.
         right_trimmed = rstrip
         right_trimmed.lstrip
       end
-
+      
       # Returns the number of codepoints in the string.
       def size
         # Unpack the wrapped string with the class-level helper and count the entries it returns.
         codepoints = self.class.u_unpack(@wrapped_string)
         codepoints.size
       end
       alias_method :length, :size
-
+      
       # Reverses all characters in the string.
       #
       # Example:
       #   'Café'.mb_chars.reverse.to_s #=> 'éfaC'
       def reverse
         # Reverse the unpacked codepoints (not the raw bytes), repack them and wrap the result.
         codepoints = self.class.u_unpack(@wrapped_string)
         chars(codepoints.reverse.pack('U*'))
       end
-
+      
       # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
       # character.
       #
       # Example:
       #   'こんにちは'.mb_chars.slice(2..3).to_s #=> "にち"
@@ -333,13 +342,25 @@
         end
         result.nil? ? nil : chars(result)
       end
       alias_method :[], :slice
 
-      # Converts first character in the string to Unicode value
+      # Like <tt>String#slice!</tt>, except instead of byte offsets you specify character offsets.
       #
       # Example:
+      #   s = 'こんにちは'
+      #   s.mb_chars.slice!(2..3).to_s #=> "にち"
+      #   s #=> "こんは"
+      def slice!(*args)
+        slice = self[*args]
+        self[*args] = ''
+        slice
+      end
+
+      # Returns the codepoint of the first character in the string.
+      #
+      # Example:
       #   'こんにちは'.mb_chars.ord #=> 12371
       def ord
         # Unpack the wrapped string with the class-level helper and hand back the first entry.
         codepoints = self.class.u_unpack(@wrapped_string)
         codepoints.first
       end
 
@@ -421,11 +442,11 @@
       # Replaces all ISO-8859-1 or CP1252 characters with their UTF-8 equivalents, resulting in a valid UTF-8 string.
       def tidy_bytes
         # Delegate to the class-level tidy_bytes, then wrap what it returns.
         tidied = self.class.tidy_bytes(@wrapped_string)
         chars(tidied)
       end
 
-      %w(lstrip rstrip strip reverse upcase downcase slice tidy_bytes capitalize).each do |method|
+      %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
         define_method("#{method}!") do |*args|
           unless args.nil?
             @wrapped_string = send(method, *args).to_s
           else
             @wrapped_string = send(method).to_s
@@ -608,10 +629,10 @@
         # Replaces all ISO-8859-1 or CP1252 characters with their UTF-8 equivalents, resulting in a valid UTF-8 string.
         def tidy_bytes(string)
           string.split(//u).map do |c|
             c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
 
-            if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
+            if !UTF8_PAT.match(c)
               n = c.unpack('C')[0]
               n < 128 ? n.chr :
               n < 160 ? [UCD.cp1252[n] || n].pack('U') :
               n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
             else