lib/active_support/multibyte/chars.rb in activesupport-2.2.3 vs lib/active_support/multibyte/chars.rb in activesupport-2.3.2

- old
+ new

@@ -71,11 +71,20 @@ array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|') end UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ - UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] + # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) + UTF8_PAT = /\A(?: + [\x00-\x7f] | + [\xc2-\xdf] [\x80-\xbf] | + \xe0 [\xa0-\xbf] [\x80-\xbf] | + [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | + \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | + [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | + \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] + )*\z/xn attr_reader :wrapped_string alias to_s wrapped_string alias to_str wrapped_string @@ -281,35 +290,35 @@ # Strips entire range of Unicode whitespace from the right of the string. def rstrip chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')) end - + # Strips entire range of Unicode whitespace from the left of the string. def lstrip chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, '')) end - + # Strips entire range of Unicode whitespace from the right and left of the string. def strip rstrip.lstrip end - + # Returns the number of codepoints in the string def size self.class.u_unpack(@wrapped_string).size end alias_method :length, :size - + # Reverses all characters in the string. # # Example: # 'Café'.mb_chars.reverse.to_s #=> 'éfaC' def reverse chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*')) end - + # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that # character. # # Example: # 'こんにちは'.mb_chars.slice(2..3).to_s #=> "にち" @@ -333,13 +342,25 @@ end result.nil? ? nil : chars(result) end alias_method :[], :slice - # Converts first character in the string to Unicode value + # Like <tt>String#slice!</tt>, except instead of byte offsets you specify character offsets. # # Example: + # s = 'こんにちは' + # s.mb_chars.slice!(2..3).to_s #=> "にち" + # s #=> "こんは" + def slice!(*args) + slice = self[*args] + self[*args] = '' + slice + end + + # Returns the codepoint of the first character in the string. + # + # Example: # 'こんにちは'.mb_chars.ord #=> 12371 def ord self.class.u_unpack(@wrapped_string)[0] end @@ -421,11 +442,11 @@ # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. def tidy_bytes chars(self.class.tidy_bytes(@wrapped_string)) end - %w(lstrip rstrip strip reverse upcase downcase slice tidy_bytes capitalize).each do |method| + %w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method| define_method("#{method}!") do |*args| unless args.nil? @wrapped_string = send(method, *args).to_s else @wrapped_string = send(method).to_s @@ -608,10 +629,10 @@ # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. def tidy_bytes(string) string.split(//u).map do |c| c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding) - if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c) + if !UTF8_PAT.match(c) n = c.unpack('C')[0] n < 128 ? n.chr : n < 160 ? [UCD.cp1252[n] || n].pack('U') : n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr else