lib/active_support/multibyte/unicode.rb in activesupport-4.1.0.beta2 vs lib/active_support/multibyte/unicode.rb in activesupport-4.1.0.rc1
- old
+ new
@@ -9,11 +9,11 @@
# See http://www.unicode.org/reports/tr15/tr15-29.html for more
# information about normalization.
NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
# The Unicode version that is supported by the implementation
- UNICODE_VERSION = '6.2.0'
+ UNICODE_VERSION = '6.3.0'
# The default normalization used for operations that require
# normalization. It can be set to any of the normalizations
# in NORMALIZATION_FORMS.
#
@@ -210,41 +210,47 @@
end
end
codepoints
end
- # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
- # resulting in a valid UTF-8 string.
- #
- # Passing +true+ will forcibly tidy all bytes, assuming that the string's
- # encoding is entirely CP1252 or ISO-8859-1.
- def tidy_bytes(string, force = false)
- return string if string.empty?
-
- if force
- return string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
+ # Ruby >= 2.1 has String#scrub, which is faster than the workaround used for < 2.1.
+ if '<3'.respond_to?(:scrub)
+ # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
+ # resulting in a valid UTF-8 string.
+ #
+ # Passing +true+ will forcibly tidy all bytes, assuming that the string's
+ # encoding is entirely CP1252 or ISO-8859-1.
+ def tidy_bytes(string, force = false)
+ return string if string.empty?
+ return recode_windows1252_chars(string) if force
+ string.scrub { |bad| recode_windows1252_chars(bad) }
end
+ else
+ def tidy_bytes(string, force = false)
+ return string if string.empty?
+ return recode_windows1252_chars(string) if force
- # We can't transcode to the same format, so we choose a nearly-identical encoding.
- # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
- # CP1252 when we get errors. The final string will be 'converted' back to UTF-8
- # before returning.
- reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)
+ # We can't transcode to the same format, so we choose a nearly-identical encoding.
+ # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
+ # CP1252 when we get errors. The final string will be 'converted' back to UTF-8
+ # before returning.
+ reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_16LE)
- source = string.dup
- out = ''.force_encoding(Encoding::UTF_8_MAC)
+ source = string.dup
+ out = ''.force_encoding(Encoding::UTF_16LE)
- loop do
- reader.primitive_convert(source, out)
- _, _, _, error_bytes, _ = reader.primitive_errinfo
- break if error_bytes.nil?
- out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
- end
+ loop do
+ reader.primitive_convert(source, out)
+ _, _, _, error_bytes, _ = reader.primitive_errinfo
+ break if error_bytes.nil?
+ out << error_bytes.encode(Encoding::UTF_16LE, Encoding::Windows_1252, invalid: :replace, undef: :replace)
+ end
- reader.finish
+ reader.finish
- out.encode!(Encoding::UTF_8)
+ out.encode!(Encoding::UTF_8)
+ end
end
# Returns the KC normalization of the string by default. NFKC is
# considered the best normalization form for passing strings to databases
# and validations.
@@ -369,17 +375,11 @@
codepoint
end
end.pack('U*')
end
- def tidy_byte(byte)
- if byte < 160
- [database.cp1252[byte] || byte].pack("U").unpack("C*")
- elsif byte < 192
- [194, byte]
- else
- [195, byte - 64]
- end
+ def recode_windows1252_chars(string)
+ string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
end
def database
@database ||= UnicodeDatabase.new
end