lib/addressable/idna/pure.rb in addressable-2.8.1 vs lib/addressable/idna/pure.rb in addressable-2.8.2
- old
+ new
@@ -64,22 +64,22 @@
# Converts from a Unicode internationalized domain name to an ASCII
# domain name as described in RFC 3490.
def self.to_ascii(input)
input = input.to_s unless input.is_a?(String)
- input = input.dup
+ input = input.dup.force_encoding(Encoding::UTF_8).unicode_normalize(:nfkc)
if input.respond_to?(:force_encoding)
input.force_encoding(Encoding::ASCII_8BIT)
end
if input =~ UTF8_REGEX && input =~ UTF8_REGEX_MULTIBYTE
parts = unicode_downcase(input).split('.')
parts.map! do |part|
if part.respond_to?(:force_encoding)
part.force_encoding(Encoding::ASCII_8BIT)
end
if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE
- ACE_PREFIX + punycode_encode(unicode_normalize_kc(part))
+ ACE_PREFIX + punycode_encode(part)
else
part
end
end
parts.join('.')
@@ -110,19 +110,10 @@
output.force_encoding(Encoding::UTF_8)
end
output
end
- # Unicode normalization form KC.
- def self.unicode_normalize_kc(input)
- input = input.to_s unless input.is_a?(String)
- unpacked = input.unpack("U*")
- unpacked =
- unicode_compose(unicode_sort_canonical(unicode_decompose(unpacked)))
- return unpacked.pack("U*")
- end
-
##
# Unicode aware downcase method.
#
# @api private
# @param [String] input
@@ -134,189 +125,16 @@
unpacked.map! { |codepoint| lookup_unicode_lowercase(codepoint) }
return unpacked.pack("U*")
end
private_class_method :unicode_downcase
- def self.unicode_compose(unpacked)
- unpacked_result = []
- length = unpacked.length
-
- return unpacked if length == 0
-
- starter = unpacked[0]
- starter_cc = lookup_unicode_combining_class(starter)
- starter_cc = 256 if starter_cc != 0
- for i in 1...length
- ch = unpacked[i]
-
- if (starter_cc == 0 &&
- (composite = unicode_compose_pair(starter, ch)) != nil)
- starter = composite
- else
- unpacked_result << starter
- starter = ch
- end
- end
- unpacked_result << starter
- return unpacked_result
- end
- private_class_method :unicode_compose
-
- def self.unicode_compose_pair(ch_one, ch_two)
- if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
- ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
- # Hangul L + V
- return HANGUL_SBASE + (
- (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE)
- ) * HANGUL_TCOUNT
- elsif ch_one >= HANGUL_SBASE &&
- ch_one < HANGUL_SBASE + HANGUL_SCOUNT &&
- (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 &&
- ch_two >= HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT
- # Hangul LV + T
- return ch_one + (ch_two - HANGUL_TBASE)
- end
-
- p = []
-
- ucs4_to_utf8(ch_one, p)
- ucs4_to_utf8(ch_two, p)
-
- return lookup_unicode_composition(p)
- end
- private_class_method :unicode_compose_pair
-
- def self.ucs4_to_utf8(char, buffer)
- if char < 128
- buffer << char
- elsif char < 2048
- buffer << (char >> 6 | 192)
- buffer << (char & 63 | 128)
- elsif char < 0x10000
- buffer << (char >> 12 | 224)
- buffer << (char >> 6 & 63 | 128)
- buffer << (char & 63 | 128)
- elsif char < 0x200000
- buffer << (char >> 18 | 240)
- buffer << (char >> 12 & 63 | 128)
- buffer << (char >> 6 & 63 | 128)
- buffer << (char & 63 | 128)
- elsif char < 0x4000000
- buffer << (char >> 24 | 248)
- buffer << (char >> 18 & 63 | 128)
- buffer << (char >> 12 & 63 | 128)
- buffer << (char >> 6 & 63 | 128)
- buffer << (char & 63 | 128)
- elsif char < 0x80000000
- buffer << (char >> 30 | 252)
- buffer << (char >> 24 & 63 | 128)
- buffer << (char >> 18 & 63 | 128)
- buffer << (char >> 12 & 63 | 128)
- buffer << (char >> 6 & 63 | 128)
- buffer << (char & 63 | 128)
- end
- end
- private_class_method :ucs4_to_utf8
-
- def self.unicode_sort_canonical(unpacked)
- unpacked = unpacked.dup
- i = 1
- length = unpacked.length
-
- return unpacked if length < 2
-
- while i < length
- last = unpacked[i-1]
- ch = unpacked[i]
- last_cc = lookup_unicode_combining_class(last)
- cc = lookup_unicode_combining_class(ch)
- if cc != 0 && last_cc != 0 && last_cc > cc
- unpacked[i] = last
- unpacked[i-1] = ch
- i -= 1 if i > 1
- else
- i += 1
- end
- end
- return unpacked
- end
- private_class_method :unicode_sort_canonical
-
- def self.unicode_decompose(unpacked)
- unpacked_result = []
- for cp in unpacked
- if cp >= HANGUL_SBASE && cp < HANGUL_SBASE + HANGUL_SCOUNT
- l, v, t = unicode_decompose_hangul(cp)
- unpacked_result << l
- unpacked_result << v if v
- unpacked_result << t if t
- else
- dc = lookup_unicode_compatibility(cp)
- unless dc
- unpacked_result << cp
- else
- unpacked_result.concat(unicode_decompose(dc.unpack("U*")))
- end
- end
- end
- return unpacked_result
- end
- private_class_method :unicode_decompose
-
- def self.unicode_decompose_hangul(codepoint)
- sindex = codepoint - HANGUL_SBASE;
- if sindex < 0 || sindex >= HANGUL_SCOUNT
- l = codepoint
- v = t = nil
- return l, v, t
- end
- l = HANGUL_LBASE + sindex / HANGUL_NCOUNT
- v = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
- t = HANGUL_TBASE + sindex % HANGUL_TCOUNT
- if t == HANGUL_TBASE
- t = nil
- end
- return l, v, t
- end
- private_class_method :unicode_decompose_hangul
-
- def self.lookup_unicode_combining_class(codepoint)
- codepoint_data = UNICODE_DATA[codepoint]
- (codepoint_data ?
- (codepoint_data[UNICODE_DATA_COMBINING_CLASS] || 0) :
- 0)
- end
- private_class_method :lookup_unicode_combining_class
-
- def self.lookup_unicode_compatibility(codepoint)
- codepoint_data = UNICODE_DATA[codepoint]
- (codepoint_data ?
- codepoint_data[UNICODE_DATA_COMPATIBILITY] : nil)
- end
- private_class_method :lookup_unicode_compatibility
-
def self.lookup_unicode_lowercase(codepoint)
codepoint_data = UNICODE_DATA[codepoint]
(codepoint_data ?
(codepoint_data[UNICODE_DATA_LOWERCASE] || codepoint) :
codepoint)
end
private_class_method :lookup_unicode_lowercase
-
- def self.lookup_unicode_composition(unpacked)
- return COMPOSITION_TABLE[unpacked]
- end
- private_class_method :lookup_unicode_composition
-
- HANGUL_SBASE = 0xac00
- HANGUL_LBASE = 0x1100
- HANGUL_LCOUNT = 19
- HANGUL_VBASE = 0x1161
- HANGUL_VCOUNT = 21
- HANGUL_TBASE = 0x11a7
- HANGUL_TCOUNT = 28
- HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588
- HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172
UNICODE_DATA_COMBINING_CLASS = 0
UNICODE_DATA_EXCLUSION = 1
UNICODE_DATA_CANONICAL = 2
UNICODE_DATA_COMPATIBILITY = 3