pure.rb in addressable-2.8.2

- old
+ new

@@ -64,22 +64,22 @@
 
     # Converts from a Unicode internationalized domain name to an ASCII
     # domain name as described in RFC 3490.
     def self.to_ascii(input)
       input = input.to_s unless input.is_a?(String)
-      input = input.dup
+      input = input.dup.force_encoding(Encoding::UTF_8).unicode_normalize(:nfkc)
       if input.respond_to?(:force_encoding)
         input.force_encoding(Encoding::ASCII_8BIT)
       end
       if input =~ UTF8_REGEX && input =~ UTF8_REGEX_MULTIBYTE
         parts = unicode_downcase(input).split('.')
         parts.map! do |part|
           if part.respond_to?(:force_encoding)
             part.force_encoding(Encoding::ASCII_8BIT)
           end
           if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE
-            ACE_PREFIX + punycode_encode(unicode_normalize_kc(part))
+            ACE_PREFIX + punycode_encode(part)
           else
             part
           end
         end
         parts.join('.')
@@ -110,19 +110,10 @@
         output.force_encoding(Encoding::UTF_8)
       end
       output
     end
 
-    # Unicode normalization form KC.
-    def self.unicode_normalize_kc(input)
-      input = input.to_s unless input.is_a?(String)
-      unpacked = input.unpack("U*")
-      unpacked =
-        unicode_compose(unicode_sort_canonical(unicode_decompose(unpacked)))
-      return unpacked.pack("U*")
-    end
-
     ##
     # Unicode aware downcase method.
     #
     # @api private
     # @param [String] input
@@ -134,189 +125,16 @@
       unpacked.map! { |codepoint| lookup_unicode_lowercase(codepoint) }
       return unpacked.pack("U*")
     end
     private_class_method :unicode_downcase
 
-    def self.unicode_compose(unpacked)
-      unpacked_result = []
-      length = unpacked.length
-
-      return unpacked if length == 0
-
-      starter = unpacked[0]
-      starter_cc = lookup_unicode_combining_class(starter)
-      starter_cc = 256 if starter_cc != 0
-      for i in 1...length
-        ch = unpacked[i]
-
-        if (starter_cc == 0 &&
-            (composite = unicode_compose_pair(starter, ch)) != nil)
-          starter = composite
-        else
-          unpacked_result << starter
-          starter = ch
-        end
-      end
-      unpacked_result << starter
-      return unpacked_result
-    end
-    private_class_method :unicode_compose
-
-    def self.unicode_compose_pair(ch_one, ch_two)
-      if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
-          ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
-        # Hangul L + V
-        return HANGUL_SBASE + (
-          (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE)
-        ) * HANGUL_TCOUNT
-      elsif ch_one >= HANGUL_SBASE &&
-          ch_one < HANGUL_SBASE + HANGUL_SCOUNT &&
-          (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 &&
-          ch_two >= HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT
-           # Hangul LV + T
-        return ch_one + (ch_two - HANGUL_TBASE)
-      end
-
-      p = []
-
-      ucs4_to_utf8(ch_one, p)
-      ucs4_to_utf8(ch_two, p)
-
-      return lookup_unicode_composition(p)
-    end
-    private_class_method :unicode_compose_pair
-
-    def self.ucs4_to_utf8(char, buffer)
-      if char < 128
-        buffer << char
-      elsif char < 2048
-        buffer << (char >> 6 | 192)
-        buffer << (char & 63 | 128)
-      elsif char < 0x10000
-        buffer << (char >> 12 | 224)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x200000
-        buffer << (char >> 18 | 240)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x4000000
-        buffer << (char >> 24 | 248)
-        buffer << (char >> 18 & 63 | 128)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x80000000
-        buffer << (char >> 30 | 252)
-        buffer << (char >> 24 & 63 | 128)
-        buffer << (char >> 18 & 63 | 128)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      end
-    end
-    private_class_method :ucs4_to_utf8
-
-    def self.unicode_sort_canonical(unpacked)
-      unpacked = unpacked.dup
-      i = 1
-      length = unpacked.length
-
-      return unpacked if length < 2
-
-      while i < length
-        last = unpacked[i-1]
-        ch = unpacked[i]
-        last_cc = lookup_unicode_combining_class(last)
-        cc = lookup_unicode_combining_class(ch)
-        if cc != 0 && last_cc != 0 && last_cc > cc
-          unpacked[i] = last
-          unpacked[i-1] = ch
-          i -= 1 if i > 1
-        else
-          i += 1
-        end
-      end
-      return unpacked
-    end
-    private_class_method :unicode_sort_canonical
-
-    def self.unicode_decompose(unpacked)
-      unpacked_result = []
-      for cp in unpacked
-        if cp >= HANGUL_SBASE && cp < HANGUL_SBASE + HANGUL_SCOUNT
-          l, v, t = unicode_decompose_hangul(cp)
-          unpacked_result << l
-          unpacked_result << v if v
-          unpacked_result << t if t
-        else
-          dc = lookup_unicode_compatibility(cp)
-          unless dc
-            unpacked_result << cp
-          else
-            unpacked_result.concat(unicode_decompose(dc.unpack("U*")))
-          end
-        end
-      end
-      return unpacked_result
-    end
-    private_class_method :unicode_decompose
-
-    def self.unicode_decompose_hangul(codepoint)
-      sindex = codepoint - HANGUL_SBASE;
-      if sindex < 0 || sindex >= HANGUL_SCOUNT
-        l = codepoint
-        v = t = nil
-        return l, v, t
-      end
-      l = HANGUL_LBASE + sindex / HANGUL_NCOUNT
-      v = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
-      t = HANGUL_TBASE + sindex % HANGUL_TCOUNT
-      if t == HANGUL_TBASE
-        t = nil
-      end
-      return l, v, t
-    end
-    private_class_method :unicode_decompose_hangul
-
-    def self.lookup_unicode_combining_class(codepoint)
-      codepoint_data = UNICODE_DATA[codepoint]
-      (codepoint_data ?
-        (codepoint_data[UNICODE_DATA_COMBINING_CLASS] || 0) :
-        0)
-    end
-    private_class_method :lookup_unicode_combining_class
-
-    def self.lookup_unicode_compatibility(codepoint)
-      codepoint_data = UNICODE_DATA[codepoint]
-      (codepoint_data ?
-        codepoint_data[UNICODE_DATA_COMPATIBILITY] : nil)
-    end
-    private_class_method :lookup_unicode_compatibility
-
     def self.lookup_unicode_lowercase(codepoint)
       codepoint_data = UNICODE_DATA[codepoint]
       (codepoint_data ?
         (codepoint_data[UNICODE_DATA_LOWERCASE] || codepoint) :
         codepoint)
     end
     private_class_method :lookup_unicode_lowercase
-
-    def self.lookup_unicode_composition(unpacked)
-      return COMPOSITION_TABLE[unpacked]
-    end
-    private_class_method :lookup_unicode_composition
-
-    HANGUL_SBASE =  0xac00
-    HANGUL_LBASE =  0x1100
-    HANGUL_LCOUNT = 19
-    HANGUL_VBASE =  0x1161
-    HANGUL_VCOUNT = 21
-    HANGUL_TBASE =  0x11a7
-    HANGUL_TCOUNT = 28
-    HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588
-    HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172
 
     UNICODE_DATA_COMBINING_CLASS = 0
     UNICODE_DATA_EXCLUSION = 1
     UNICODE_DATA_CANONICAL = 2
     UNICODE_DATA_COMPATIBILITY = 3