# encoding: UTF-8 # Copyright 2012 Twitter, Inc # http://www.apache.org/licenses/LICENSE-2.0 module TwitterCldr # Normalizers module includes algorithm for Unicode normalization. Basic information on this topic can be found in the # Unicode Standard Annex #15 "Unicode Normalization Forms" at http://www.unicode.org/reports/tr15/. More detailed # description is given in the section "3.11 Normalization Forms" of the Unicode Standard core specification. The # latest version at the moment (for Unicode 6.1) is available at http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf. # module Normalizers # Implements normalization of a Unicode string to Normalization Form KD (NFKD). # This normalization form includes only compatibility decomposition. # class NFKD < Base class << self def normalize(string) code_points = TwitterCldr::Utils::CodePoints.from_string(string) normalized_code_points = normalize_code_points(code_points) TwitterCldr::Utils::CodePoints.to_string(normalized_code_points) end def normalize_code_points(code_points) canonical_ordering(decomposition(code_points)) end protected def decomposition(code_points) code_points.map { |code_point| decompose_recursively(code_point) }.flatten end # Recursively decomposes a given code point with the values in its Decomposition Mapping property. # def decompose_recursively(code_point) unicode_data = TwitterCldr::Shared::CodePoint.for_hex(code_point) return code_point unless unicode_data if unicode_data.hangul_type == :compositions decompose_hangul(code_point) else decompose_regular(code_point, decomposition_mapping(unicode_data)) end end # Decomposes regular (non-Hangul) code point. # def decompose_regular(code_point, mapping) if mapping && !mapping.empty? mapping.map{ |cp| decompose_recursively(cp) }.flatten else code_point end end # Returns code point's Decomposition Mapping based on its Unicode data. # def decomposition_mapping(unicode_data) mapping = parse_decomposition_mapping(unicode_data) mapping.shift if compatibility_decomposition?(mapping) # remove compatibility formatting tag mapping end def compatibility_decomposition?(mapping) !!(COMPATIBILITY_FORMATTING_TAG_REGEXP =~ mapping.first) end def parse_decomposition_mapping(unicode_data) unicode_data.decomposition.split end # Special decomposition for Hangul syllables. Documented in Section 3.12 at # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf # def decompose_hangul(code_point) s_index = code_point.hex - HANGUL_DECOMPOSITION_CONSTANTS[:SBase] l_index = s_index / HANGUL_DECOMPOSITION_CONSTANTS[:NCount] v_index = (s_index % HANGUL_DECOMPOSITION_CONSTANTS[:NCount]) / HANGUL_DECOMPOSITION_CONSTANTS[:TCount] t_index = s_index % HANGUL_DECOMPOSITION_CONSTANTS[:TCount] result = [] result << (HANGUL_DECOMPOSITION_CONSTANTS[:LBase] + l_index).to_s(16).upcase result << (HANGUL_DECOMPOSITION_CONSTANTS[:VBase] + v_index).to_s(16).upcase result << (HANGUL_DECOMPOSITION_CONSTANTS[:TBase] + t_index).to_s(16).upcase if t_index > 0 result end # Performs the Canonical Ordering Algorithm by stable sorting of every subsequence of combining code points # (code points that have combining class greater than zero). # def canonical_ordering(code_points) code_points_with_cc = code_points.map { |cp| [cp, combining_class_for(cp)] } result = [] accum = [] code_points_with_cc.each do |cp_with_cc| if cp_with_cc[1] == 0 unless accum.empty? result.concat(stable_sort(accum)) accum = [] end result << cp_with_cc else accum << cp_with_cc end end result.concat(stable_sort(accum)) unless accum.empty? result.map { |cp_with_cc| cp_with_cc[0] } end # Performs stable sorting of a sequence of [code_point, combining_class] pairs. For sorting a regular bubble # sort is used (with a small optimization that stops the algorithm if none of the elements were swapped during # the iteration). # def stable_sort(code_points_with_cc) n = code_points_with_cc.size - 2 code_points_with_cc.size.times do swapped = false (0..n).each do |j| if code_points_with_cc[j][1] > code_points_with_cc[j + 1][1] code_points_with_cc[j], code_points_with_cc[j + 1] = code_points_with_cc[j + 1], code_points_with_cc[j] swapped = true end end break unless swapped n -= 1 end code_points_with_cc end def combining_class_for(code_point) TwitterCldr::Shared::CodePoint.for_hex(code_point).combining_class.to_i rescue NoMethodError 0 end end COMPATIBILITY_FORMATTING_TAG_REGEXP = /^<.*>$/ end end end