# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Normalizers

    # Implements normalization of a Unicode string to Normalization Form KC (NFKC).
    # This normalization form includes compatibility decomposition followed by
    # canonical composition.
    #
    class NFKC < Base
      class << self

        def normalize(string)
          code_points = TwitterCldr::Utils::CodePoints.from_string(string)
          normalized_code_points = normalize_code_points(code_points)
          TwitterCldr::Utils::CodePoints.to_string(normalized_code_points)
        end

        def normalize_code_points(code_points)
          compose(TwitterCldr::Normalizers::NFKD.normalize_code_points(code_points))
        end

        protected

        def compose(code_points)
          final = []
          hangul_code_points = []

          code_points.each_with_index do |code_point, index|
            final << code_point
            hangul_type = TwitterCldr::Shared::CodePoint.hangul_type(code_point)
            next_hangul_type = TwitterCldr::Shared::CodePoint.hangul_type(code_points[index + 1])

            if valid_hangul_sequence?(hangul_code_points.size, hangul_type)
              hangul_code_points << code_point

              unless valid_hangul_sequence?(hangul_code_points.size, next_hangul_type)
                next_hangul_type = nil
              end
            else
              hangul_code_points.clear
            end

            if hangul_code_points.size > 1 && !next_hangul_type
              hangul_code_points.size.times { final.pop }
              final << compose_hangul(hangul_code_points)
              hangul_code_points.clear
            end
          end

          compose_normal(final)
          final
        end

        def valid_hangul_sequence?(buffer_size, hangul_type)
          case [buffer_size, hangul_type]
          when [0, :lparts], [1, :vparts], [2, :tparts]
            true
          else
            false
          end
        end

        # Special composition for Hangul syllables. Documented in Section 3.12 at
        # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
        #
        def compose_hangul(code_points)
          l_index = code_points.first.hex - HANGUL_DECOMPOSITION_CONSTANTS[:LBase]
          v_index = code_points[1].hex - HANGUL_DECOMPOSITION_CONSTANTS[:VBase]
          t_index = code_points[2] ? code_points[2].hex - HANGUL_DECOMPOSITION_CONSTANTS[:TBase] : 0  # tpart may be missing, that's ok
          lv_index = (l_index * HANGUL_DECOMPOSITION_CONSTANTS[:NCount]) + (v_index * HANGUL_DECOMPOSITION_CONSTANTS[:TCount])
          (HANGUL_DECOMPOSITION_CONSTANTS[:SBase] + lv_index + t_index).to_s(16).upcase.rjust(4, "0")
        end

        # Implements composition of Unicode code points following the guidelines here:
        # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf - Section 3.12
        # Combining code points are combined with their base characters. For example, "ñ"
        # can be decomposed into 006E 0303, one code point for the "n" and the "˜" respectively.
        # Composition reverses this process, turning 006E 0303 into a single 00F1 code point.
        #
        def compose_normal(code_points)
          index = 1

          while index < code_points.size
            code_point = code_points[index]
            combining_class = combining_class_for(code_point)
            starter_index = find_starter_index(index, code_points)

            # is this character blocked from combining with the last starter?
            if starter_index < index - 1
              previous_combining_class = combining_class_for(code_points[index - 1])
              blocked = (previous_combining_class == 0) || (previous_combining_class >= combining_class)
            else
              blocked = false
            end

            unless blocked
              # do a reverse-lookup for the decomposed code points
              decomp_data = TwitterCldr::Shared::CodePoint.for_decomposition([code_points[starter_index], code_point])

              # check if the two code points have a canonically equivalent precomposed form
              if decomp_data && !decomp_data.excluded_from_composition?
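                # the reverse lookup found a precomposed code point for this pair and it
                # is not listed in the composition exclusion table; e.g. 006E ("n") plus
                # 0303 (combining tilde) maps back to the single code point 00F1 ("ñ")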
                # combine the characters
                code_points[starter_index] = decomp_data.code_point
                code_points.delete_at(index)
                index -= 1
              end
            end

            index += 1
          end
        end

        # walk backwards from start_pos and return the index of the nearest
        # preceding starter (a code point with combining class 0)
        def find_starter_index(start_pos, code_points)
          start_pos.times do |i|
            return start_pos - i - 1 if combining_class_for(code_points[start_pos - i - 1]) == 0
          end
        end

      end
    end
  end
end
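
# A minimal usage sketch (an illustration, assuming the twitter-cldr gem is fully
# loaded so that the NFKD normalizer and code point data referenced above are
# available):
#
#   decomposed = "n\u0303"                                 # "n" followed by U+0303 (combining tilde)
#   TwitterCldr::Normalizers::NFKC.normalize(decomposed)   # => "ñ" (U+00F1)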