# encoding: UTF-8

module TwitterCldr
  module Normalizers
    # Canonical decomposition (NFD): every code point is recursively replaced
    # by its canonical decomposition mapping, then adjacent combining marks
    # are put into canonical order.
    #
    # Code points are handled as hexadecimal strings throughout this class
    # (they are parsed with String#hex and rebuilt with to_s(16).upcase).
    class NFD < Base

      # Constants for the arithmetic decomposition of Hangul syllables.
      # Documented in Section 3.12 at http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
      #
      # Each key must appear exactly once: a duplicated key in a hash literal
      # silently keeps only the last value. The :Scount spelling is kept as is
      # because code outside this file may read it.
      @@hangul_constants = {
        :SBase  => 0xAC00,
        :LBase  => 0x1100,
        :VBase  => 0x1161,
        :TBase  => 0x11A7,
        :LCount => 19,
        :VCount => 21,
        :TCount => 28,
        :NCount => 588,   # VCount * TCount
        :Scount => 11172  # LCount * NCount
      }

      class << self

        # Normalizes +string+ to NFD.
        #
        # @param string [String] the text to normalize
        # @return [String] the normalized text
        def normalize(string)
          # Convert string to code points
          # (char_to_code_point / code_point_to_char are defined outside this file)
          code_points = string.split('').map { |char| char_to_code_point(char) }

          # Normalize code points
          normalized_code_points = normalize_code_points(code_points)

          # Convert normalized code points back to string
          normalized_code_points.map { |code_point| code_point_to_char(code_point) }.join
        end

        # Decomposes every code point and canonically reorders the result.
        #
        # @param code_points [Array<String>] code points as hex strings
        # @return [Array<String>] a new array of decomposed, reordered code points
        def normalize_code_points(code_points)
          code_points = code_points.map { |code_point| decompose code_point }.flatten
          reorder code_points # sorts the freshly built array in place
          code_points
        end

        # Recursively replaces the given code point with the values in its
        # Decomposition_Mapping property.
        #
        # @param code_point [String] code point as a hex string
        # @return [String, Array<String>] the code point itself when it has no
        #   canonical decomposition, otherwise the array of decomposed code points
        def decompose(code_point)
          unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point)
          return code_point unless unicode_data

          # Hangul syllables carry no mapping data; they are decomposed arithmetically.
          return decompose_hangul(code_point) if unicode_data.name.include? 'Hangul'

          decomposition_mapping = unicode_data.decomposition.split

          # Return the code point if no mapping exists or if it is a compatibility
          # mapping (those are tagged, e.g. "<compat>", and do not apply to NFD)
          if decomposition_mapping.empty? || decomposition_mapping.first =~ /<.*>/
            code_point
          else
            decomposition_mapping.map do |decomposition_code_point|
              decompose(decomposition_code_point)
            end.flatten
          end
        end

        # Swaps any two adjacent code points A & B if ccc(A) > ccc(B) > 0,
        # repeating until no such pair is left. The array is modified in place.
        #
        # @param code_points [Array<String>] code points as hex strings
        # @return [Integer] the number of code points; callers in this class
        #   ignore it and rely on the in-place reordering
        def reorder(code_points)
          # Look every combining class up once instead of once per comparison,
          # and keep the cached values in step with the swaps below.
          combining_classes = code_points.map { |code_point| combining_class_for(code_point) }
          last_index = code_points.size - 1

          code_points.size.times do
            swapped = false

            last_index.times do |i|
              ccc_a, ccc_b = combining_classes[i], combining_classes[i + 1]

              if (ccc_a > ccc_b) && (ccc_b > 0)
                code_points[i], code_points[i + 1] = code_points[i + 1], code_points[i]
                combining_classes[i], combining_classes[i + 1] = ccc_b, ccc_a
                swapped = true
              end
            end

            # A pass without swaps means the sequence is already in canonical order.
            break unless swapped
          end

          code_points.size
        end

        # Looks up the canonical combining class of a code point.
        #
        # @param code_point [String] code point as a hex string
        # @return [Integer] the combining class, or 0 when no data is available
        def combining_class_for(code_point)
          unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point)
          unicode_data ? unicode_data.combining_class.to_i : 0
        end

        private

        # Arithmetically splits a Hangul syllable into its leading consonant,
        # vowel and (optional) trailing consonant code points.
        def decompose_hangul(code_point)
          constants = @@hangul_constants

          s_index = code_point.hex - constants[:SBase]
          l_index = s_index / constants[:NCount]
          v_index = (s_index % constants[:NCount]) / constants[:TCount]
          t_index = s_index % constants[:TCount]

          parts = [
            (constants[:LBase] + l_index).to_s(16).upcase,
            (constants[:VBase] + v_index).to_s(16).upcase
          ]
          # A trailing index of 0 means the syllable has no trailing consonant.
          parts << (constants[:TBase] + t_index).to_s(16).upcase if t_index > 0
          parts
        end

      end

    end
  end
end