# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Normalization
    # Arithmetic composition and decomposition of precomposed Hangul syllables
    # from/to their conjoining jamo (leading consonant L, vowel V and optional
    # trailing consonant T).
    module Hangul

      class << self

        # First code point of each range used by the algorithm.
        SBASE = 0xAC00 # precomposed syllables
        LBASE = 0x1100 # leading consonants
        VBASE = 0x1161 # vowels
        TBASE = 0x11A7 # one below the first trailing consonant, so T index 0 means "no trailing consonant"

        LCOUNT = 19
        VCOUNT = 21
        TCOUNT = 28

        NCOUNT = VCOUNT * TCOUNT # 588
        SCOUNT = LCOUNT * NCOUNT # 11172

        # Exclusive upper bounds of the ranges above.
        LLIMIT = LBASE + LCOUNT # 0x1113 = 4371
        VLIMIT = VBASE + VCOUNT # 0x1176 = 4470
        TLIMIT = TBASE + TCOUNT # 0x11C3 = 4547
        SLIMIT = SBASE + SCOUNT # 0xD7A4 = 55204

        # Special composition for Hangul syllables. Documented in Section 3.12 at
        # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
        #
        # code_points - Array of two or three Integers: [L, V] or [L, V, T].
        #
        # Returns the Integer code point of the composed syllable. No range
        # checking is performed on the jamo. Raises NoMethodError if fewer than
        # two code points are given (nil has no `-`).
        #
        def compose(code_points)
          l = code_points.first - LBASE
          v = code_points[1] - VBASE
          t = code_points[2] ? code_points[2] - TBASE : 0 # T part may be missing, that's ok

          SBASE + l * NCOUNT + v * TCOUNT + t
        end

        # Special decomposition for Hangul syllables. Documented in Section 3.12 at
        # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
        # Also see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm#Hangul_Implicit_CEs
        #
        # code_point - Integer code point of a precomposed Hangul syllable. The
        #              value is not validated; use hangul_syllable? first.
        #
        # Returns a new Array [L, V] or [L, V, T] of Integer code points (T is
        # omitted when the syllable has no trailing consonant). Results are
        # memoized per code point; every call returns a fresh copy so that a
        # caller mutating the result cannot corrupt the cache.
        #
        def decompose(code_point)
          cached = decomposition_cache[code_point] ||= begin
            # Peel off the T index first, then split the remainder into L and V.
            lv_index, t = (code_point - SBASE).divmod(TCOUNT)
            l, v = lv_index.divmod(VCOUNT)

            jamo = [LBASE + l, VBASE + v]
            jamo << TBASE + t if t > 0
            jamo.freeze
          end

          cached.dup
        end

        # Returns true if code_point lies in the precomposed Hangul syllable
        # range SBASE...SLIMIT (upper bound exclusive), false otherwise.
        def hangul_syllable?(code_point)
          (SBASE...SLIMIT).include?(code_point)
        end

        private

        # Lazily created Hash. Not referenced by any method visible in this
        # file; kept in case code outside this view relies on it.
        def syllable_cache
          @syllable_cache ||= {}
        end

        # Lazily created Hash mapping a syllable code point to its frozen jamo Array.
        def decomposition_cache
          @decomposition_cache ||= {}
        end

      end

    end
  end
end