lib/gimchi.rb in gimchi-0.1.9 vs lib/gimchi.rb in gimchi-0.2.0

- old
+ new

#!/usr/bin/env ruby
# encoding: UTF-8
# Junegunn Choi (junegunn.c@gmail.com)

require 'yaml'
require 'set'
require 'gimchi/char'
require 'gimchi/pronouncer'

# Korean text utilities: decomposing/composing Hangul syllables, reading
# numbers in Korean, and producing pronunciations and romanizations.
#
# All data tables are loaded from the YAML file at CONFIG_FILE_PATH.
# A shared default instance is created by Gimchi.setup, and the class-level
# methods defined below simply forward to it.
class Gimchi
  class << self
    # Creates the shared default instance unless it already exists.
    # @return [Gimchi] the shared default instance
    def setup
      @@default ||= Gimchi.new
    end

    # Shortcut for Gimchi#kchar on the default instance.
    # @param [String] ch Korean character
    # @return [Gimchi::Char]
    def Char ch
      @@default.kchar ch
    end

    # Class-level counterparts of the public instance methods; each one
    # forwards its arguments (and block) to the default instance.
    [
      :decompose,
      :compose,
      :korean_char?,
      :complete_korean_char?,
      :kchar,
      :kchar?,
      :chosung?,
      :jungsung?,
      :jongsung?,
      :read_number,
      :pronounce,
      :romanize
    ].each do |sym|
      define_method(sym) do |*arg, &b|
        @@default.send sym, *arg, &b
      end
    end
  end

  CONFIG_FILE_PATH = File.expand_path('../../config/default.yml', __FILE__)
  attr_reader :config, :chosungs, :jungsungs, :jongsungs

  # Initialize Gimchi.
  # Loads the configuration from CONFIG_FILE_PATH, converts its keys,
  # compiles the substitution patterns and builds the jamo lookup sets.
  def initialize
    # Recursively converts hash keys containing a lowercase ASCII letter,
    # a digit or a space into symbols (spaces become underscores).
    # Other keys (e.g. Korean strings) are left untouched.
    symbolize_keys = lambda do |val|
      case val
      when Hash
        {}.tap do |h|
          val.each do |k, v|
            k = k.gsub(' ', '_').to_sym if k =~ /[a-z0-9 ]/
            h[k] = symbolize_keys.call v
          end
        end
      when Array
        val.map { |v| symbolize_keys.call v }
      else
        val
      end
    end
    @config = symbolize_keys.call YAML.load(File.read CONFIG_FILE_PATH)

    # Replace the keys of each post-substitution table with compiled Regexps
    # so that they can be handed to gsub directly.
    [
      @config[:romanization][:post_substitution],
      @config[:number][:post_substitution],
      @config[:number][:alt_notation][:post_substitution]
    ].each do |r|
      r.keys.each do |k|
        r[Regexp.compile k.to_s] = r.delete k
      end
    end
    @config.freeze

    # `new` is invoked through `send`; presumably it is not public on
    # Gimchi::Pronouncer (defined in another file).
    @pronouncer = Gimchi::Pronouncer.send :new, self

    @chosungs     = config[:structure][:chosung]
    @jungsungs    = config[:structure][:jungsung]
    @jongsungs    = config[:structure][:jongsung]
    @chosung_set  = Set[*@chosungs]
    @jungsung_set = Set[*@jungsungs]
    @jongsung_set = Set[*@jongsungs]
    @all          = @chosung_set + @jungsung_set + @jongsung_set
  end

  # Decompose a Korean character into 3 components
  # @param [String] ch Korean character
  # @return [Array]
  def decompose ch
    kchar(ch).to_a
  end

  # Compose 3 elements into a Korean character String
  # @param [String] chosung
  # @param [String] jungsung
  # @param [String] jongsung
  # @return [String] The composed syllable when both chosung and jungsung
  #   are given, the single given element when only one of them is given,
  #   or an empty String when both are nil.
  def compose chosung, jungsung = nil, jongsung = nil
    if chosung.nil? && jungsung.nil?
      ""
    elsif chosung && jungsung
      # Elements missing from the tables fall back to index 0.
      n1 = chosungs.index(chosung) || 0
      n2 = jungsungs.index(jungsung) || 0
      # Index 0 of the jongsung table means "no jongsung".
      n3 = ([nil] + jongsungs).index(jongsung) || 0
      # 0xAC00 is the first syllable; each chosung spans 21 * 28 syllables
      # and each jungsung spans 28.
      [ 0xAC00 + n1 * (21 * 28) + n2 * 28 + n3 ].pack('U')
    else
      chosung || jungsung
    end
  end

  # @param [String] ch
  # @return [Boolean]
  def chosung? ch
    @chosung_set.include? ch
  end

  # @param [String] ch
  # @return [Boolean]
  def jungsung? ch
    @jungsung_set.include? ch
  end

  # @param [String] ch
  # @return [Boolean]
  def jongsung? ch
    @jongsung_set.include? ch
  end

  # Checks if the given character is a korean character.
  # @param [String] ch A string of size 1
  # @return [Boolean]
  # @raise [ArgumentError] if the input is longer than one character
  def korean_char? ch
    raise ArgumentError, 'Lengthy input' if str_length(ch) > 1

    complete_korean_char?(ch) || @all.include?(ch)
  end
  alias kchar? korean_char?

  # Checks if the given character is a "complete" korean character.
  # "Complete" Korean character must have chosung and jungsung, with optional jongsung.
  # @param [String] ch A string of size 1
  # @return [Boolean] false for an empty string
  # @raise [ArgumentError] if the input is longer than one character
  def complete_korean_char? ch
    raise ArgumentError, 'Lengthy input' if str_length(ch) > 1

    # An empty string has no code point and must not count as a character.
    code = ch.unpack('U').first
    # Range of Korean characters in Unicode 2.0: AC00(가) ~ D7A3(힣)
    !code.nil? && code >= 0xAC00 && code <= 0xD7A3
  end

  # Returns a Gimchi::Char object for the given Korean character.
  # @param [String] ch Korean character in String
  # @return [Gimchi::Char] Gimchi::Char instance
  def kchar ch
    Gimchi::Char.new(self, ch)
  end

  # Reads numeric expressions in Korean way.
  # @param [String, Number] str Numeric type or String containing numeric expressions
  # @return [String] Output string
  def read_number str
    # Capture 1 is the numeric expression, capture 5 is the (optionally
    # whitespace-prefixed) character that follows it. The whole match,
    # including capture 5, is replaced by the block's return value.
    str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
      read_number_sub($1, $5)
    }
  end

  # Returns the pronunciation of the given string containing Korean characters.
  # Takes optional options hash.
  #
  # @param [String] str Input string
  # @param [Hash] options Options
  # @option options [Boolean] each_char Each character of the string is pronounced respectively.
  # @option options [Boolean] slur Strings separated by whitespaces are processed again as if they were contiguous.
  # @option options [Boolean] number Numeric parts of the string are also pronounced in Korean.
  # @option options [Array] except Allows you to skip certain transformations.
  # @option options [Boolean] debug If true, the applied transformations are returned as well.
  # @return [String, Array] Output string, or the output string followed by
  #   the transformations when :debug is set
  def pronounce str, options = {}
    options = {
      :each_char => false,
      :slur => false,
      :number => true,
      :except => [],
      :debug => false
    }.merge options

    str = read_number(str) if options[:number]

    result, transforms = @pronouncer.send :pronounce!, str, options

    if options[:debug]
      return result, transforms
    else
      return result
    end
  end

  # Returns the romanization (alphabetical notation) of the given Korean string.
  # http://en.wikipedia.org/wiki/Korean_romanization
  # @param [String] str Input Korean string
  # @param [Hash] options Options
  # @option options [Boolean] as_pronounced If true, #pronounce is internally called before romanize
  # @option options [Boolean] number Whether to read numeric expressions in the string
  # @option options [Boolean] slur Same as :slur in #pronounce
  # @return [String] Output string in Roman Alphabet
  # @see Gimchi#pronounce
  def romanize str, options = {}
    options = {
      :as_pronounced => true,
      :number => true,
      :slur => false
    }.merge options

    rdata = config[:romanization]
    post_subs = rdata[:post_substitution]
    # Indexed by component position: 0 = chosung, 1 = jungsung, 2 = jongsung
    rdata = [rdata[:chosung], rdata[:jungsung], rdata[:jongsung]]

    str = pronounce str,
      :each_char => !options[:as_pronounced],
      :number => options[:number],
      :slur => options[:slur],
      # Clause 1, Note 1: 'ㅢ' is written as 'ui' even when it is
      # pronounced as 'ㅣ'.
      :except => %w[rule_5_3]
    # Romanization configured for the chosung 'ㅇ'
    dash = rdata[0]["ㅇ"]
    romanization = ""

    # Appends the romanization of a run of Korean characters to
    # `romanization` and returns the accumulated text with the
    # post-substitutions applied.
    romanize_chunk = lambda do |chunk|
      chunk.each_char.map { |ch| kchar(ch) rescue ch }.each do |kc|
        kc.to_a.each_with_index do |comp, idx|
          next if comp.nil?
          comp = rdata[idx][comp] || comp
          # Drop the leading `dash` at the very beginning of the output
          # or right after whitespace.
          comp = comp[1..-1] if comp[0, 1] == dash &&
            (romanization.empty? || romanization[-1, 1] =~ /\s/)
          romanization += comp
        end
      end

      return post_subs.keys.inject(romanization) { | output, pattern |
        output.gsub(pattern, post_subs[pattern])
      }
    end

    # Korean characters are collected into chunks; any other character
    # flushes the pending chunk and is copied through unchanged.
    k_chunk = ""
    str.each_char do | c |
      if korean_char? c
        k_chunk += c
      else
        unless k_chunk.empty?
          romanization = romanize_chunk.call k_chunk
          k_chunk = ""
        end
        romanization += c
      end
    end
    romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
    romanization
  end

private
  # Length of the string in characters.
  def str_length str
    str.length
  end

  # Reads a single numeric expression matched by #read_number.
  # @param [String] num Numeric expression (optional sign, thousands
  #   separators, optional fraction and exponent)
  # @param [String, nil] next_char The text captured right after the number
  #   (optional whitespace plus one character), or nil
  # @return [String] Korean reading of the number followed by next_char
  # @raise [RangeError] if the number needs a unit that is not configured
  def read_number_sub num, next_char
    nconfig = config[:number]
    next_char = next_char.to_s

    # The caller replaces the number together with next_char, so next_char
    # has to be part of the result here as well.
    return nconfig[:digits].first + next_char if num == '0'

    num = num.gsub(',', '')
    is_float = num.match(/[\.e]/) != nil

    # Alternative notation for integers with proper suffix
    alt = false
    suffix = next_char.strip
    if !is_float && nconfig[:alt_notation][:when_suffix].key?(suffix)
      max = nconfig[:alt_notation][:when_suffix][suffix][:max]
      alt = true if max.nil? || num.to_i <= max
    end

    # Sign
    sign = []
    if num =~ /^-/
      num = num.sub(/^-\s*/, '')
      sign << nconfig[:negative]
    elsif num =~ /^\+/
      num = num.sub(/^\+\s*/, '')
      sign << nconfig[:positive]
    end

    if is_float
      below = nconfig[:decimal_point]
      # Values below one are read with a leading zero digit.
      below = nconfig[:digits][0] + below if num.to_f < 1

      # Expand the exponent notation into plain decimal notation.
      if md = num.match(/(.*)e(.*)/)
        dp = md[1].index('.')
        num = md[1].tr '.', ''
        exp = md[2].to_i

        dp += exp
        if dp >= num.length
          # The decimal point moves to or past the last digit: the value is
          # an integer, so nothing is read after the decimal point.
          num = num.ljust(dp, '0')
          num = num.sub(/^0+([1-9])/, "\\1")

          below = ""
        elsif dp < 0
          num = '0.' + '0' * (-dp) + num
        else
          num[dp, 1] = '.' + num[dp, 1]
        end
      end
      # Digits after the decimal point are read one by one.
      num.sub(/.*\./, '').each_char do | char |
        below += nconfig[:digits][char.to_i]
      end if num.include? '.'
      num = num.sub(/\..*/, '')
    else
      below = ""
    end

    # The integer part is read in groups of four digits, lowest group first.
    tokens = []
    unit_idx = -1
    num = num.to_i
    while num > 0
      v = num % 10000

      unit_idx += 1
      if v > 0
        if alt == false || unit_idx >= 1
          str = ""
          # Cannot use hash as they're unordered in 1.8
          [[1000, '천'],
           [100,  '백'],
           [10,   '십']].each do | arr |
            u, sub_unit = arr
            # The digit is omitted when it is 1.
            str += (nconfig[:digits][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
            v %= u
          end
          str += nconfig[:digits][v] if v > 0

          raise RangeError, "number too large" unless nconfig[:units][unit_idx]
          tokens << str.sub(/ $/, '') + nconfig[:units][unit_idx]
        else
          # Alternative notation; only reached for the lowest group.
          str = ""
          tenfolds = nconfig[:alt_notation][:tenfolds]
          digits = nconfig[:alt_notation][:digits]
          alt_post_subs = nconfig[:alt_notation][:post_substitution]

          # Likewise.
          [[1000, '천'],
           [100,  '백']].each do |u, sub_unit|
            str += (nconfig[:digits][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
            v %= u
          end

          str += tenfolds[(v / 10) - 1] if v / 10 > 0
          v %= 10
          str += digits[v] if v > 0

          alt_post_subs.each do |pattern, replacement|
            str.gsub!(pattern, replacement)
          end if alt
          tokens << str.sub(/ $/, '') + nconfig[:units][unit_idx]
        end
      end
      num /= 10000
    end

    # Tokens are in lowest-group-first order, so the sign goes last
    # before reversing.
    tokens += sign unless sign.empty?
    ret = tokens.reverse.join(' ') + below + next_char
    nconfig[:post_substitution].each do |pattern, replacement|
      ret.gsub!(pattern, replacement)
    end
    ret
  end
end#Gimchi

require 'gimchi/patch_1.8'

Gimchi.setup