lib/gimchi.rb in gimchi-0.1.9 vs lib/gimchi.rb in gimchi-0.2.0
- old
+ new
@@ -1,11 +1,381 @@
#!/usr/bin/env ruby
# encoding: UTF-8
# Junegunn Choi (junegunn.c@gmail.com)
-require 'gimchi/korean'
+require 'yaml'
+require 'set'
require 'gimchi/char'
require 'gimchi/pronouncer'
-if RUBY_VERSION =~ /^1\.8\./
- require 'gimchi/patch_1.8'
-end
+class Gimchi
+ class << self
+ def setup
+ @@default ||= Gimchi.new
+ end
+
+ def Char ch
+ @@default.kchar ch
+ end
+
+ [
+ :decompose,
+ :compose,
+ :korean_char?,
+ :complete_korean_char?,
+ :kchar,
+ :kchar?,
+ :chosung?,
+ :jungsung?,
+ :jongsung?,
+ :read_number,
+ :pronounce,
+ :romanize
+ ].each do |sym|
+ define_method(sym) do |*arg, &b|
+ @@default.send sym, *arg, &b
+ end
+ end
+ end
+
+ CONFIG_FILE_PATH = File.expand_path('../../config/default.yml', __FILE__)
+ attr_reader :config, :chosungs, :jungsungs, :jongsungs
+
+ # Initialize Gimchi::Korean.
+ def initialize
+ symbolize_keys = lambda do |val|
+ case val
+ when Hash
+ {}.tap do |h|
+ val.each do |k, v|
+ k = k.gsub(' ', '_').to_sym if k =~ /[a-z0-9 ]/
+ h[k] = symbolize_keys.call v
+ end
+ end
+ when Array
+ val.map { |v| symbolize_keys.call v }
+ else
+ val
+ end
+ end
+ @config = symbolize_keys.call YAML.load(File.read CONFIG_FILE_PATH)
+
+ [
+ @config[:romanization][:post_substitution],
+ @config[:number][:post_substitution],
+ @config[:number][:alt_notation][:post_substitution]
+ ].each do |r|
+ r.keys.each do |k|
+ r[Regexp.compile k.to_s] = r.delete k
+ end
+ end
+ @config.freeze
+
+ @pronouncer = Gimchi::Pronouncer.send :new, self
+
+ @chosungs = config[:structure][:chosung]
+ @jungsungs = config[:structure][:jungsung]
+ @jongsungs = config[:structure][:jongsung]
+ @chosung_set = Set[*@chosungs]
+ @jungsung_set = Set[*@jungsungs]
+ @jongsung_set = Set[*@jongsungs]
+ @all = @chosung_set + @jungsung_set + @jongsung_set
+ end
+
+ # Decompose a Korean character into 3 components
+ # @param [String] ch Korean character
+ # @return [Array]
+ def decompose ch
+ kchar(ch).to_a
+ end
+
+ # Compose 3 elements into a Korean character String
+ # @param [String] chosung
+ # @param [String] jungsung
+ # @param [String] jongsung
+ # @return [String]
+ def compose chosung, jungsung = nil, jongsung = nil
+ if chosung.nil? && jungsung.nil?
+ ""
+ elsif chosung && jungsung
+ n1, n2, n3 =
+ n1 = chosungs.index(chosung) || 0
+ n2 = jungsungs.index(jungsung) || 0
+ n3 = ([nil] + jongsungs).index(jongsung) || 0
+ [ 0xAC00 + n1 * (21 * 28) + n2 * 28 + n3 ].pack('U')
+ else
+ chosung || jungsung
+ end
+ end
+
+ # @param [String] ch
+ # @return [Boolean]
+ def chosung? ch
+ @chosung_set.include? ch
+ end
+
+ # @param [String] ch
+ # @return [Boolean]
+ def jungsung? ch
+ @jungsung_set.include? ch
+ end
+
+ # @param [String] ch
+ # @return [Boolean]
+ def jongsung? ch
+ @jongsung_set.include? ch
+ end
+
+ # Checks if the given character is a korean character.
+ # @param [String] ch A string of size 1
+ def korean_char? ch
+ raise ArgumentError.new('Lengthy input') if str_length(ch) > 1
+
+ complete_korean_char?(ch) || @all.include?(ch)
+ end
+ alias kchar? korean_char?
+
+ # Checks if the given character is a "complete" korean character.
+ # "Complete" Korean character must have chosung and jungsung, with optional jongsung.
+ # @param [String] ch A string of size 1
+ def complete_korean_char? ch
+ raise ArgumentError.new('Lengthy input') if str_length(ch) > 1
+
+ # Range of Korean chracters in Unicode 2.0: AC00(가) ~ D7A3(힣)
+ ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 }
+ end
+
+ # Returns a Gimchi::Char object for the given Korean character.
+ # @param [String] ch Korean character in String
+ # @return [Gimchi::Char] Gimchi::Char instance
+ def kchar ch
+ Gimchi::Char.new(self, ch)
+ end
+
+ # Reads numeric expressions in Korean way.
+ # @param [String, Number] str Numeric type or String containing numeric expressions
+ # @return [String] Output string
+ def read_number str
+ str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
+ read_number_sub($1, $5)
+ }
+ end
+
+ # Returns the pronunciation of the given string containing Korean characters.
+ # Takes optional options hash.
+ #
+ # @param [String] Input string
+ # @param [Hash] options Options
+ # @option options [Boolean] each_char Each character of the string is pronounced respectively.
+ # @option options [Boolean] slur Strings separated by whitespaces are processed again as if they were contiguous.
+ # @option options [Boolean] number Numberic parts of the string is also pronounced in Korean.
+ # @option options [Array] except Allows you to skip certain transformations.
+ # @return [String] Output string
+ def pronounce str, options = {}
+ options = {
+ :each_char => false,
+ :slur => false,
+ :number => true,
+ :except => [],
+ :debug => false
+ }.merge options
+
+ str = read_number(str) if options[:number]
+
+ result, transforms = @pronouncer.send :pronounce!, str, options
+
+ if options[:debug]
+ return result, transforms
+ else
+ return result
+ end
+ end
+
+ # Returns the romanization (alphabetical notation) of the given Korean string.
+ # http://en.wikipedia.org/wiki/Korean_romanization
+ # @param [String] str Input Korean string
+ # @param [Hash] options Options
+ # @option options [Boolean] as_pronounced If true, #pronounce is internally called before romanize
+ # @option options [Boolean] number Whether to read numeric expressions in the string
+ # @option options [Boolean] slur Same as :slur in #pronounce
+ # @return [String] Output string in Roman Alphabet
+ # @see Korean#pronounce
+ def romanize str, options = {}
+ options = {
+ :as_pronounced => true,
+ :number => true,
+ :slur => false
+ }.merge options
+
+ rdata = config[:romanization]
+ post_subs = rdata[:post_substitution]
+ rdata = [rdata[:chosung], rdata[:jungsung], rdata[:jongsung]]
+
+ str = pronounce str,
+ :each_char => !options[:as_pronounced],
+ :number => options[:number],
+ :slur => options[:slur],
+ # 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다.
+ :except => %w[rule_5_3]
+ dash = rdata[0]["ㅇ"]
+ romanization = ""
+
+ romanize_chunk = lambda do |chunk|
+ chunk.each_char.map { |ch| kchar(ch) rescue ch }.each do |kc|
+ kc.to_a.each_with_index do |comp, idx|
+ next if comp.nil?
+ comp = rdata[idx][comp] || comp
+ comp = comp[1..-1] if comp[0, 1] == dash &&
+ (romanization.empty? || romanization[-1, 1] =~ /\s/)
+ romanization += comp
+ end
+ end
+
+ return post_subs.keys.inject(romanization) { | output, pattern |
+ output.gsub(pattern, post_subs[pattern])
+ }
+ end
+
+ k_chunk = ""
+ str.each_char do | c |
+ if korean_char? c
+ k_chunk += c
+ else
+ unless k_chunk.empty?
+ romanization = romanize_chunk.call k_chunk
+ k_chunk = ""
+ end
+ romanization += c
+ end
+ end
+ romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
+ romanization
+ end
+
+private
+ def str_length str
+ str.length
+ end
+
+ def read_number_sub num, next_char
+ nconfig = config[:number]
+
+ if num == '0'
+ return nconfig[:digits].first
+ end
+
+ num = num.gsub(',', '')
+ next_char = next_char.to_s
+ is_float = num.match(/[\.e]/) != nil
+
+ # Alternative notation for integers with proper suffix
+ alt = false
+ if is_float == false &&
+ nconfig[:alt_notation][:when_suffix].keys.include?(next_char.strip)
+ max = nconfig[:alt_notation][:when_suffix][next_char.strip][:max]
+
+ if max.nil? || num.to_i <= max
+ alt = true
+ end
+ end
+
+ # Sign
+ sign = []
+ negative = false
+ if num =~ /^-/
+ num = num.sub(/^-\s*/, '')
+ sign << nconfig[:negative]
+ negative = true
+ elsif num =~ /^\+/
+ num = num.sub(/^\+\s*/, '')
+ sign << nconfig[:positive]
+ end
+
+ if is_float
+ below = nconfig[:decimal_point]
+ below = nconfig[:digits][0] + below if num.to_f < 1
+
+ if md = num.match(/(.*)e(.*)/)
+ dp = md[1].index('.')
+ num = md[1].tr '.', ''
+ exp = md[2].to_i
+
+ dp += exp
+ if dp > num.length
+ num = num.ljust(dp, '0')
+ num = num.sub(/^0+([1-9])/, "\\1")
+
+ below = ""
+ elsif dp < 0
+ num = '0.' + '0' * (-dp) + num
+ else
+ num[dp, 1] = '.' + num[dp, 1]
+ end
+ end
+ num.sub(/.*\./, '').each_char do | char |
+ below += nconfig[:digits][char.to_i]
+ end if num.include? '.'
+ num = num.sub(/\..*/, '')
+ else
+ below = ""
+ end
+
+ tokens = []
+ unit_idx = -1
+ num = num.to_i
+ while num > 0
+ v = num % 10000
+
+ unit_idx += 1
+ if v > 0
+ if alt == false || unit_idx >= 1
+ str = ""
+ # Cannot use hash as they're unordered in 1.8
+ [[1000, '천'],
+ [100, '백'],
+ [10, '십']].each do | arr |
+ u, sub_unit = arr
+ str += (nconfig[:digits][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
+ v %= u
+ end
+ str += nconfig[:digits][v] if v > 0
+
+ raise RangeError, "number too large" unless nconfig[:units][unit_idx]
+ tokens << str.sub(/ $/, '') + nconfig[:units][unit_idx]
+ else
+ str = ""
+ tenfolds = nconfig[:alt_notation][:tenfolds]
+ digits = nconfig[:alt_notation][:digits]
+ alt_post_subs = nconfig[:alt_notation][:post_substitution]
+
+ # Likewise.
+ [[1000, '천'],
+ [100, '백']].each do |u, sub_unit|
+ str += (nconfig[:digits][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
+ v %= u
+ end
+
+ str += tenfolds[(v / 10) - 1] if v / 10 > 0
+ v %= 10
+ str += digits[v] if v > 0
+
+ alt_post_subs.each do |p, s|
+ str.gsub!(p, s)
+ end if alt
+ tokens << str.sub(/ $/, '') + nconfig[:units][unit_idx]
+ end
+ end
+ num /= 10000
+ end
+
+ tokens += sign unless sign.empty?
+ ret = tokens.reverse.join(' ') + below + next_char
+ nconfig[:post_substitution].each do |p, s|
+ ret.gsub!(p, s)
+ end
+ ret
+ end
+end#Gimchi
+
+require 'gimchi/patch_1.8'
+
+Gimchi.setup