# encoding: UTF-8

require 'yaml'

module Gimchi
  # Entry point for Korean text utilities: character classification,
  # reading numbers aloud, pronunciation and romanization. All
  # language-specific tables are read from a YAML configuration file.
  class Korean
    DEFAULT_CONFIG_FILE_PATH =
      File.dirname(__FILE__) + '/../../config/default.yml'

    # Returns the parsed (and frozen) YAML configuration used by this instance.
    # The methods below index it like a Hash keyed by section name.
    # @return [Hash]
    attr_reader :config

    # Initialize Gimchi::Korean.
    # @param [String] config_file Path to a YAML file. You can override many
    #   parts of the implementation by customizing the config file.
    def initialize config_file = DEFAULT_CONFIG_FILE_PATH
      # NOTE(review): YAML.load is kept as-is; the config file is assumed to be
      # trusted. Confirm before pointing this at user-supplied files.
      @config = YAML.load(File.read(config_file))
      @config.freeze

      # Pronouncer is defined elsewhere; it is instantiated through `send`,
      # presumably because its constructor is not public.
      @pronouncer = Korean::Pronouncer.send :new, self
    end

    # Array of chosung's.
    # @return [Array] Array of chosung strings
    def chosungs
      config['structure']['chosung']
    end

    # Array of jungsung's.
    # @return [Array] Array of jungsung strings
    def jungsungs
      config['structure']['jungsung']
    end

    # Array of jongsung's.
    # @return [Array] Array of jongsung strings
    def jongsungs
      config['structure']['jongsung']
    end

    # Checks if the given character is a korean character, i.e. either a
    # complete syllable or one of the configured chosung/jungsung/jongsung.
    # @param [String] ch A string of size 1
    # @return [Boolean]
    # @raise [ArgumentError] if the input is longer than one character
    def korean_char? ch
      raise ArgumentError, 'Lengthy input' if ch.length > 1

      complete_korean_char?(ch) ||
        (chosungs + jungsungs + jongsungs).include?(ch)
    end

    # Checks if the given character is a "complete" korean character.
    # "Complete" Korean character must have chosung and jungsung, with
    # optional jongsung.
    # @param [String] ch A string of size 1
    # @return [Boolean] false for the empty string
    # @raise [ArgumentError] if the input is longer than one character
    def complete_korean_char? ch
      raise ArgumentError, 'Lengthy input' if ch.length > 1

      # Range of Korean characters in Unicode 2.0: AC00(가) ~ D7A3(힣).
      # An empty string has no code point and must not count as Korean.
      code = ch.unpack('U').first
      !code.nil? && code >= 0xAC00 && code <= 0xD7A3
    end

    # Splits the given string into an array of Korean::Char's and Strings of
    # length 1.
    # @param [String] str Input string.
    # @return [Array] Mixed array of Korean::Char instances and Strings of
    #   length 1 (for non-korean characters)
    def dissect str
      str.each_char.map { |c| korean_char?(c) ? Korean::Char.new(self, c) : c }
    end

    # Reads numeric expressions in Korean way.
    # @param [String, Number] str Numeric type or String containing numeric
    #   expressions
    # @return [String] Output string
    def read_number str
      # $1: the whole numeric expression (optional sign, thousands separators,
      #     optional fraction and exponent)
      # $5: optional whitespace plus the single character that follows it,
      #     used to detect counting suffixes
      str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
        read_number_sub($1, $5)
      }
    end

    # Returns the pronunciation of the given string containing Korean
    # characters. Takes optional options hash.
    #
    # @param [String] str Input string
    # @param [Boolean] options[:pronounce_each_char] Each character of the
    #   string is pronounced respectively.
    # @param [Boolean] options[:slur] Strings separated by whitespaces are
    #   processed again as if they were contiguous.
    # @param [Boolean] options[:number] Numeric parts of the string are also
    #   pronounced in Korean.
    # @param [Array] options[:except] Allows you to skip certain
    #   transformations.
    # @param [Boolean] options[:debug] When true, the second value returned by
    #   the pronouncer is returned together with the result.
    # @return [String, Array] Output string, or a two-element array of
    #   [output, transforms] when :debug is set
    def pronounce str, options = {}
      options = {
        :pronounce_each_char => false,
        :slur => false,
        :number => true,
        :except => [],
        :debug => false
      }.merge options

      str = read_number(str) if options[:number]
      result, transforms = @pronouncer.send :pronounce!, str, options

      options[:debug] ? [result, transforms] : result
    end

    # Returns the romanization (alphabetical notation) of the given Korean
    # string.
    # http://en.wikipedia.org/wiki/Korean_romanization
    # @param [String] str Input Korean string
    # @param [Boolean] options[:as_pronounced] If true, #pronounce is
    #   internally called before romanize
    # @param [Boolean] options[:number] Whether to read numeric expressions in
    #   the string
    # @param [Boolean] options[:slur] Same as :slur in #pronounce
    # @return [String] Output string in Roman Alphabet
    # @see Korean#pronounce
    def romanize str, options = {}
      options = {
        :as_pronounced => true,
        :number => true,
        :slur => false
      }.merge options

      rdata = config['romanization']
      post_subs = rdata["post substitution"]
      # Indexed by component position: 0 chosung, 1 jungsung, 2 jongsung.
      rdata = [rdata["chosung"], rdata["jungsung"], rdata["jongsung"]]

      str = pronounce str,
        :pronounce_each_char => !options[:as_pronounced],
        :number => options[:number],
        :slur => options[:slur],
        # Clause 1, Addendum 1: 'ㅢ' is written as 'ui' even when it is
        # pronounced as 'ㅣ'.
        :except => %w[rule_5_3]

      # Romanization configured for the chosung 'ㅇ'; it is dropped at the
      # very beginning of the output and right after whitespace.
      dash = rdata[0]["ㅇ"]
      romanization = ""

      # Appends the romanization of a run of Korean characters to
      # `romanization` and returns the accumulated text with the post
      # substitutions applied.
      romanize_chunk = lambda do |chunk|
        dissect(chunk).each do |kc|
          kc.to_a.each_with_index do |comp, idx|
            next if comp.nil?
            comp = rdata[idx][comp] || comp
            at_word_start = romanization.empty? || romanization[-1, 1] =~ /\s/
            comp = comp[1..-1] if comp[0, 1] == dash && at_word_start
            romanization += comp
          end
        end

        post_subs.keys.inject(romanization) { |output, pattern|
          output.gsub(pattern, post_subs[pattern])
        }
      end

      k_chunk = ""
      str.each_char do |c|
        if korean_char? c
          k_chunk += c
        else
          # A non-Korean character ends the current run: flush it first.
          unless k_chunk.empty?
            romanization = romanize_chunk.call k_chunk
            k_chunk = ""
          end
          romanization += c
        end
      end
      romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
      romanization
    end

    private

    # Reads a single numeric expression matched by #read_number.
    # @param [String] num Numeric expression (may carry a sign, commas, a
    #   fraction and an exponent)
    # @param [String, nil] next_char Optional whitespace plus the character
    #   following the number; appended to the result unchanged
    # @return [String]
    def read_number_sub num, next_char
      nconfig = config['number']
      digits = nconfig['digits']

      num = num.gsub(',', '')
      next_char = next_char.to_s
      is_float = !num.match(/[\.e]/).nil?

      # Alternative notation for integers with proper suffix
      alt = false
      suffixes = nconfig['alt notation']['when suffix']
      if !is_float && suffixes.key?(next_char.strip)
        max = suffixes[next_char.strip]['max']
        alt = max.nil? || num.to_i <= max
      end

      # Sign
      sign = []
      if num =~ /^-/
        num = num.sub(/^-\s*/, '')
        sign << nconfig['negative']
      elsif num =~ /^\+/
        num = num.sub(/^\+\s*/, '')
        sign << nconfig['positive']
      end

      # Fractional part ("below" the decimal point)
      below = ""
      if is_float
        below = nconfig['decimal point']
        below = digits[0] + below if num.to_f < 1

        md = num.match(/(.*)e(.*)/)
        if md
          # Expand scientific notation by moving the decimal point.
          mantissa = md[1]
          dp = mantissa.index('.') || mantissa.length
          num = mantissa.tr '.', ''
          dp += md[2].to_i

          if dp >= num.length
            # The point lands at or beyond the last digit: the value is an
            # integer, so pad with zeros and drop the fractional part.
            num = num.ljust(dp, '0')
            num = num.sub(/^0+([1-9])/, "\\1")
            below = ""
          elsif dp < 0
            num = '0.' + '0' * (-dp) + num
          else
            num[dp] = '.' + num[dp]
          end
        end

        if num.include? '.'
          num.sub(/.*\./, '').each_char { |ch| below += digits[ch.to_i] }
        end
        num = num.sub(/\..*/, '')
      end

      # Integer part, read in groups of four digits starting from the lowest.
      tokens = []
      unit_idx = -1
      num = num.to_i
      while num > 0
        v = num % 10000
        unit_idx += 1

        if v > 0
          if !alt || unit_idx >= 1
            str, rest = read_sub_units(v, [[1000, '천'], [100, '백'], [10, '십']])
            str += digits[rest] if rest > 0
          else
            # Alternative notation only applies to the lowest group; tens and
            # ones come from the alternative tables.
            alt_config = nconfig['alt notation']
            str, rest = read_sub_units(v, [[1000, '천'], [100, '백']])
            str += alt_config['tenfolds'][(rest / 10) - 1] if rest / 10 > 0
            rest %= 10
            str += alt_config['digits'][rest] if rest > 0
            alt_config['post substitution'].each { |pattern, replacement|
              str.gsub!(pattern, replacement)
            }
          end
          tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
        end

        num /= 10000
      end

      tokens += sign unless sign.empty?
      ret = tokens.reverse.join(' ') + below + next_char
      nconfig['post substitution'].each { |pattern, replacement|
        ret.gsub!(pattern, replacement)
      }
      ret
    end

    # Reads the given sub-units (e.g. thousands, hundreds, tens) of a value
    # below 10000.
    # @param [Integer] value Value to read
    # @param [Array] sub_units Ordered [divisor, name] pairs (an Array is used
    #   because hashes are unordered in Ruby 1.8)
    # @return [Array] [text, remainder]; each unit in text is followed by a
    #   space
    def read_sub_units value, sub_units
      digits = config['number']['digits']
      text = ""
      sub_units.each do |unit, name|
        count = value / unit
        if count > 0
          # A leading "one" is omitted.
          text += digits[count].to_s unless count == 1
          text += name + ' '
        end
        value %= unit
      end
      [text, value]
    end
  end#Korean
end#Gimchi