# encoding: UTF-8

module Gimchi
# Korean text utilities driven by a YAML configuration: character
# classification, dissection into components, reading numbers aloud,
# pronunciation and romanization.
class Korean
  DEFAULT_CONFIG_FILE_PATH = File.dirname(__FILE__) + '/../../config/default.yml'

  # Returns the parsed YAML configuration used by this Korean instance.
  # @return [Hash]
  attr_reader :config

  # Initialize Gimchi::Korean.
  # @param [String] config_file You can override many parts of the implementation by customizing config file
  def initialize(config_file = DEFAULT_CONFIG_FILE_PATH)
    # Loaded lazily so that merely requiring this file does not pull in YAML.
    require 'yaml'
    @config = YAML.load(File.read(config_file))
    @config.freeze
    @pronouncer = Korean::Pronouncer.send :new, self
  end

  # Array of chosung's.
  #
  # @return [Array] Array of chosung strings
  def chosungs
    config['structure']['chosung']
  end

  # Array of jungsung's.
  # @return [Array] Array of jungsung strings
  def jungsungs
    config['structure']['jungsung']
  end

  # Array of jongsung's.
  # @return [Array] Array of jongsung strings
  def jongsungs
    config['structure']['jongsung']
  end

  # Checks if the given character is a korean character.
  # @param [String] ch A string of size 1
  # @return [Boolean]
  # @raise [ArgumentError] if the input is longer than one character
  def korean_char?(ch)
    raise ArgumentError, 'Lengthy input' if ch.length > 1

    complete_korean_char?(ch) ||
      (chosungs + jungsungs + jongsungs).include?(ch)
  end

  # Checks if the given character is a "complete" korean character.
  # "Complete" Korean character must have chosung and jungsung, with optional jongsung.
  # @param [String] ch A string of size 1
  # @return [Boolean]
  # @raise [ArgumentError] if the input is longer than one character
  def complete_korean_char?(ch)
    raise ArgumentError, 'Lengthy input' if ch.length > 1
    # Without this guard an empty string yields no code points and `all?`
    # would vacuously report it as a complete character.
    return false if ch.empty?

    # Range of Korean characters in Unicode 2.0: AC00(가) ~ D7A3(힣)
    ch.unpack('U').all? { |c| c >= 0xAC00 && c <= 0xD7A3 }
  end

  # Splits the given string into an array of Korean::Char's and Strings of length 1.
  # @param [String] str Input string.
  # @return [Array] Mixed array of Korean::Char instances and Strings of length 1 (for non-korean characters)
  def dissect(str)
    str.each_char.map { |c| korean_char?(c) ? Korean::Char.new(self, c) : c }
  end

  # Reads numeric expressions in Korean way.
  # @param [String, Number] str Numeric type or String containing numeric expressions
  # @return [String] Output string
  def read_number(str)
    # The third capture group is the single character (with any leading
    # whitespace) that follows the number; it may select the alternative
    # notation in #read_number_sub.
    str.to_s.gsub(/([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+)?(\s*.)?/) do
      read_number_sub(Regexp.last_match(0), Regexp.last_match(3))
    end
  end

  # Returns the pronunciation of the given string containing Korean characters.
  # Takes optional options hash.
  #
  # @param [String] str Input string
  # @param [Boolean] options[:pronounce_each_char] Each character of the string is pronounced respectively.
  # @param [Boolean] options[:slur] Strings separated by whitespaces are processed again as if they were contiguous.
  # @param [Boolean] options[:number] Numeric parts of the string are also pronounced in Korean.
  # @param [Array] options[:except] Allows you to skip certain transformations.
  # @param [Boolean] options[:debug] If true, the applied transformations are returned as well.
  # @return [String] Output string (or [String, transforms] pair when options[:debug] is set)
  def pronounce(str, options = {})
    options = {
      :pronounce_each_char => false,
      :slur => false,
      :number => true,
      :except => [],
      :debug => false
    }.merge options

    str = read_number(str) if options[:number]

    result, transforms = @pronouncer.send :pronounce!, str, options

    if options[:debug]
      return result, transforms
    else
      return result
    end
  end

  # Returns the romanization (alphabetical notation) of the given Korean string.
  # http://en.wikipedia.org/wiki/Korean_romanization
  # @param [String] str Input Korean string
  # @param [Boolean] options[:as_pronounced] If true, #pronounce is internally called before romanize
  # @param [Boolean] options[:number] Whether to read numeric expressions in the string
  # @param [Boolean] options[:slur] Same as :slur in #pronounce
  # @return [String] Output string in Roman Alphabet
  # @see Korean#pronounce
  def romanize(str, options = {})
    options = {
      :as_pronounced => true,
      :number => true,
      :slur => false
    }.merge options

    rdata = config['romanization']
    post_subs = rdata["post substitution"]
    # Indexed by component position: 0 = chosung, 1 = jungsung, 2 = jongsung.
    rdata = [rdata["chosung"], rdata["jungsung"], rdata["jongsung"]]

    str = pronounce str,
                    :pronounce_each_char => !options[:as_pronounced],
                    :number => options[:number],
                    :slur => options[:slur],
                    # Clause 1, Note 1: 'ㅢ' is written 'ui' even when it
                    # is pronounced as 'ㅣ'.
                    :except => %w[rule_5_3]
    dash = rdata[0]["ㅇ"]
    romanization = ""

    # Appends the romanization of a run of Korean characters to the shared
    # `romanization` buffer and returns the buffer with the configured
    # post substitutions applied.
    romanize_chunk = lambda do |chunk|
      dissect(chunk).each do |kc|
        kc.to_a.each_with_index do |comp, idx|
          next if comp.nil?
          comp = rdata[idx][comp] || comp
          # Drop the leading separator at the very beginning of the output
          # or right after whitespace.
          comp = comp[1..-1] if comp[0, 1] == dash &&
                                (romanization.empty? || romanization[-1, 1] =~ /\s/)
          romanization += comp
        end
      end

      return post_subs.keys.inject(romanization) { |output, pattern|
        output.gsub(pattern, post_subs[pattern])
      }
    end

    k_chunk = ""
    str.each_char do |c|
      if korean_char? c
        k_chunk += c
      else
        unless k_chunk.empty?
          romanization = romanize_chunk.call k_chunk
          k_chunk = ""
        end
        romanization += c
      end
    end
    romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
    romanization
  end

private
  # Reads a single number in Korean.
  #
  # @param [String, Numeric] num The number; a String may carry whitespace,
  #   thousands separators and trailing non-numeric characters, which are ignored.
  # @param [String, nil] next_char The character following the number in the
  #   source text (possibly preceded by whitespace). It is appended to the
  #   output and, when it is a configured suffix, selects the alternative notation.
  # @return [String] The number spelled out, followed by next_char
  # @raise [ArgumentError] if a String input does not start with a number
  def read_number_sub(num, next_char = nil)
    nconfig = config['number']
    digits = nconfig['digits']
    suffix = next_char.to_s.strip

    # To number
    if num.is_a? String
      num = num.gsub(/[\s,]/, '')
      # Anchored and requiring at least one digit; an unanchored, fully
      # optional pattern would accept any input.
      raise ArgumentError, 'Invalid number format' unless num =~ /\A[-+]?[0-9]*\.?[0-9]/
      num = num.to_f == num.to_i ? num.to_i : num.to_f
    end

    # Alternative notation for integers with proper suffix
    alt = false
    suffix_rules = nconfig['alt notation']['when suffix']
    if !num.is_a?(Float) && suffix_rules.key?(suffix)
      max = suffix_rules[suffix]['max']
      alt = max.nil? || num <= max
    end

    # Sign
    negative = num < 0
    num = -num if negative

    # Fractional part, read digit by digit after the decimal point
    if num.is_a? Float
      below = nconfig['decimal point']
      below = digits[0] + below if num < 1
      s = num.to_s
      # Expand scientific notation (e.g. "1.0e-05") into plain digits
      if md = s.match(/(.*)e(.*)/)
        s = md[1].tr '.', ''
        exp = md[2].to_i
        if exp > 0
          s = s.ljust(exp + 1, '0')
        else
          s = '0.' + '0' * (-exp - 1) + s
        end
      end
      s.sub(/.*\./, '').each_char do |char|
        below += digits[char.to_i]
      end
      num = num.floor.to_i
    else
      below = ""
    end

    # Integer part, read in groups of four digits from the lowest group up
    tokens = []
    unit_idx = -1
    while num > 0
      group = num % 10000
      num /= 10000
      unit_idx += 1
      # An empty group contributes neither digits nor its unit name.
      next if group.zero?

      # Only the lowest group is ever read in the alternative notation.
      words = if alt && unit_idx.zero?
                read_alt_group(group, suffix, nconfig)
              else
                read_sino_group(group, nconfig)
              end
      tokens << words + nconfig['units'][unit_idx]
    end

    # A plain integer zero produces no groups; read it as the zero digit.
    tokens << digits[0] if tokens.empty? && below.empty?
    tokens << nconfig['negative'] if negative

    # The trailing character comes after the whole number, fraction included.
    tokens.reverse.join(' ') + below + next_char.to_s
  end

  # Spells out the leading positions of a four-digit group.
  # Returns [words (each followed by a space), remainder].
  def read_sub_units(value, digits, sub_units)
    str = ""
    sub_units.each do |unit, name|
      count = value / unit
      # A count of one is implied by the unit name itself.
      str += (count == 1 ? '' : digits[count].to_s) + name + ' ' if count > 0
      value %= unit
    end
    [str, value]
  end

  # Reads a four-digit group (1..9999) in the default notation.
  def read_sino_group(value, nconfig)
    digits = nconfig['digits']
    # Cannot use hash as they're unordered in 1.8
    str, rest = read_sub_units(value, digits, [[1000, '천'], [100, '백'], [10, '십']])
    str += digits[rest] if rest > 0
    str.sub(/ $/, '')
  end

  # Reads a four-digit group (1..9999) in the alternative notation used
  # before the given suffix.
  def read_alt_group(value, suffix, nconfig)
    alt_config = nconfig['alt notation']
    str, rest = read_sub_units(value, nconfig['digits'], [[1000, '천'], [100, '백']])

    str += alt_config['tenfolds'][(rest / 10) - 1] if rest / 10 > 0
    rest %= 10
    str += alt_config['digits'][rest] if rest > 0

    # Post substitutions are defined on the number together with its suffix,
    # so attach the suffix, substitute, then detach it again.
    str += suffix
    alt_config['post substitution'].each do |pattern, replacement|
      str = str.gsub(pattern, replacement)
    end
    str = str.sub(/#{Regexp.escape(suffix)}$/, '')
    str.sub(/ $/, '')
  end
end#Korean
end#Gimchi