lib/gimchi/korean.rb in gimchi-0.1.0 vs lib/gimchi/korean.rb in gimchi-0.1.1

- old
+ new

@@ -3,127 +3,119 @@ module Gimchi class Korean DEFAULT_CONFIG_FILE_PATH = File.dirname(__FILE__) + '/../../config/default.yml' + # Returns the YAML configuration used by this Korean instance. + # @return [String] attr_reader :config - attr_accessor :pronouncer # Initialize Gimchi::Korean. - # You can override many part of the implementation with customized config file. + # @param [String] config_file You can override many parts of the implementation by customizing config file def initialize config_file = DEFAULT_CONFIG_FILE_PATH require 'yaml' @config = YAML.load(File.read config_file) @config.freeze - @pronouncer = Korean::Pronouncer.new(self) + @pronouncer = Korean::Pronouncer.send :new, self end - # Array of chosung's + # Array of chosung's. + # + # @return [Array] Array of chosung strings def chosungs config['structure']['chosung'] end - # Array of jungsung's + # Array of jungsung's. + # @return [Array] Array of jungsung strings def jungsungs config['structure']['jungsung'] end - # Array of jongsung's + # Array of jongsung's. + # @return [Array] Array of jongsung strings def jongsungs config['structure']['jongsung'] end - # Checks if the given character is a korean character + # Checks if the given character is a korean character. + # @param [String] ch A string of size 1 def korean_char? ch raise ArgumentError.new('Lengthy input') if ch.length > 1 complete_korean_char?(ch) || (chosungs + jungsungs + jongsungs).include?(ch) end # Checks if the given character is a "complete" korean character. # "Complete" Korean character must have chosung and jungsung, with optional jongsung. + # @param [String] ch A string of size 1 def complete_korean_char? ch raise ArgumentError.new('Lengthy input') if ch.length > 1 # Range of Korean chracters in Unicode 2.0: AC00(가) ~ D7A3(힣) ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 } end - # Splits the given string into an array of Korean::Char's and strings. + # Splits the given string into an array of Korean::Char's and Strings of length 1. + # @param [String] str Input string. + # @return [Array] Mixed array of Korean::Char instances and Strings of length 1 (for non-korean characters) def dissect str str.each_char.map { |c| korean_char?(c) ? Korean::Char.new(self, c) : c } end - # Reads a string with numbers in Korean way. + # Reads numeric expressions in Korean way. + # @param [String, Number] str Numeric type or String containing numeric expressions + # @return [String] Output string def read_number str nconfig = config['number'] str.to_s.gsub(/([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+)?(\s*.)?/) { read_number_sub($&, $3) } end # Returns the pronunciation of the given string containing Korean characters. # Takes optional options hash. - # - If :pronounce_each_char is true, each character of the string is pronounced respectively. - # - If :slur is true, characters separated by whitespaces are treated as if they were contiguous. - # - If :number is true, numberic parts of the string is also pronounced in Korean. - # - :except array allows you to skip certain transformations. + # + # @param [String] Input string + # @param [Boolean] options[:pronounce_each_char] Each character of the string is pronounced respectively. + # @param [Boolean] options[:slur] Strings separated by whitespaces are processed again as if they were contiguous. + # @param [Boolean] options[:number] Numberic parts of the string is also pronounced in Korean. + # @param [Array] options[:except] Allows you to skip certain transformations. + # @return [String] Output string def pronounce str, options = {} options = { :pronounce_each_char => false, :slur => false, :number => true, :except => [], :debug => false }.merge options str = read_number(str) if options[:number] - chars = dissect str - transforms = [] - idx = -1 - while (idx += 1) < chars.length - c = chars[idx] + result, transforms = @pronouncer.send :pronounce!, str, options - next if c.is_a?(Korean::Char) == false - - next_c = chars[idx + 1] - next_kc = (options[:pronounce_each_char] == false && - next_c.is_a?(Korean::Char) && - next_c.complete?) ? next_c : nil - - transforms += @pronouncer.transform(c, next_kc, :except => options[:except]) - - # Slur (TBD) - if options[:slur] && options[:pronounce_each_char] == false && next_c =~ /\s/ - chars[(idx + 1)..-1].each_with_index do | nc, new_idx | - next if nc =~ /\s/ - - if nc.is_a?(Korean::Char) && nc.complete? - transforms += @pronouncer.transform(c, nc, :except => options[:except]) - end - - idx = idx + 1 + new_idx - 1 - break - end - end - end - if options[:debug] - return chars.join, transforms + return result, transforms else - chars.join + return result end end # Returns the romanization (alphabetical notation) of the given Korean string. # http://en.wikipedia.org/wiki/Korean_romanization + # @param [String] str Input Korean string + # @param [Boolean] options[:as_pronounced] If true, #pronounce is internally called before romanize + # @param [Boolean] options[:number] Whether to read numeric expressions in the string + # @param [Boolean] options[:slur] Same as :slur in #pronounce + # @return [String] Output string in Roman Alphabet + # @see Korean#pronounce def romanize str, options = {} options = { :as_pronounced => true, :number => true, :slur => false @@ -140,26 +132,40 @@ :slur => options[:slur], # 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다. :except => %w[rule_5_3] dash = rdata[0]["ㅇ"] romanization = "" - (chars = str.each_char.to_a).each_with_index do | kc, cidx | - if korean_char? kc - Korean::Char.new(self, kc).to_a.each_with_index do | comp, idx | + + romanize_chunk = lambda do | chunk | + dissect(chunk).each do | kc | + kc.to_a.each_with_index do | comp, idx | next if comp.nil? comp = rdata[idx][comp] || comp comp = comp[1..-1] if comp[0] == dash && (romanization.empty? || romanization[-1] =~ /\s/ || comp[1] == 'w') romanization += comp end - else - romanization += kc end + + return post_subs.keys.inject(romanization) { | output, pattern | + output.gsub(pattern, post_subs[pattern]) + } end - post_subs.keys.inject(romanization) { | output, pattern | - output.gsub(pattern, post_subs[pattern]) - }.capitalize + k_chunk = "" + str.each_char do | c | + if korean_char? c + k_chunk += c + else + unless k_chunk.empty? + romanization = romanize_chunk.call k_chunk + k_chunk = "" + end + romanization += c + end + end + romanization = romanize_chunk.call k_chunk unless k_chunk.empty? + romanization end private def read_number_sub num, next_char = nil nconfig = config['number']