lib/gimchi/korean.rb in gimchi-0.1.6 vs lib/gimchi/korean.rb in gimchi-0.1.7
- old
+ new
@@ -1,287 +1,301 @@
# encoding: UTF-8
module Gimchi
class Korean
- DEFAULT_CONFIG_FILE_PATH =
- File.dirname(__FILE__) + '/../../config/default.yml'
+ DEFAULT_CONFIG_FILE_PATH =
+ File.dirname(__FILE__) + '/../../config/default.yml'
- # Returns the YAML configuration used by this Korean instance.
- # @return [String]
- attr_reader :config
+ # Returns the YAML configuration used by this Korean instance.
+ # @return [String]
+ attr_reader :config
- # Initialize Gimchi::Korean.
- # @param [String] config_file You can override many parts of the implementation by customizing config file
- def initialize config_file = DEFAULT_CONFIG_FILE_PATH
- require 'yaml'
- @config = YAML.load(File.read config_file)
- @config.freeze
+ # Initialize Gimchi::Korean.
+ # @param [String] config_file You can override many parts of the implementation by customizing config file
+ def initialize config_file = DEFAULT_CONFIG_FILE_PATH
+ require 'yaml'
+ @config = YAML.load(File.read config_file)
- @pronouncer = Korean::Pronouncer.send :new, self
- end
+ [
+ @config['romanization']['post substitution'],
+ @config['number']['post substitution'],
+ @config['number']['alt notation']['post substitution']
+ ].each do |r|
+ r.keys.each do |k|
+ r[Regexp.compile k] = r.delete k
+ end
+ end
+ @config.freeze
- # Array of chosung's.
- #
- # @return [Array] Array of chosung strings
- def chosungs
- config['structure']['chosung']
- end
+ @pronouncer = Korean::Pronouncer.send :new, self
+ end
- # Array of jungsung's.
- # @return [Array] Array of jungsung strings
- def jungsungs
- config['structure']['jungsung']
- end
+ # Array of chosung's.
+ #
+ # @return [Array] Array of chosung strings
+ def chosungs
+ config['structure']['chosung']
+ end
- # Array of jongsung's.
- # @return [Array] Array of jongsung strings
- def jongsungs
- config['structure']['jongsung']
- end
+ # Array of jungsung's.
+ # @return [Array] Array of jungsung strings
+ def jungsungs
+ config['structure']['jungsung']
+ end
- # Checks if the given character is a korean character.
- # @param [String] ch A string of size 1
- def korean_char? ch
- raise ArgumentError.new('Lengthy input') if ch.length > 1
+ # Array of jongsung's.
+ # @return [Array] Array of jongsung strings
+ def jongsungs
+ config['structure']['jongsung']
+ end
- complete_korean_char?(ch) ||
- (chosungs + jungsungs + jongsungs).include?(ch)
- end
+ # Checks if the given character is a korean character.
+ # @param [String] ch A string of size 1
+ def korean_char? ch
+ raise ArgumentError.new('Lengthy input') if ch.length > 1
- # Checks if the given character is a "complete" korean character.
- # "Complete" Korean character must have chosung and jungsung, with optional jongsung.
- # @param [String] ch A string of size 1
- def complete_korean_char? ch
- raise ArgumentError.new('Lengthy input') if ch.length > 1
+ complete_korean_char?(ch) ||
+ (chosungs + jungsungs + jongsungs).include?(ch)
+ end
- # Range of Korean chracters in Unicode 2.0: AC00(가) ~ D7A3(힣)
- ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 }
- end
+ # Checks if the given character is a "complete" korean character.
+ # "Complete" Korean character must have chosung and jungsung, with optional jongsung.
+ # @param [String] ch A string of size 1
+ def complete_korean_char? ch
+ raise ArgumentError.new('Lengthy input') if ch.length > 1
- # Splits the given string into an array of Korean::Char's and Strings of length 1.
- # @param [String] str Input string.
- # @return [Array] Mixed array of Korean::Char instances and Strings of length 1 (for non-korean characters)
- def dissect str
- str.each_char.map { |c|
- korean_char?(c) ? Korean::Char.new(self, c) : c
- }
- end
+ # Range of Korean chracters in Unicode 2.0: AC00(가) ~ D7A3(힣)
+ ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 }
+ end
- # Reads numeric expressions in Korean way.
- # @param [String, Number] str Numeric type or String containing numeric expressions
- # @return [String] Output string
- def read_number str
- nconfig = config['number']
-
- str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
- read_number_sub($1, $5)
- }
- end
+ # Splits the given string into an array of Korean::Char's and Strings of length 1.
+ # @param [String] str Input string.
+ # @return [Array] Mixed array of Korean::Char instances and Strings of length 1 (for non-korean characters)
+ def dissect str
+ str.each_char.map { |c|
+ korean_char?(c) ? Korean::Char.new(self, c) : c
+ }
+ end
- # Returns the pronunciation of the given string containing Korean characters.
- # Takes optional options hash.
- #
- # @param [String] Input string
- # @param [Boolean] options[:pronounce_each_char] Each character of the string is pronounced respectively.
- # @param [Boolean] options[:slur] Strings separated by whitespaces are processed again as if they were contiguous.
- # @param [Boolean] options[:number] Numberic parts of the string is also pronounced in Korean.
- # @param [Array] options[:except] Allows you to skip certain transformations.
- # @return [String] Output string
- def pronounce str, options = {}
- options = {
- :pronounce_each_char => false,
- :slur => false,
- :number => true,
- :except => [],
- :debug => false
- }.merge options
+ # Reads numeric expressions in Korean way.
+ # @param [String, Number] str Numeric type or String containing numeric expressions
+ # @return [String] Output string
+ def read_number str
+ nconfig = config['number']
- str = read_number(str) if options[:number]
+ str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
+ read_number_sub($1, $5)
+ }
+ end
- result, transforms = @pronouncer.send :pronounce!, str, options
+ # Returns the pronunciation of the given string containing Korean characters.
+ # Takes optional options hash.
+ #
+ # @param [String] Input string
+ # @param [Boolean] options[:pronounce_each_char] Each character of the string is pronounced respectively.
+ # @param [Boolean] options[:slur] Strings separated by whitespaces are processed again as if they were contiguous.
+ # @param [Boolean] options[:number] Numberic parts of the string is also pronounced in Korean.
+ # @param [Array] options[:except] Allows you to skip certain transformations.
+ # @return [String] Output string
+ def pronounce str, options = {}
+ options = {
+ :pronounce_each_char => false,
+ :slur => false,
+ :number => true,
+ :except => [],
+ :debug => false
+ }.merge options
- if options[:debug]
- return result, transforms
- else
- return result
- end
- end
+ str = read_number(str) if options[:number]
- # Returns the romanization (alphabetical notation) of the given Korean string.
- # http://en.wikipedia.org/wiki/Korean_romanization
- # @param [String] str Input Korean string
- # @param [Boolean] options[:as_pronounced] If true, #pronounce is internally called before romanize
- # @param [Boolean] options[:number] Whether to read numeric expressions in the string
- # @param [Boolean] options[:slur] Same as :slur in #pronounce
- # @return [String] Output string in Roman Alphabet
- # @see Korean#pronounce
- def romanize str, options = {}
- options = {
- :as_pronounced => true,
- :number => true,
- :slur => false
- }.merge options
+ result, transforms = @pronouncer.send :pronounce!, str, options
- require 'yaml'
- rdata = config['romanization']
- post_subs = rdata["post substitution"]
- rdata = [rdata["chosung"], rdata["jungsung"], rdata["jongsung"]]
+ if options[:debug]
+ return result, transforms
+ else
+ return result
+ end
+ end
- str = pronounce str,
- :pronounce_each_char => !options[:as_pronounced],
- :number => options[:number],
- :slur => options[:slur],
- # 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다.
- :except => %w[rule_5_3]
- dash = rdata[0]["ㅇ"]
- romanization = ""
+ # Returns the romanization (alphabetical notation) of the given Korean string.
+ # http://en.wikipedia.org/wiki/Korean_romanization
+ # @param [String] str Input Korean string
+ # @param [Boolean] options[:as_pronounced] If true, #pronounce is internally called before romanize
+ # @param [Boolean] options[:number] Whether to read numeric expressions in the string
+ # @param [Boolean] options[:slur] Same as :slur in #pronounce
+ # @return [String] Output string in Roman Alphabet
+ # @see Korean#pronounce
+ def romanize str, options = {}
+ options = {
+ :as_pronounced => true,
+ :number => true,
+ :slur => false
+ }.merge options
- romanize_chunk = lambda do | chunk |
- dissect(chunk).each do | kc |
- kc.to_a.each_with_index do | comp, idx |
- next if comp.nil?
- comp = rdata[idx][comp] || comp
- comp = comp[1..-1] if comp[0, 1] == dash &&
- (romanization.empty? || romanization[-1, 1] =~ /\s/)
- romanization += comp
- end
- end
+ require 'yaml'
+ rdata = config['romanization']
+ post_subs = rdata["post substitution"]
+ rdata = [rdata["chosung"], rdata["jungsung"], rdata["jongsung"]]
- return post_subs.keys.inject(romanization) { | output, pattern |
- output.gsub(pattern, post_subs[pattern])
- }
- end
+ str = pronounce str,
+ :pronounce_each_char => !options[:as_pronounced],
+ :number => options[:number],
+ :slur => options[:slur],
+ # 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다.
+ :except => %w[rule_5_3]
+ dash = rdata[0]["ㅇ"]
+ romanization = ""
- k_chunk = ""
- str.each_char do | c |
- if korean_char? c
- k_chunk += c
- else
- unless k_chunk.empty?
- romanization = romanize_chunk.call k_chunk
- k_chunk = ""
- end
- romanization += c
- end
- end
- romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
- romanization
- end
+ romanize_chunk = lambda do | chunk |
+ dissect(chunk).each do | kc |
+ kc.to_a.each_with_index do | comp, idx |
+ next if comp.nil?
+ comp = rdata[idx][comp] || comp
+ comp = comp[1..-1] if comp[0, 1] == dash &&
+ (romanization.empty? || romanization[-1, 1] =~ /\s/)
+ romanization += comp
+ end
+ end
+ return post_subs.keys.inject(romanization) { | output, pattern |
+ output.gsub(pattern, post_subs[pattern])
+ }
+ end
+
+ k_chunk = ""
+ str.each_char do | c |
+ if korean_char? c
+ k_chunk += c
+ else
+ unless k_chunk.empty?
+ romanization = romanize_chunk.call k_chunk
+ k_chunk = ""
+ end
+ romanization += c
+ end
+ end
+ romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
+ romanization
+ end
+
private
- def read_number_sub num, next_char
- nconfig = config['number']
+ def read_number_sub num, next_char
+ nconfig = config['number']
- num = num.gsub(',', '')
- next_char = next_char.to_s
- is_float = num.match(/[\.e]/) != nil
+ if num == '0'
+ return nconfig['digits'].first
+ end
- # Alternative notation for integers with proper suffix
- alt = false
- if is_float == false &&
- nconfig['alt notation']['when suffix'].keys.include?(next_char.strip)
- max = nconfig['alt notation']['when suffix'][next_char.strip]['max']
+ num = num.gsub(',', '')
+ next_char = next_char.to_s
+ is_float = num.match(/[\.e]/) != nil
- if max.nil? || num.to_i <= max
- alt = true
- end
- end
+ # Alternative notation for integers with proper suffix
+ alt = false
+ if is_float == false &&
+ nconfig['alt notation']['when suffix'].keys.include?(next_char.strip)
+ max = nconfig['alt notation']['when suffix'][next_char.strip]['max']
- # Sign
- sign = []
- negative = false
- if num =~ /^-/
- num = num.sub(/^-\s*/, '')
- sign << nconfig['negative']
- negative = true
- elsif num =~ /^\+/
- num = num.sub(/^\+\s*/, '')
- sign << nconfig['positive']
- end
+ if max.nil? || num.to_i <= max
+ alt = true
+ end
+ end
- if is_float
- below = nconfig['decimal point']
- below = nconfig['digits'][0] + below if num.to_f < 1
+ # Sign
+ sign = []
+ negative = false
+ if num =~ /^-/
+ num = num.sub(/^-\s*/, '')
+ sign << nconfig['negative']
+ negative = true
+ elsif num =~ /^\+/
+ num = num.sub(/^\+\s*/, '')
+ sign << nconfig['positive']
+ end
- if md = num.match(/(.*)e(.*)/)
- dp = md[1].index('.')
- num = md[1].tr '.', ''
- exp = md[2].to_i
+ if is_float
+ below = nconfig['decimal point']
+ below = nconfig['digits'][0] + below if num.to_f < 1
- dp += exp
- if dp > num.length
- num = num.ljust(dp, '0')
- num = num.sub(/^0+([1-9])/, "\\1")
+ if md = num.match(/(.*)e(.*)/)
+ dp = md[1].index('.')
+ num = md[1].tr '.', ''
+ exp = md[2].to_i
- below = ""
- elsif dp < 0
- num = '0.' + '0' * (-dp) + num
- else
- num[dp] = '.' + num[dp]
- end
- end
- num.sub(/.*\./, '').each_char do | char |
- below += nconfig['digits'][char.to_i]
- end if num.include? '.'
- num = num.sub(/\..*/, '')
- else
- below = ""
- end
+ dp += exp
+ if dp > num.length
+ num = num.ljust(dp, '0')
+ num = num.sub(/^0+([1-9])/, "\\1")
- tokens = []
- unit_idx = -1
- num = num.to_i
- while num > 0
- v = num % 10000
+ below = ""
+ elsif dp < 0
+ num = '0.' + '0' * (-dp) + num
+ else
+ num[dp, 1] = '.' + num[dp, 1]
+ end
+ end
+ num.sub(/.*\./, '').each_char do | char |
+ below += nconfig['digits'][char.to_i]
+ end if num.include? '.'
+ num = num.sub(/\..*/, '')
+ else
+ below = ""
+ end
- unit_idx += 1
- if v > 0
- if alt == false || unit_idx >= 1
- str = ""
- # Cannot use hash as they're unordered in 1.8
- [[1000, '천'],
- [100, '백'],
- [10, '십']].each do | arr |
- u, sub_unit = arr
- str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
- v %= u
- end
- str += nconfig['digits'][v] if v > 0
+ tokens = []
+ unit_idx = -1
+ num = num.to_i
+ while num > 0
+ v = num % 10000
- tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
- else
- str = ""
- tenfolds = nconfig['alt notation']['tenfolds']
- digits = nconfig['alt notation']['digits']
- alt_post_subs = nconfig['alt notation']['post substitution']
+ unit_idx += 1
+ if v > 0
+ if alt == false || unit_idx >= 1
+ str = ""
+ # Cannot use hash as they're unordered in 1.8
+ [[1000, '천'],
+ [100, '백'],
+ [10, '십']].each do | arr |
+ u, sub_unit = arr
+ str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
+ v %= u
+ end
+ str += nconfig['digits'][v] if v > 0
- # Likewise.
- [[1000, '천'],
- [100, '백']].each do | u, sub_unit |
- str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
- v %= u
- end
+ tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
+ else
+ str = ""
+ tenfolds = nconfig['alt notation']['tenfolds']
+ digits = nconfig['alt notation']['digits']
+ alt_post_subs = nconfig['alt notation']['post substitution']
- str += tenfolds[(v / 10) - 1] if v / 10 > 0
- v %= 10
- str += digits[v] if v > 0
+ # Likewise.
+ [[1000, '천'],
+ [100, '백']].each do | u, sub_unit |
+ str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
+ v %= u
+ end
- alt_post_subs.each do | k, v |
- str.gsub!(k, v)
- end if alt
- tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
- end
- end
- num /= 10000
- end
+ str += tenfolds[(v / 10) - 1] if v / 10 > 0
+ v %= 10
+ str += digits[v] if v > 0
- tokens += sign unless sign.empty?
- ret = tokens.reverse.join(' ') + below + next_char
- nconfig['post substitution'].each do | k, v |
- ret.gsub!(k, v)
- end
- ret
- end
+ alt_post_subs.each do | k, v |
+ str.gsub!(k, v)
+ end if alt
+ tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
+ end
+ end
+ num /= 10000
+ end
+
+ tokens += sign unless sign.empty?
+ ret = tokens.reverse.join(' ') + below + next_char
+ nconfig['post substitution'].each do | k, v |
+ ret.gsub!(k, v)
+ end
+ ret
+ end
end#Korean
end#Gimchi