lib/ting/conversions.rb in ting-0.3.0 vs lib/ting/conversions.rb in ting-0.9.0

- old
+ new

# coding: utf-8

# lib/ting/conversions.rb as of ting-0.9.0 (the "+" side of the comparison),
# laid out one statement per line.

require 'csv'
require 'yaml'

module Ting
  # Lookup tables and helpers for converting a syllable to and from its
  # textual representation in a named transcription type.
  #
  # Loading this module has side effects: it reads data/initial.csv,
  # data/final.csv and data/rules.yaml (relative to this file), defines an
  # accessor on Ting::Initial / Ting::Final for every CSV row name, and fills
  # those accessors on the objects in Initial::All / Final::All.
  module Conversions
    # Row names seen in the CSV files, excluding the header row and any name
    # matching /standalone/i.
    All=[]

    DATA_DIR=File.dirname(__FILE__)+'/data/'

    #Load various representations for initials and finals
    %w(Initial Final).each do |c|
      klazz = Ting.const_get c
      begin
        # Block form of CSV.open so the file handle is closed as soon as the
        # rows have been consumed (the chained `.each` form never closed it).
        CSV.open(DATA_DIR+c.downcase+'.csv', 'r:utf-8') do |csv|
          csv.each do |name, *values|
            next if name == "name"
            All << name.to_s unless All.include?(name) || name =~ /standalone/i
            klazz.class_eval {attr_accessor name.to_sym}
            # The i-th value of a row belongs to the i-th entry of klazz::All.
            values.each_with_index do |v,i|
              klazz::All[i].send(name+'=', v && v.force_encoding('UTF-8'))
            end
          end
        end
      rescue
        STDERR << "Bad data in #{c.downcase}.csv : #{$!}"
        raise
      end
    end

    #Substitution rules
    # Indexed by type; each rule is read through its 'match' and 'subst' keys.
    @@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))

    # Find the syllable whose representation in +type+ equals +string+.
    #
    # type   - representation name; it is sent as a method to initials and
    #          finals and used as a key into the substitution rules.
    # string - text of a single syllable. Matching is done on the downcased
    #          text; the input counts as capitalized when it equals its own
    #          downcased-then-capitalized form and is not already lower case.
    #
    # Returns a Syllable built from the matching initial and final, with the
    # capitalized flag as fourth argument (the third argument is always nil
    # here).
    # Raises RuntimeError when no initial/final combination matches.
    def self.parse(type, string)
      # Coerce first so the case checks below never run on a non-String.
      string = string.to_s
      lowered = string.downcase
      capitalized = (lowered != string && lowered.capitalize == string)
      string = lowered

      if (final = Final::All.find {|f| f.respond_to?("#{type}_standalone") && f.send("#{type}_standalone") == string})
        Syllable.new(Initial::Empty, final, nil, capitalized)
      else
        finals = Final::All.dup
        finals.unshift(finals.delete(Final::Uo)) #hack : move Uo to the front
        #otherwise wadegiles parses 'lo' as Le+O rather than Le+Uo
        #probably better to add a hardcoded 'overrule' table for these cases
        Initial.each do |ini|
          finals.each do |fin|
            next if Syllable.illegal?(ini,fin)
            if string == apply_rules(type, (ini.send(type)||'') + (fin.send(type)||''))
              return Syllable.new(ini, fin, nil, capitalized)
            end
          end
        end
        raise "Can't parse `#{string.inspect}'"
      end
    end

    # Render +tsyll+ in representation +type+.
    #
    # When the initial has a representation for +type+ it is joined with the
    # final's (or '' if the final has none) and run through the substitution
    # rules. Otherwise the final's "<type>_standalone" value is returned as is
    # when present, and failing that the final's plain representation is run
    # through the rules.
    #
    # Returns the text, capitalized when tsyll.capitalized? is true, tagged as
    # UTF-8.
    def self.unparse(type, tsyll)
      str = if tsyll.initial.send(type)
              apply_rules(type, tsyll.initial.send(type) + (tsyll.final.send(type) || ''))
            elsif tsyll.final.respond_to?(type.to_s+'_standalone') && standalone = tsyll.final.send(type.to_s+'_standalone')
              standalone
            else
              apply_rules(type, tsyll.final.send(type))
            end
      (tsyll.capitalized? ? str.capitalize : str).force_encoding('UTF-8')
    end

    # Split +str+ on runs of apostrophes and spaces.
    #
    # Returns an Array of [token, pos] pairs. token has surrounding whitespace
    # stripped; pos is the summed length of the chunks that came before it.
    # NOTE(review): separators are not counted in pos, so it is not an index
    # into +str+ once a separator has been passed -- confirm that callers
    # expect this before changing it.
    #
    # Empty chunks (input that is empty, or that starts or ends with a
    # separator) are skipped rather than returned as "" tokens.
    def self.tokenize(str)
      tokens = []
      pos = 0
      rest = str.dup
      while rest
        chunk = rest[/[^' ]*/]
        token = chunk.strip
        tokens << [token, pos] unless token.empty?
        pos += chunk.length
        # /m lets the remainder span newlines; without it everything after
        # the first line break was silently dropped.
        rest = rest[/[' ]+(.*)/m, 1]
      end
      tokens
    end

    # Apply every substitution rule registered for +type+, in order, to a copy
    # of +string+ and return the copy. With no rules for +type+ the copy is
    # returned unchanged.
    #
    # This method is callable from outside the module: a bare `private` does
    # not affect methods defined with `def self.`, so the one that used to
    # precede this definition was dropped and visibility is unchanged.
    def self.apply_rules(type, string)
      string.dup.tap do |s|
        @@rules[type] && @@rules[type].each do |rule|
          s.gsub!(Regexp.new(rule['match']), rule['subst'])
        end
      end
    end

  end
end