lib/ting/conversions.rb in ting-0.3.0 vs lib/ting/conversions.rb in ting-0.9.0
- old
+ new
@@ -1,80 +1,88 @@
-require 'csv'
-require 'yaml'
-
-module Ting
- module Conversions
- All=[]
-
- DATA_DIR=File.dirname(__FILE__)+'/data/'
-
- #Load various representations for initials and finals
- %w(Initial Final).each do |c|
- klazz=Ting.const_get c
- begin
- CSV.open(DATA_DIR+c.downcase+'.csv', 'r').each do |name, *values|
- next if name == "name"
- All << name.to_s unless All.include?(name) || name =~ /standalone/i
- klazz.class_eval {attr_accessor name.to_sym}
- values.each_with_index do |v,i|
- klazz::All[i].send(name+'=', v)
- end
- end
- rescue
- puts "Bad data in #{c.downcase}.csv : " + $!
- raise
- end
-
- end
-
- #Substitution rules
- @@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))
-
- def self.parse(type, string)
- if (final = Final::All.find {|f| f.respond_to?("#{type}_standalone") && f.send("#{type}_standalone") == string})
- TonelessSyllable.new(Initial::Empty, final)
- else
- finals = Final::All.dup
- finals.unshift(finals.delete(Final::Uo)) #hack : move Uo to the front
- #otherwise wadegiles parses 'lo' as Le+O rather than Le+Uo
- #probably better to add a hardcoded 'overrule' table for these cases
- Initial::All.each do |ini|
- finals.each do |fin|
- next if TonelessSyllable.illegal?(ini,fin)
- return TonelessSyllable.new(ini,fin) if apply_rules(type, (ini.send(type)||'') + (fin.send(type)||'')) == string
- end
- end
- end
- end
-
- def self.unparse(type, tsyll)
- if tsyll.initial.send(type)
- apply_rules(type, tsyll.initial.send(type) + (tsyll.final.send(type) || ''))
- elsif tsyll.final.respond_to?(type.to_s+'_standalone') && standalone = tsyll.final.send(type.to_s+'_standalone')
- standalone
- else
- apply_rules(type, tsyll.final.send(type))
- end
- end
-
- def self.tokenize(str)
- returning [] do |ary|
- str,pos = str.dup, 0
- while s=str.slice!(/[^' ]*/) and s != ""
- ary << [s.strip, pos]
- pos+=s.length
- str.slice!(/[' ]/)
- end
- end
- end
-
- private
- def self.apply_rules(type, string)
- returning string.dup do |s|
- @@rules[type] && @@rules[type].each do |rule|
- s.gsub!(Regexp.new(rule['match']),rule['subst'])
- end
- end
- end
-
- end
-end
+# coding: utf-8
+
+require 'csv'
+require 'yaml'
+
+module Ting
+ module Conversions
+ All=[]
+
+ DATA_DIR=File.dirname(__FILE__)+'/data/'
+
+ #Load various representations for initials and finals
+ %w(Initial Final).each do |c|
+ klazz=Ting.const_get c
+ begin
+ CSV.open(DATA_DIR+c.downcase+'.csv', 'r:utf-8').each do |name, *values|
+ next if name == "name"
+ All << name.to_s unless All.include?(name) || name =~ /standalone/i
+ klazz.class_eval {attr_accessor name.to_sym}
+ values.each_with_index do |v,i|
+ klazz::All[i].send(name+'=', v && v.force_encoding('UTF-8'))
+ end
+ end
+ rescue
+ STDERR << "Bad data in #{c.downcase}.csv : #{$!}"
+ raise
+ end
+
+ end
+
+ #Substitution rules
+ @@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))
+
+ def self.parse(type, string)
+ capitalized = (string.downcase != string && string.downcase.capitalize == string)
+ string = string.to_s.downcase
+ if (final = Final::All.find {|f| f.respond_to?("#{type}_standalone") && f.send("#{type}_standalone") == string})
+ Syllable.new(Initial::Empty, final, nil, capitalized)
+ else
+ finals = Final::All.dup
+ finals.unshift(finals.delete(Final::Uo)) #hack : move Uo to the front
+ #otherwise wadegiles parses 'lo' as Le+O rather than Le+Uo
+ #probably better to add a hardcoded 'overrule' table for these cases
+ Initial.each do |ini|
+ finals.each do |fin|
+ next if Syllable.illegal?(ini,fin)
+ if string == apply_rules(type, (ini.send(type)||'') + (fin.send(type)||''))
+ return Syllable.new(ini, fin, nil, capitalized)
+ end
+ end
+ end
+ raise "Can't parse `#{string.inspect}'"
+ end
+ end
+
+ def self.unparse(type, tsyll)
+ str = if tsyll.initial.send(type)
+ apply_rules(type, tsyll.initial.send(type) + (tsyll.final.send(type) || ''))
+ elsif tsyll.final.respond_to?(type.to_s+'_standalone') && standalone = tsyll.final.send(type.to_s+'_standalone')
+ standalone
+ else
+ apply_rules(type, tsyll.final.send(type))
+ end
+ (tsyll.capitalized? ? str.capitalize : str).force_encoding('UTF-8')
+ end
+
+ def self.tokenize(str)
+ [].tap do |tokens|
+ str,pos = str.dup, 0
+ while str && token = str[/[^' ]*/]
+ tokens << [token.strip, pos]
+ pos += token.length
+ str = str[/[' ]+(.*)/, 1]
+ end
+ end
+ end
+
+ private
+ def self.apply_rules(type, string)
+ string.dup.tap do |s|
+ @@rules[type] && @@rules[type].each do |rule|
+ s.gsub!(Regexp.new(rule['match']), rule['subst'])
+ end
+ end
+ end
+
+ end
+end