lib/tataki/converters/skk_jisyo.rb in tataki-0.0.3 vs lib/tataki/converters/skk_jisyo.rb in tataki-0.0.4

- old
+ new

@@ -1,125 +1,122 @@ # coding: utf-8 require "yaml" require "time" require "skk/jisyo" -require "trie" module Tataki module Converter class SkkJisyo < Base DEFAULT_CONFIG_PATH = "../../../../data/skk-jisyo.yml" DEFAULT_JISYO_SUFFIXES = %w[M] def initialize(jisyo_types = DEFAULT_JISYO_SUFFIXES) @jisyo_paths = jisyo_types.map{|suffix| Skk::Jisyo.path(suffix) } - @trie_cache_path = trie_cache_path(jisyo_types.join("_")) + @table_cache_path = table_cache_path(jisyo_types.join("_")) config_file = File.expand_path(DEFAULT_CONFIG_PATH, __FILE__) config_data = YAML.load_file(config_file) @roman_data = config_data["roman_table"] @ignore_kana = config_data["ignore_kana"] - @trie = setup_jisyo.freeze + tables = setup_jisyo + @match_table = tables[0].freeze + @okurigana_table = tables[1].freeze end def setup_jisyo - if File.exist?(@trie_cache_path) - trie = Marshal.load(File.read(@trie_cache_path)) + if File.exist?(@table_cache_path) + tables = Marshal.load(File.read(@table_cache_path)) else - trie = Trie.new + match_table = {} + okurigana_table = {} @jisyo_paths.each do |jisyo_path| - add_jisyo(trie, jisyo_path) + add_jisyo(match_table, okurigana_table, jisyo_path) end - File.binwrite(@trie_cache_path, Marshal.dump(trie)) - File.write("#{@trie_cache_path}.timestamp", Time.now.to_s) + tables = [match_table, okurigana_table] + File.binwrite(@table_cache_path, Marshal.dump(tables)) + File.write("#{@table_cache_path}.timestamp", Time.now.to_s) end - trie + tables end - def add_jisyo(trie, jisyo_path) + def add_jisyo(match_table, okurigana_table, jisyo_path) File.open(jisyo_path, "rb:euc-jp") do |jisyo_file| jisyo_file.each_line do |line| next if line.empty? || line[0] == ";" || line.include?("#") kana, kanji_part = line.encode("utf-8").split(" ") next unless kana && kanji_part kana.gsub!(/[^ぁ-んa-z]/, "") next if kana.empty? || !(kana =~ /^[ぁ-ん]+[a-z]?/) || @ignore_kana.include?(kana) kanji_part.gsub!(/^\/|;.+|\/$/, "") + + table = kana =~ /^(.+)([a-z])$/ ? okurigana_table : match_table kanji_part.split("/").each do |kanji| - trie.insert(kanji, kana) + kanji_prefix = kanji[0] + table_entry = table[kanji_prefix] + table[kanji_prefix] = table_entry = [] unless table_entry + table_entry.push($2 ? [kanji, $1, $2] : [kanji, kana]) + table_entry.sort_by!{|entry| - (entry[0].size) } end end end end def jisyo_path File.expand_path("../../../../data/jisyo", __FILE__) end - def trie_cache_path(name) - File.join(jisyo_path, "SKK-JISYO.#{name}.trie.cache") + def table_cache_path(name) + File.join(jisyo_path, "SKK-JISYO.#{name}.table.cache") end def jisyo_timestamp(path) Time.parse(File.read("#{path}.timestamp")) end def to_kana(sentence) - _to_kana(sentence, "", "", @trie) + _to_kana(sentence, "") end private - def _to_kana(sentence, kana, prefix, trie, through_alphabet = true) - return if trie.empty? + def _to_kana(sentence, kana) return kana if sentence.empty? - next_ch = sentence[0] - next_sentence = sentence[1..-1] - next_trie = trie.find_prefix(next_ch) - next_trie_values = next_trie.values - next_trie_values.reject!{|value| value =~ /[a-z]/ } - next_set = next_trie.find([]) - next_set_values = next_set.values - okurigana = find_okurigana(next_set_values, next_sentence) - next_set_values.reject!{|value| value =~ /[a-z]/ } - if okurigana - return _to_kana(next_sentence, kana + okurigana, "", @trie) - elsif next_set_values.size > 0 && next_set_values.size == next_trie_values.size - return _to_kana(next_sentence, kana + next_set_values.sample, "", @trie) + table_entry = find_okurigana_entry(sentence) || find_match_entry(sentence) + if table_entry + next_kanji = table_entry[0] + next_kana = table_entry[1] + next_sentence = sentence[next_kanji.size .. -1] + return _to_kana(next_sentence, kana + next_kana) end - if next_sentence.empty? - if next_set_values.size > 0 - return kana + next_set_values.sample - elsif through_alphabet - return kana + prefix + next_ch - end - end + return _to_kana(sentence[1 .. -1], kana + sentence[0]) + end - next_kana = _to_kana(next_sentence, kana, prefix + next_ch, next_trie, false) + def find_okurigana_entry(sentence) + entries = @okurigana_table[sentence[0]] + return unless entries - if next_kana - return next_kana + entries.each do |entry| + kanji, yomi, alphabet = *entry + next unless sentence.start_with?(kanji) + next_ch = sentence[kanji.size] + okurigana_candidates = @roman_data[alphabet] + next unless okurigana_candidates + okurigana_candidates.each do |okurigana| + return entry if okurigana == next_ch + end end - - if next_set_values.size > 0 - return _to_kana(next_sentence, kana + next_set_values.sample, "", @trie) - elsif through_alphabet - return _to_kana(next_sentence, kana + prefix + next_ch, "", @trie) - else - return nil - end + nil end - def find_okurigana(yomi_candidates, next_sentence) - yomi_candidates.each do |yomi| - next unless yomi =~ /.+([a-z])$/ - okurigana_yomi = @roman_data[$1] - next unless okurigana_yomi - okurigana_yomi.each do |okurigana| - return yomi.gsub(/[a-z]$/, "") if next_sentence.start_with?(okurigana) - end + def find_match_entry(sentence) + entries = @match_table[sentence[0]] + return unless entries + + entries.each do |entry| + kanji, yomi = *entry + return entry if sentence.start_with?(kanji) end nil end end end