Rakefile in suika-0.2.0 vs Rakefile in suika-0.3.0

- old
+ new

require 'bundler/gem_tasks'
require 'rspec/core/rake_task'
require 'csv'
require 'dartsclone'
require 'digest'
require 'nkf'
require 'rubygems/package'
require 'zlib'

RSpec::Core::RakeTask.new(:spec)

task default: :spec

# Builds dict/sysdic.gz from an unpacked mecab-ipadic source tree.
#
# The archive is a gzip-compressed Marshal dump of a Hash with the keys
# :trie (double-array built from the sorted dictionary keys), :features
# (entries for each key, in the same order as the trie keys), :unknowns
# (entries read from unk.def) and :concosts (square matrix read from
# matrix.def). When the source tree is missing, download instructions are
# printed and the task ends without building anything.
desc 'Build suika system dictionary'
task :dictionary do
  base_dir = "#{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
  sysdic_path = "#{__dir__}/dict/sysdic.gz"

  unless File.directory?(base_dir)
    puts "Download mecab-ipadic file and expand that under dict directory: #{base_dir}"
    puts
    puts 'Example:'
    puts 'wget -O dict/mecab-ipadic.tgz https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
    puts 'cd dict'
    puts 'tar xzf mecab-ipadic.tgz'
    puts 'cd ../'
    next # leave the task body early
  end

  # Reads one comma-separated definition file and appends its rows to `table`,
  # keyed by the first column. Columns 2-4 are stored as integers and every
  # remaining column is kept as a string. Each line is converted to UTF-8
  # with NKF first.
  # NOTE(review): left_id/right_id/cost follow the MeCab dictionary column
  # order; confirm against the MeCab documentation.
  load_entries = lambda do |path, table|
    File.foreach(path) do |line|
      row = NKF.nkf('-w', line.chomp).split(',')
      # A blank line would otherwise register a nil key and break the key sort.
      next if row.empty?

      key, left_id, right_id, cost, *features = row
      (table[key] ||= []) << [left_id.to_i, right_id.to_i, cost.to_i, *features]
    end
    table
  end

  # Add an entry for the era name Reiwa as an extra CSV in the source tree so
  # that it is picked up by the glob below.
  File.open("#{base_dir}/Reiwa.csv", 'w') do |f|
    f.puts('令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ')
  end

  unknowns = load_entries.call("#{base_dir}/unk.def", {})

  # Sort the file list: the order of features for a key that occurs in several
  # files depends on it, and Dir.glob only sorts by default since Ruby 3.0.
  dict = {}
  Dir.glob("#{base_dir}/*.csv").sort.each { |filename| load_entries.call(filename, dict) }

  words = dict.keys.sort
  da = DartsClone::DoubleArray.new
  da.build(words)
  features = words.map { |w| dict[w] }

  concosts = nil
  File.open("#{base_dir}/matrix.def") do |f|
    # The first number on the header line is used as the size of both dimensions.
    n_entries = f.readline.chomp.split.map(&:to_i).first
    concosts = Array.new(n_entries) { Array.new(n_entries) }
    f.each_line do |line|
      row, col, cost = line.chomp.split.map(&:to_i)
      concosts[row][col] = cost
    end
  end

  ipadic = {
    trie: da.get_array,
    features: features,
    unknowns: unknowns,
    concosts: concosts
  }

  Zlib::GzipWriter.open(sysdic_path, Zlib::BEST_SPEED) { |f| f.write(Marshal.dump(ipadic)) }

  puts 'The system dictionary has been successfully built:'
  puts sysdic_path
  puts Digest::SHA1.file(sysdic_path).to_s
end