Rakefile in suika-0.2.0 vs Rakefile in suika-0.3.0
- old
+ new
@@ -1,6 +1,79 @@
-require "bundler/gem_tasks"
-require "rspec/core/rake_task"
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+require 'csv'
+require 'dartsclone'
+require 'nkf'
+require 'rubygems/package'
+require 'zlib'
+
RSpec::Core::RakeTask.new(:spec)
task :default => :spec
+
+desc 'Build suika system dictionary'
+task :dictionary do
+ base_dir = "#{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
+ unless File.directory?(base_dir)
+ puts "Download mecab-ipadic file and expand that under dict directory: #{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
+ puts
+ puts 'Example:'
+ puts 'wget -O dict/mecab-ipadic.tgz https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
+ puts 'cd dict'
+ puts 'tar xzf mecab-ipadic.tgz'
+ puts 'cd ../'
+ next # exit
+ end
+
+ File.open("#{__dir__}/dict/mecab-ipadic-2.7.0-20070801/Reiwa.csv", 'w') do |f|
+ f.puts('令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ')
+ end
+
+ unknowns = {}
+ File.open("#{base_dir}/unk.def") do |f|
+ f.each_line do |line|
+ row = NKF.nkf('-w', line.chomp).split(',')
+ unknowns[row[0]] ||= []
+ unknowns[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
+ end
+ end
+
+ dict = {}
+ Dir.glob("#{base_dir}/*.csv").each do |filename|
+ File.open(filename) do |f|
+ f.each_line do |line|
+ row = NKF.nkf('-w', line.chomp).split(',')
+ dict[row[0]] ||= []
+ dict[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
+ end
+ end
+ end
+
+ da = DartsClone::DoubleArray.new
+ words = dict.keys.sort
+ da.build(words)
+ features = words.map { |w| dict[w] }
+
+ concosts = nil
+ File.open("#{base_dir}/matrix.def") do |f|
+ n_entries = f.readline.chomp.split.map(&:to_i).first
+ concosts = Array.new(n_entries) { Array.new(n_entries) }
+ f.each_line do |line|
+ row, col, cost = line.chomp.split.map(&:to_i)
+ concosts[row][col] = cost
+ end
+ end
+
+ ipadic = {
+ trie: da.get_array,
+ features: features,
+ unknowns: unknowns,
+ concosts: concosts
+ }
+
+ Zlib::GzipWriter.open("#{__dir__}/dict/sysdic.gz", Zlib::BEST_SPEED) { |f| f.write(Marshal.dump(ipadic)) }
+
+ puts 'The system dictionary has been successfully built:'
+ puts "#{__dir__}/dict/sysdic.gz"
+ puts Digest::SHA1.file("#{__dir__}/dict/sysdic.gz").to_s
+end