Rakefile in sekka-1.2.1 vs Rakefile in sekka-1.2.2
- old
+ new
@@ -50,11 +50,11 @@
'emacs/*.el'].to_a
gemspec.executables = ["sekka-jisyo",
"sekka-server",
"sekka-benchmark",
"sekka-path"]
- gemspec.required_ruby_version = '>= 1.9.1'
+ gemspec.required_ruby_version = '>= 1.9.2'
gemspec.add_dependency( "eventmachine" )
gemspec.add_dependency( "memcache-client" )
gemspec.add_dependency( "nendo", "= 0.6.4" )
gemspec.add_dependency( "distributed-trie" )
gemspec.add_dependency( "rack" )
@@ -68,11 +68,11 @@
task :default => [:test] do
end
task :compile do
# generate version.rb
- dictVersion = "1.2.2"
+ dictVersion = "1.3.0"
vh = Jeweler::VersionHelper.new "."
open( "./lib/sekka/sekkaversion.rb", "w" ) {|f|
f.puts( "class SekkaVersion" )
f.printf( " def self.version() \"%s\" end\n", vh )
f.printf( " def self.dictVersion() \"%s\" end\n", dictVersion )
@@ -142,11 +142,11 @@
files << "./test/henkan-main.nnd tokyocabinet"
files << "./test/henkan-main.nnd pure"
end
files.each {|filename|
nendopath = `which nendo`.chomp
- sh sprintf( "time ruby -I ./lib %s %s", nendopath, filename )
+ sh sprintf( "ruby -I ./lib %s %s", nendopath, filename )
}
sh "cat test.record"
end
task :bench do
@@ -202,20 +202,29 @@
sh "time ./bin/sekka-jisyo dump ./data/SEKKA-JISYO.LARGE.#{x}.tch > ./data/SEKKA-JISYO.LARGE.#{x}.tsv"
}
end
-# Fetched data from
-# http://s-yata.jp/corpus/nwc2010/ngrams/
-task :phrase => [ "./data/6gm-0000.txt" ] do
- sh "time ruby -I ./lib /usr/local/bin/nendo ./data/hiragana_phrase_in_webcorpus.nnd ./data/6gm-0000.txt | sort | uniq > /tmp/tmp.txt"
- sh "time ruby -I ./lib /usr/local/bin/nendo ./data/writing_phrase_filter.nnd /tmp/tmp.txt | sort | uniq > ./data/SKK-JISYO.hiragana-phrase"
+task :phrase => [ "/tmp/jawiki.txt.gz", "./data/wikipedia/jawiki.hiragana.txt" ] do
+ sh "sort ./data/wikipedia/jawiki.hiragana.txt | uniq -c | sort > ./data/wikipedia/ranking.txt"
+ sh "ruby -I ./lib /usr/local/bin/nendo ./data/hiragana_phrase_in_wikipedia2.nnd ./data/wikipedia/ranking.txt > ./data/SKK-JISYO.hiragana-phrase"
end
-file "./data/6gm-0000.txt" do
- sh "wget http://dist.s-yata.jp/corpus/nwc2010/ngrams/word/over999/6gms/6gm-0000.xz -O /tmp/6gm-0000.xz"
- sh "xz -cd /tmp/6gm-0000.xz > ./data/6gm-0000.txt"
+file "./data/wikipedia/jawiki.hiragana.txt" do
+ sh "zcat /tmp/jawiki.txt.gz | mecab --input-buffer-size=65536 -O wakati --output=/tmp/jawiki.wakati.txt"
+ sh "ruby -I ./lib /usr/local/bin/nendo ./data/hiragana_phrase_in_wikipedia.nnd /tmp/jawiki.wakati.txt > ./data/wikipedia/jawiki.hiragana.txt"
+ sh "rm -f /tmp/jawiki.wakati.txt"
end
+
+file "/tmp/jawiki.txt.gz" do
+ sh "mkdir -p ./data/wikipedia/txt"
+ sh "wget http://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2 -O /tmp/jawiki-latest-pages-articles.xml.bz2"
+ sh "wp2txt --input-file /tmp/jawiki-latest-pages-articles.xml.bz2 --output-dir ./data/wikipedia/txt"
+ sh "cat ./data/wikipedia/txt/*.txt | gzip -c > /tmp/jawiki.txt.gz"
+ sh "rm -f ./data/wikipedia/txt/*.txt"
+ sh "rm -f /tmp/jawiki-latest-pages-articles.xml.bz2"
+end
+
task :phrase2 => [ "./data/ipadic.all.utf8.txt" ] do
sh "time ruby -I ./lib /usr/local/bin/nendo ./data/hiragana_phrase_in_ipadic.nnd ./data/ipadic.all.utf8.txt | sort | uniq > ./data/SKK-JISYO.hiragana-phrase2"
end