Rakefile in sekka-1.2.1 vs Rakefile in sekka-1.2.2

- old
+ new

@@ -50,11 +50,11 @@ 'emacs/*.el'].to_a gemspec.executables = ["sekka-jisyo", "sekka-server", "sekka-benchmark", "sekka-path"] - gemspec.required_ruby_version = '>= 1.9.1' + gemspec.required_ruby_version = '>= 1.9.2' gemspec.add_dependency( "eventmachine" ) gemspec.add_dependency( "memcache-client" ) gemspec.add_dependency( "nendo", "= 0.6.4" ) gemspec.add_dependency( "distributed-trie" ) gemspec.add_dependency( "rack" ) @@ -68,11 +68,11 @@ task :default => [:test] do end task :compile do # generate version.rb - dictVersion = "1.2.2" + dictVersion = "1.3.0" vh = Jeweler::VersionHelper.new "." open( "./lib/sekka/sekkaversion.rb", "w" ) {|f| f.puts( "class SekkaVersion" ) f.printf( " def self.version() \"%s\" end\n", vh ) f.printf( " def self.dictVersion() \"%s\" end\n", dictVersion ) @@ -142,11 +142,11 @@ files << "./test/henkan-main.nnd tokyocabinet" files << "./test/henkan-main.nnd pure" end files.each {|filename| nendopath = `which nendo`.chomp - sh sprintf( "time ruby -I ./lib %s %s", nendopath, filename ) + sh sprintf( "ruby -I ./lib %s %s", nendopath, filename ) } sh "cat test.record" end task :bench do @@ -202,20 +202,29 @@ sh "time ./bin/sekka-jisyo dump ./data/SEKKA-JISYO.LARGE.#{x}.tch > ./data/SEKKA-JISYO.LARGE.#{x}.tsv" } end -# Fetched data from -# http://s-yata.jp/corpus/nwc2010/ngrams/ -task :phrase => [ "./data/6gm-0000.txt" ] do - sh "time ruby -I ./lib /usr/local/bin/nendo ./data/hiragana_phrase_in_webcorpus.nnd ./data/6gm-0000.txt | sort | uniq > /tmp/tmp.txt" - sh "time ruby -I ./lib /usr/local/bin/nendo ./data/writing_phrase_filter.nnd /tmp/tmp.txt | sort | uniq > ./data/SKK-JISYO.hiragana-phrase" +task :phrase => [ "/tmp/jawiki.txt.gz", "./data/wikipedia/jawiki.hiragana.txt" ] do + sh "sort ./data/wikipedia/jawiki.hiragana.txt | uniq -c | sort > ./data/wikipedia/ranking.txt" + sh "ruby -I ./lib /usr/local/bin/nendo ./data/hiragana_phrase_in_wikipedia2.nnd ./data/wikipedia/ranking.txt > ./data/SKK-JISYO.hiragana-phrase" end -file "./data/6gm-0000.txt" do - sh "wget http://dist.s-yata.jp/corpus/nwc2010/ngrams/word/over999/6gms/6gm-0000.xz -O /tmp/6gm-0000.xz" - sh "xz -cd /tmp/6gm-0000.xz > ./data/6gm-0000.txt" +file "./data/wikipedia/jawiki.hiragana.txt" do + sh "zcat /tmp/jawiki.txt.gz | mecab --input-buffer-size=65536 -O wakati --output=/tmp/jawiki.wakati.txt" + sh "ruby -I ./lib /usr/local/bin/nendo ./data/hiragana_phrase_in_wikipedia.nnd /tmp/jawiki.wakati.txt > ./data/wikipedia/jawiki.hiragana.txt" + sh "rm -f /tmp/jawiki.wakati.txt" end + +file "/tmp/jawiki.txt.gz" do + sh "mkdir -p ./data/wikipedia/txt" + sh "wget http://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2 -O /tmp/jawiki-latest-pages-articles.xml.bz2" + sh "wp2txt --input-file /tmp/jawiki-latest-pages-articles.xml.bz2 --output-dir ./data/wikipedia/txt" + sh "cat ./data/wikipedia/txt/*.txt | gzip -c > /tmp/jawiki.txt.gz" + sh "rm -f ./data/wikipedia/txt/*.txt" + sh "rm -f /tmp/jawiki-latest-pages-articles.xml.bz2" +end + task :phrase2 => [ "./data/ipadic.all.utf8.txt" ] do sh "time ruby -I ./lib /usr/local/bin/nendo ./data/hiragana_phrase_in_ipadic.nnd ./data/ipadic.all.utf8.txt | sort | uniq > ./data/SKK-JISYO.hiragana-phrase2" end