Sha256: d15bf037725519f1ad6bb44b16dbd55845e2f93010d5df46296756c4d36bbe93
Contents?: true
Size: 1.73 KB
Versions: 1
Compression:
Stored size: 1.73 KB
Contents
# encoding: utf-8
require 'spec_helper'

# Specs for TinySegmenter: the #segment tokenizer and the VERSION constant.
describe TinySegmenter do
  subject { TinySegmenter.new }

  describe "#segment" do
    # The expected array pins the segmenter's actual output, which is only
    # "fairly" accurate: note the split of 分かち書き into "語分" / "かち" / "書き".
    it "tokenizes Japanese text fairly accurately" do
      subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should ==
        ["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
    end

    it "removes any whitespace-only or empty tokens" do
      subject.segment("書かれた 極めて コンパクト").should_not include("", " ", nil)
    end

    it "removes full-width space (U+3000) tokens" do
      # Spell the ideographic space as an escape: a literal U+3000 is visually
      # indistinguishable from an ASCII space and is easily lost when the file
      # is copied or re-encoded, which would silently weaken this example.
      full_width_space = "\u3000"
      sentence = "すてき!#{full_width_space}男性が歌う「夢やぶれて」もいいね。"

      # Sanity check on the fixture itself before asserting on the output.
      sentence.should include(full_width_space)
      subject.segment(sentence).should_not include(full_width_space)
    end

    it "tokenizes interspersed non-Japanese words correctly" do
      subject.segment("TinySegmenterはRubyだけで").should ==
        ["TinySegmenter", "は", "Ruby", "だけ", "で"]
    end

    context "with ignore_punctuation option not set" do
      it "includes punctuation-only tokens" do
        subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...")
               .should include("。", "!", "?", "、", "「", "」", "...")
      end
    end

    context "with ignore_punctuation option set" do
      it "removes all punctuation-only tokens" do
        subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true)
               .should_not include("。", "!", "?", "、", "「", "」", "...")
      end
    end
  end

  it "has a version" do
    TinySegmenter::VERSION.should be_kind_of(String)
  end
end
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path |
---|---|
tiny_segmenter-0.0.4 | spec/tiny_segmenter_spec.rb |