Sha256: d15bf037725519f1ad6bb44b16dbd55845e2f93010d5df46296756c4d36bbe93

Contents?: true

Size: 1.73 KB

Versions: 1

Compression:

Stored size: 1.73 KB

Contents

#encoding: utf-8
require 'spec_helper'

# Specs for TinySegmenter#segment (tokenization, whitespace removal and the
# ignore_punctuation option) and for the TinySegmenter::VERSION constant.
describe TinySegmenter do
  # A fresh segmenter instance is used as the implicit subject of every example.
  subject { TinySegmenter.new }

  describe "#segment" do
    it "tokenizes Japanese text fairly accurately" do
      # The expected array pins the exact token boundaries asserted for this sentence.
      subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should == \
        ["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
    end

    it "removes any whitespace-only or empty tokens" do
      # Input mixes a single space and a run of spaces between words.
      subject.segment("書かれた 極めて    コンパクト").should_not include("", " ", nil)
    end

    it "removes full-width space (U+3000) tokens" do
      # Spelled as an escape so the ideographic space is visible in the source
      # and cannot be silently turned into an ASCII space by an editor or by
      # text normalization, which would leave U+3000 untested.
      full_width_space = "\u3000"
      sentence = "すてき!#{full_width_space}男性が歌う「夢やぶれて」もいいね。"
      # Sanity check: the fixture really contains the character under test.
      sentence.should include(full_width_space)
      subject.segment(sentence).should_not include(full_width_space)
    end

    it "tokenizes interspersed non-Japanese words correctly" do
      subject.segment("TinySegmenterはRubyだけで").should == ["TinySegmenter", "は", "Ruby", "だけ", "で"]
    end

    context "with ignore_punctuation option not set" do
      it "includes punctuation-only tokens" do
        subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...").should include("。", "!", "?", "、", "「", "」", "...")
      end
    end

    context "with ignore_punctuation option set" do
      it "removes all punctuation-only tokens" do
        # Same sentence as the "not set" context, so the two examples differ only in the option.
        subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true).should_not include("。", "!", "?", "、", "「", "」", "...")
      end
    end
  end

  it "has a version" do
    TinySegmenter::VERSION.should be_kind_of(String)
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
tiny_segmenter-0.0.4 spec/tiny_segmenter_spec.rb