Sha256: 81375b3ec913d320f01c5d8de8afe6e55e60c238843c835583981d8d7813ae6b

Contents?: true

Size: 813 Bytes

Versions: 1

Compression:

Stored size: 813 Bytes

Contents

#encoding: utf-8
require 'spec_helper'

# Behavioural examples for TinySegmenter#segment and the gem's VERSION constant.
describe TinySegmenter do
  # Object under test: a segmenter constructed with no arguments.
  subject do
    TinySegmenter.new
  end

  it "tokenizes Japanese text fairly accurately" do
    tokens = subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。")
    # Pinned output of the segmenter for this sentence; the example title only
    # promises "fairly accurate" segmentation, so these are not ideal word
    # boundaries.
    expected_tokens = ["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]

    tokens.should == expected_tokens
  end

  it "removes any whitespace-only or empty tokens" do
    # Input mixes a single space with a run of several spaces between words.
    tokens = subject.segment("書かれた 極めて    コンパクト")

    tokens.should_not include("", " ")
  end

  it "tokenizes interspersed non-Japanese words correctly" do
    tokens = subject.segment("TinySegmenterはRubyだけで")
    expected_tokens = ["TinySegmenter", "は", "Ruby", "だけ", "で"]

    tokens.should == expected_tokens
  end

  it "has a version" do
    version = TinySegmenter::VERSION

    version.should_not be_empty
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
tiny_segmenter-0.0.2 spec/tiny_segmenter_spec.rb