Sha256: dffc9560fb3b584f338bb476fa7706ee796f16e27d83bb6b9895c516efa661c6

Contents?: true

Size: 1.93 KB

Versions: 2

Compression:

Stored size: 1.93 KB

Contents

require File.join(File.dirname(__FILE__), "..", "spec_helper")

# Examples describing how Document normalises raw text, observed through
# the feature vector returned by #vector_of_features.
describe Document do

  # Feature vector for +text+; any extra arguments are forwarded unchanged
  # to Document.new.
  def features_of(text, *extra)
    Document.new(text, *extra).vector_of_features
  end

  # Expected vector containing exactly one feature, +name+, counted once.
  def lone_feature(name)
    [Feature.new(name, 1)]
  end

  it "should remove punctuation from words" do
    features_of("abc.").should == lone_feature("abc")
  end

  it "should remove numbers from words" do
    features_of("abc1").should == lone_feature("abc")
  end

  it "should remove symbols from words" do
    features_of("abc%").should == lone_feature("abc")
  end

  it "should lowercase text" do
    features_of("ABC").should == lone_feature("abc")
  end

  it "should stem words" do
    features_of("testing").should == lone_feature("test")
  end

  it "should count feature occurances" do
    # NOTE(review): the second constructor argument (:test) is presumably a
    # classification label -- confirm against Document#initialize.
    counted = features_of("test doc test", :test)
    counted.should == [Feature.new("doc", 1), Feature.new("test", 2)]
  end
end

# Examples describing how UriDocument splits a URI-like string into
# features, observed through #feature_vectors.
describe UriDocument do

  # Builds one Feature with count 1 per given token. Tokens may be passed
  # individually or nested in arrays; each is converted with #to_s.
  def single_features(*uris)
    uris.flatten.map { |token| Feature.new(token.to_s, 1) }
  end

  # Feature vector produced by UriDocument for +text+.
  def uri_features(text)
    UriDocument.new(text).feature_vectors
  end

  it "should extract URI token separators &, ?, \\, /, =, [, ], and . separately" do
    words      = [:a, :b, :c, :d, :e, :f, :g, :h, :i]
    separators = ['&', '?', "\\", '/', '=', '[', ']', '.']
    expected   = single_features(words, separators).sort

    # Both sides are sorted so the comparison does not depend on token order.
    uri_features('a&b?c\d/e=f[g]h.i').sort.should == expected
  end

  it "should extract two dots as a single feature instead of two dots" do
    uri_features('..').should == [Feature.new("..", 1)]
  end

  it "should extract two slashes as a single feature" do
    uri_features('//').should == [Feature.new('//', 1)]
    # Both literals below denote exactly two backslash characters.
    uri_features("\\\\").should == [Feature.new('\\\\', 1)]
  end

  it "should not stem words" do
    uri_features("testing").should == [Feature.new("testing", 1)]
  end

  it "should URI decode encoded strings" do
    uri_features("%23%25").should == [Feature.new("#%", 1)]
  end

end

Version data entries

2 entries across 2 versions & 2 rubygems

Version Path
danielsdeleo-basset-1.0.4 spec/unit/document_spec.rb
rjspotter-basset-1.0.5 spec/unit/document_spec.rb