spec/groupie_spec.rb in groupie-0.1.1 vs spec/groupie_spec.rb in groupie-0.2.2
- old
+ new
@@ -43,11 +43,41 @@
c = g.classify('discreetly')
c[:spam].should > c[:ham]
c2 = g.classify('user')
c2[:ham].should > c2[:spam]
end
+
+ describe "strategies" do
+ describe "sum" do
+ it "should weigh words for the sum of their occurances" do
+ g = Groupie.new
+ g[:spam].add %w[word] * 9
+ g[:ham].add %w[word]
+ g.classify('word', :sum).should == {:spam=>0.9, :ham=>0.1}
+ end
+ end
+
+ describe "sqrt" do
+ it "should weigh words for the square root of the sum of ocurances" do
+ g = Groupie.new
+ g[:spam].add %w[word] * 9
+ g[:ham].add %w[word]
+ g.classify('word', :sqrt).should == {:spam=>0.75, :ham=>0.25}
+ end
+ end
+
+ describe "log" do
+ it "should weigh words for log10 of their sum of occurances" do
+ g = Groupie.new
+ g[:spam].add %w[word] * 1000
+ g[:ham].add %w[word] * 10
+ g.classify('word', :log).should == {:spam=>0.75, :ham=>0.25}
+ end
+ end
+ end
end
+
context "classify_text" do
it 'should tokenized html emails' do
g = Groupie.new
spam_tokens = File.read(File.join(File.dirname(__FILE__), %w[fixtures spam spam.la-44118014.txt])).tokenize
ham_tokens = File.read(File.join(File.dirname(__FILE__), %w[fixtures ham spam.la-44116217.txt])).tokenize
@@ -68,8 +98,33 @@
g[:ham].add %w[buy flowers for your mom]
result = g.classify_text "Grow flowers to sell on our website".tokenize
result[:spam].should > result[:ham]
result2 = g.classify_text "Grow flowers to give to your mom".tokenize
result2[:ham].should == result2[:spam]
+ end
+
+ it "should skip unknown tokens" do
+ g = Groupie.new
+ g[:spam].add %w[buy viagra now]
+ g[:ham].add %w[buy flowers now]
+ g.classify_text(%w[buy buckets now]).should == {:spam=>0.5, :ham=>0.5}
+ end
+
+ it "should support the sqrt strategy" do
+ g = Groupie.new
+ g[:spam].add %w[one] * 9
+ g[:ham].add %w[one]
+ g[:spam].add %w[two] * 9
+ g[:ham].add %w[two]
+ g.classify_text(%w[one two three], :sqrt).should == {:spam=>0.75, :ham=>0.25}
+ end
+
+ it "should support the log strategy" do
+ g = Groupie.new
+ g[:spam].add %w[one] * 100
+ g[:ham].add %w[one]
+ g[:spam].add %w[two]
+ g[:ham].add %w[two] * 100
+ g.classify_text(%w[one two three], :log).should == {:spam=>0.5, :ham=>0.5}
end
end
end