require File.expand_path(File.dirname(__FILE__) + '/spec_helper') describe "StringSet" do describe "#new" do it "should accept a string and tokenize it" do s = StringSet.new "tokenize me" s.strings.should == %w[tokenize me] end it "should accept an array of tokens" do s = StringSet.new %w[tokenized list] s.strings.should == %w[tokenized list] end it "could accept an array of multi-word tokens" do s = StringSet.new ["foo bar", "bar"] end it "should know the max token length of the multiword tokenset" do s = StringSet.new ["foo bar", "bar"] s.max_token_size.should == 2 end it "should have the option to stem" do s = StringSet.new %w[tokenized list], :stem => true s.should be_stemming end end describe "#ngramize" do it "should make the correct ngrams" do s = StringSet.new s.ngramize(%w[a b c d], 3).should == ["a", "b", "c", "d", "a b", "b c", "c d", "a b c", "b c d"] end end describe "#substrings_in" do it "should return a list of common substrings" do s = StringSet.new "tokenize me" s.substrings_in("can you please tokenize me?").should == %w[tokenize me] end it "should handle multiword substrings" do s = StringSet.new ["tokenize me"] s.substrings_in("can you please tokenize me?").should == ["tokenize me"] end it "should handle multiword substrings with stemming" do s = StringSet.new ["tokenize me"], :stem => true s.substrings_in("can you please tokenize me?").should == ["token me"] end it "should account for stemming" do s = StringSet.new "token me", :stem => true s.substrings_in("can you please tokenize me?").should == %w[token me] end it "should be pretty fast" do needles = %[love thine soldiers bananas monkeys bachelors masters doctorate] hamlet = File.read(File.join(File.dirname(__FILE__), "hamlet.txt")) Benchmark.measure do s = StringSet.new(needles) s.substrings_in(hamlet) end.real.should < 0.1 end end end