base_spec.rb in picky-1.2.4

- old
+ new
@@ -1,169 +1,195 @@
 # encoding: utf-8
 #
 require 'spec_helper'
 
 describe Tokenizers::Base do
-
-  before(:each) do
-    @tokenizer = Tokenizers::Base.new
-  end
   
-  describe "substitute(s)_characters*" do
-    it "doesn't substitute if there is no substituter" do
-      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
+  context 'with special instance' do
+    before(:each) do
+      @tokenizer = Tokenizers::Base.new reject_token_if: lambda { |token| token.to_s.length < 2 || token == :hello }
     end
-    it "uses the substituter to replace characters" do
-      @tokenizer.substitutes_characters_with CharacterSubstituters::WestEuropean.new
-      
-      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+    it 'rejects tokens with length < 2' do
+      @tokenizer.reject([:'', :a, :ab, :abc]).should == [:ab, :abc]
     end
-    it "uses the european substituter as default" do
-      @tokenizer.substitutes_characters_with
-      
-      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+    it 'rejects tokens that are called :hello' do
+      @tokenizer.reject([:hel, :hell, :hello]).should == [:hel, :hell]
     end
   end
   
-  describe "removes_characters_after_splitting" do
-    context "without removes_characters_after_splitting called" do
-      it "has remove_after_normalizing_illegals" do
-        lambda { @tokenizer.remove_after_normalizing_illegals('any') }.should_not raise_error
-      end
-      it 'should define a remove_after_normalizing_illegals normalize_with_patterns does nothing' do
-        unchanging = stub :unchanging
-        @tokenizer.remove_after_normalizing_illegals unchanging
-      end
+  context 'with normal instance' do
+    before(:each) do
+      @tokenizer = Tokenizers::Base.new
     end
-    context "with removes_characters_after_splitting called" do
-      before(:each) do
-        @tokenizer.removes_characters_after_splitting(/[afo]/)
+
+    describe 'reject_token_if' do
+      it 'rejects empty tokens by default' do
+        @tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
       end
-      it "has remove_after_normalizing_illegals" do
-        lambda { @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop') }.should_not raise_error
+      it 'rejects tokens based on the given rejection criteria if set' do
+        @tokenizer.reject_token_if &:nil?
+
+        @tokenizer.reject(['a', nil, '', 'b']).should == ['a', '', 'b']
       end
-      it "removes illegal characters" do
-        @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
-      end
     end
-  end
-  
-  describe "normalizes_words" do
-    context "without normalizes_words called" do
-      it "has normalize_with_patterns" do
-        lambda { @tokenizer.normalize_with_patterns('any') }.should_not raise_error
+
+    describe "substitute(s)_characters*" do
+      it "doesn't substitute if there is no substituter" do
+        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
       end
-      it 'should define a method normalize_with_patterns does nothing' do
-        unchanging = stub :unchanging
-        @tokenizer.normalize_with_patterns(unchanging).should == unchanging
+      it "uses the substituter to replace characters" do
+        @tokenizer.substitutes_characters_with CharacterSubstituters::WestEuropean.new
+
+        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
       end
-    end
-    context "with normalizes_words called" do
-      before(:each) do
-        @tokenizer.normalizes_words([
-          [/st\./, 'sankt'],
-          [/stras?s?e?/, 'str']
-        ])
+      it "uses the european substituter as default" do
+        @tokenizer.substitutes_characters_with
+
+        @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
       end
-      it "has normalize_with_patterns" do
-        lambda { @tokenizer.normalize_with_patterns('a b/c.d') }.should_not raise_error
-      end
-      it "normalizes, but just the first one" do
-        @tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
-      end
     end
-  end
-  
-  describe "splits_text_on" do
-    context "without splits_text_on called" do
-      it "has split" do
-        lambda { @tokenizer.split('any') }.should_not raise_error
+
+    describe "removes_characters_after_splitting" do
+      context "without removes_characters_after_splitting called" do
+        it "has remove_after_normalizing_illegals" do
+          lambda { @tokenizer.remove_after_normalizing_illegals('any') }.should_not raise_error
+        end
+        it 'should define a remove_after_normalizing_illegals normalize_with_patterns does nothing' do
+          unchanging = stub :unchanging
+          @tokenizer.remove_after_normalizing_illegals unchanging
+        end
       end
-      it 'should define a method split that splits by default on \s' do
-        @tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
+      context "with removes_characters_after_splitting called" do
+        before(:each) do
+          @tokenizer.removes_characters_after_splitting(/[afo]/)
+        end
+        it "has remove_after_normalizing_illegals" do
+          lambda { @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop') }.should_not raise_error
+        end
+        it "removes illegal characters" do
+          @tokenizer.remove_after_normalizing_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
+        end
       end
-      it 'splits text on /\s/ by default' do
-        @tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
-      end
     end
-    context "with removes_characters called" do
-      before(:each) do
-        @tokenizer.splits_text_on(/[\s\.\/]/)
+
+    describe "normalizes_words" do
+      context "without normalizes_words called" do
+        it "has normalize_with_patterns" do
+          lambda { @tokenizer.normalize_with_patterns('any') }.should_not raise_error
+        end
+        it 'should define a method normalize_with_patterns does nothing' do
+          unchanging = stub :unchanging
+          @tokenizer.normalize_with_patterns(unchanging).should == unchanging
+        end
       end
-      it "has split" do
-        lambda { @tokenizer.split('a b/c.d') }.should_not raise_error
+      context "with normalizes_words called" do
+        before(:each) do
+          @tokenizer.normalizes_words([
+            [/st\./, 'sankt'],
+            [/stras?s?e?/, 'str']
+          ])
+        end
+        it "has normalize_with_patterns" do
+          lambda { @tokenizer.normalize_with_patterns('a b/c.d') }.should_not raise_error
+        end
+        it "normalizes, but just the first one" do
+          @tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
+        end
       end
-      it "removes illegal characters" do
-        @tokenizer.split('a b/c.d').should == ['a','b','c','d']
-      end
     end
-  end
-  
-  describe "removes_characters" do
-    context "without removes_characters called" do
-      it "has remove_illegals" do
-        lambda { @tokenizer.remove_illegals('any') }.should_not raise_error
+
+    describe "splits_text_on" do
+      context "without splits_text_on called" do
+        it "has split" do
+          lambda { @tokenizer.split('any') }.should_not raise_error
+        end
+        it 'should define a method split that splits by default on \s' do
+          @tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
+        end
+        it 'splits text on /\s/ by default' do
+          @tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
+        end
       end
-      it 'should define a method remove_illegals that does nothing' do
-        unchanging = stub :unchanging
-        @tokenizer.remove_illegals unchanging
+      context "with removes_characters called" do
+        before(:each) do
+          @tokenizer.splits_text_on(/[\s\.\/]/)
+        end
+        it "has split" do
+          lambda { @tokenizer.split('a b/c.d') }.should_not raise_error
+        end
+        it "removes illegal characters" do
+          @tokenizer.split('a b/c.d').should == ['a','b','c','d']
+        end
       end
     end
-    context "with removes_characters called" do
-      before(:each) do
-        @tokenizer.removes_characters(/[afo]/)
+
+    describe "removes_characters" do
+      context "without removes_characters called" do
+        it "has remove_illegals" do
+          lambda { @tokenizer.remove_illegals('any') }.should_not raise_error
+        end
+        it 'should define a method remove_illegals that does nothing' do
+          unchanging = stub :unchanging
+          @tokenizer.remove_illegals unchanging
+        end
       end
-      it "has remove_illegals" do
-        lambda { @tokenizer.remove_illegals('abcdefghijklmnop') }.should_not raise_error
+      context "with removes_characters called" do
+        before(:each) do
+          @tokenizer.removes_characters(/[afo]/)
+        end
+        it "has remove_illegals" do
+          lambda { @tokenizer.remove_illegals('abcdefghijklmnop') }.should_not raise_error
+        end
+        it "removes illegal characters" do
+          @tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
+        end
       end
-      it "removes illegal characters" do
-        @tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
-      end
     end
-  end
-  
-  describe 'stopwords' do
-    context 'without stopwords given' do
-      it 'should define a method remove_stopwords' do
-        lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
+
+    describe 'stopwords' do
+      context 'without stopwords given' do
+        it 'should define a method remove_stopwords' do
+          lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
+        end
+        it 'should define a method remove_stopwords that does nothing' do
+          @tokenizer.remove_stopwords('from this text').should == 'from this text'
+        end
+        it 'should define a method remove_non_single_stopwords' do
+          lambda { @tokenizer.remove_non_single_stopwords('from this text') }.should_not raise_error
+
+        end
       end
-      it 'should define a method remove_stopwords that does nothing' do
-        @tokenizer.remove_stopwords('from this text').should == 'from this text'
+      context 'with stopwords given' do
+        before(:each) do
+          @tokenizer.stopwords(/r|e/)
+        end
+        it 'should define a method remove_stopwords' do
+          lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
+        end
+        it 'should define a method stopwords that removes stopwords' do
+          @tokenizer.remove_stopwords('from this text').should == 'fom this txt'
+        end
+        it 'should define a method remove_non_single_stopwords' do
+          lambda { @tokenizer.remove_non_single_stopwords('from this text') }.should_not raise_error
+        end
+        it 'should define a method remove_non_single_stopwords that removes non-single stopwords' do
+          @tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
+        end
+        it 'should define a method remove_non_single_stopwords that does not single stopwords' do
+          @tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
+        end
       end
-      it 'should define a method remove_non_single_stopwords' do
-        lambda { @tokenizer.remove_non_single_stopwords('from this text') }.should_not raise_error
-        
+      context 'error case' do
+        before(:each) do
+          @tokenizer.stopwords(/any/)
+        end
+        it 'should not remove non-single stopwords with a star' do
+          @tokenizer.remove_non_single_stopwords('a*').should == 'a*'
+        end
+        it 'should not remove non-single stopwords with a tilde' do
+          @tokenizer.remove_non_single_stopwords('a~').should == 'a~'
+        end
       end
     end
-    context 'with stopwords given' do
-      before(:each) do
-        @tokenizer.stopwords(/r|e/)
-      end
-      it 'should define a method remove_stopwords' do
-        lambda { @tokenizer.remove_stopwords('from this text') }.should_not raise_error
-      end
-      it 'should define a method stopwords that removes stopwords' do
-        @tokenizer.remove_stopwords('from this text').should == 'fom this txt'
-      end
-      it 'should define a method remove_non_single_stopwords' do
-        lambda { @tokenizer.remove_non_single_stopwords('from this text') }.should_not raise_error
-      end
-      it 'should define a method remove_non_single_stopwords that removes non-single stopwords' do
-        @tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
-      end
-      it 'should define a method remove_non_single_stopwords that does not single stopwords' do
-        @tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
-      end
-    end
-    context 'error case' do
-      before(:each) do
-        @tokenizer.stopwords(/any/)
-      end
-      it 'should not remove non-single stopwords with a star' do
-        @tokenizer.remove_non_single_stopwords('a*').should == 'a*'
-      end
-      it 'should not remove non-single stopwords with a tilde' do
-        @tokenizer.remove_non_single_stopwords('a~').should == 'a~'
-      end
-    end
   end
+  
 end
\ No newline at end of file