lib/picky/internals/tokenizers/base.rb in picky-1.5.2 vs lib/picky/internals/tokenizers/base.rb in picky-1.5.3
- old
+ new
@@ -1,20 +1,37 @@
module Internals
-
+
module Tokenizers # :nodoc:all
-
+
# Defines tokenizing processes used both in indexing and querying.
#
class Base
-
+
# TODO Move EMPTY_STRING top level.
#
EMPTY_STRING = ''.freeze
-
+
+ def to_s
+ reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
+ <<-TOKENIZER
+Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
+Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
+Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
+Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
+Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
+Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
+Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
+TOKENIZER
+ end
+
# Stopwords.
#
+ # We only allow regexps (even if a string would also
+ # work for gsub! - it is harder to understand).
+ #
def stopwords regexp
+ check_argument_in __method__, Regexp, regexp
@remove_stopwords_regexp = regexp
end
def remove_stopwords text
text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
text
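The new check_argument_in guard (defined further down in this diff) turns a
misconfigured tokenizer into an immediate ArgumentError instead of a later
gsub! type error. A minimal sketch of the effect, assuming the public API as
shown here (the failing call is deliberately wrong):

    tokenizer = Internals::Tokenizers::Base.new
    tokenizer.stopwords(/\b(and|the|of|it)\b/i)  # ok: a Regexp
    tokenizer.stopwords('and')                   # 1.5.3: raises ArgumentError,
    # "Application#stopwords takes a Regexp as argument, not a String."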
@@ -22,82 +39,101 @@
@@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
def remove_non_single_stopwords text
return text if text.match @@non_single_stopword_regexp
remove_stopwords text
end
-
+
# Illegals.
#
- # TODO Should there be a legal?
+ # We only allow regexps (even if a string would also
+ # work for gsub! - it is harder to understand).
#
def removes_characters regexp
+ check_argument_in __method__, Regexp, regexp
@removes_characters_regexp = regexp
end
def remove_illegals text
text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
text
end
-
+
# Splitting.
#
- def splits_text_on regexp
- @splits_text_on_regexp = regexp
+ # We allow Strings and Regexps.
+ # Note: We do not test against to_str since symbols do not work with String#split.
+ #
+ def splits_text_on regexp_or_string
+ raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
+ @splits_text_on = regexp_or_string
end
def split text
- text.split @splits_text_on_regexp
+ text.split @splits_text_on
end
-
+
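With the rename from @splits_text_on_regexp to @splits_text_on, the splitter
now explicitly accepts either a Regexp or a String (both are valid arguments
to String#split); anything else raises. A sketch, reusing the hypothetical
tokenizer instance from above:

    tokenizer.splits_text_on(/[\s,]+/)  # Regexp: ok
    tokenizer.splits_text_on(', ')      # String: ok
    tokenizer.splits_text_on(:comma)    # ArgumentError: "splits_text_on takes
                                        # a Regexp or String as argument, not a Symbol."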
# Normalizing.
#
+ # We only allow arrays.
+ #
def normalizes_words regexp_replaces
+ raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
@normalizes_words_regexp_replaces = regexp_replaces
end
def normalize_with_patterns text
return text unless @normalizes_words_regexp_replaces
-
+
@normalizes_words_regexp_replaces.each do |regex, replace|
# This should be sufficient
#
text.gsub!(regex, replace) and break
end
remove_after_normalizing_illegals text
text
end
-
+
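normalizes_words expects an Array of [regexp, replacement] pairs. Note that
String#gsub! returns nil when nothing was substituted, so the
"text.gsub!(regex, replace) and break" line applies at most the first pattern
that actually matches. A hypothetical configuration:

    tokenizer.normalizes_words([
      [/\$(\d+)/,    'dollar_\1'],  # "$50" => "dollar_50"
      [/(\w+)str\./, '\1strasse']   # only tried if the first pattern missed
    ])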
# Illegal after normalizing.
#
+ # We only allow regexps (even if a string would also
+ # work for gsub! - it is harder to understand).
+ #
def removes_characters_after_splitting regexp
+ check_argument_in __method__, Regexp, regexp
@removes_characters_after_splitting_regexp = regexp
end
def remove_after_normalizing_illegals text
text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
end
-
+
# Substitute Characters with this substituter.
#
# Default is European Character substitution.
#
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
- # TODO Raise if it doesn't quack substitute?
+ raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
@substituter = substituter
end
def substitute_characters text
- substituter?? substituter.substitute(text) : text
+ substituter?? substituter.substitute(text) : text
end
-
+
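substitutes_characters_with now duck-types: any object responding to
#substitute is accepted, not just the bundled
CharacterSubstituters::WestEuropean. A sketch with a hypothetical custom
substituter class:

    class Asciifier # hypothetical - anything responding to #substitute works
      def substitute text
        text.tr 'äöü', 'aou'
      end
    end
    tokenizer.substitutes_characters_with Asciifier.new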
# Reject tokens after tokenizing based on the given criteria.
#
# Note: Currently only for indexing. TODO Redesign and write for both!
#
def reject_token_if &condition
@reject_condition = condition
end
def reject tokens
tokens.reject! &@reject_condition
end
-
-
+
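reject_token_if stores the block and later hands it to Array#reject!. The
default wired up in initialize below is :blank?, converted to a proc by the
& operator. A hypothetical stricter condition:

    tokenizer.reject_token_if { |token| token.to_s.size < 2 }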
+ # Checks if the right argument type has been given.
+ #
+ def check_argument_in method, type, argument, &condition
+ raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
+ end
+
+
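check_argument_in relies on case equality (the &condition parameter is
accepted but unused here): for a class or module, type === argument is
equivalent to argument.is_a?(type), so:

    Regexp === /foo/  # => true
    Regexp === 'foo'  # => false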
# Returns a number of tokens, generated from the given text.
#
# Note:
# * preprocess, pretokenize are hooks
#
@@ -107,31 +143,31 @@
words = pretokenize text # splitting and preparations for tokenizing
return empty_tokens if words.empty?
tokens = tokens_for words # creating tokens / strings
process tokens # processing tokens / strings
end
-
+
attr_reader :substituter
alias substituter? substituter
-
+
def initialize options = {}
removes_characters options[:removes_characters] if options[:removes_characters]
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
stopwords options[:stopwords] if options[:stopwords]
normalizes_words options[:normalizes_words] if options[:normalizes_words]
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
-
+
# Defaults.
#
splits_text_on options[:splits_text_on] || /\s/
reject_token_if &(options[:reject_token_if] || :blank?)
end
-
+
# Hooks.
#
-
+
# Preprocessing.
#
def preprocess text; end
# Pretokenizing.
#
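Taken together, initialize lets all of the guarded setters above be driven
from one options hash, with /\s/ splitting and :blank? rejection as defaults.
A hypothetical full configuration:

    tokenizer = Internals::Tokenizers::Base.new(
      removes_characters: /[^a-z0-9\s]/i,
      stopwords:          /\b(and|the|of|it)\b/i,
      splits_text_on:     /[\s\/-]/,
      normalizes_words:   [[/(\w+)\./, '\1']],
      reject_token_if:    lambda { |token| token.to_s.size < 2 }
    )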
@@ -140,11 +176,11 @@
#
def process tokens
reject tokens # Reject any tokens that don't meet criteria
tokens
end
-
+
# Converts words into real tokens.
#
def tokens_for words
Internals::Query::Tokens.new words.collect! { |word| token_for word }
end
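Per the Note above, preprocess and pretokenize are hooks; the visible body of
tokenize runs pretokenize -> tokens_for -> process (which applies reject). A
hypothetical subclass customizing the pretokenize hook:

    class Downcasing < Internals::Tokenizers::Base # hypothetical subclass
      def pretokenize text
        split text.downcase  # reuse Base#split after lowercasing
      end
    end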
@@ -156,11 +192,11 @@
# Returns a tokens object.
#
def empty_tokens
Internals::Query::Tokens.new
end
-
+
end
-
+
end
-
+
end
\ No newline at end of file
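For reference, the new #to_s renders each setting on its own line. For a
tokenizer configured with the regexps sketched above but left with the default
:blank? rejection and no substituter, the output would look roughly like this
(illustrative, not captured from a real run; note that Regexp#source drops
flags, and the reject line appears only when the condition's Proc#to_s matches
the "(lambda)" pattern, with the app/application.rb file name hard-coded):

    Removes characters: /[^a-z0-9\s]/
    Stopwords: /\b(and|the|of|it)\b/
    Splits text on: /[\s\/-]/
    Removes chars after split: -
    Normalizes words: [[/(\w+)\./, "\\1"]]
    Rejects tokens? -
    Substitutes chars? -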