lib/picky/internals/tokenizers/base.rb in picky-2.0.0 vs lib/picky/internals/tokenizers/base.rb in picky-2.1.0
- old
+ new
@@ -18,10 +18,11 @@
Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
+Case sensitive? #{@case_sensitive ? "Yes." : "-"}
TOKENIZER
end
# Stopwords.
#
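The added interpolation above extends the tokenizer's self-description heredoc. With @case_sensitive set, the rendered summary gains a line like this (illustrative output):

  Case sensitive? Yes.

With the flag unset, it prints '-', matching the other optional settings.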
@@ -123,10 +124,17 @@
end
def reject tokens
tokens.reject! &@reject_condition
end
+ def case_sensitive case_sensitive
+ @case_sensitive = case_sensitive
+ end
+ def downcase?
+ !@case_sensitive
+ end
+
# Checks if the right argument type has been given.
#
def check_argument_in method, type, argument, &condition
raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
end
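The writer/predicate pair added above carries the new feature: case_sensitive stores the flag, downcase? inverts it for callers. A minimal usage sketch, assuming a concrete subclass such as Internals::Tokenizers::Index forwards its options to the initializer shown in the next hunk:

  tokenizer = Internals::Tokenizers::Index.new case_sensitive: true
  tokenizer.downcase? # => false

  tokenizer = Internals::Tokenizers::Index.new
  tokenizer.downcase? # => true (downcasing remains the default)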
@@ -154,46 +162,64 @@
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
stopwords options[:stopwords] if options[:stopwords]
normalizes_words options[:normalizes_words] if options[:normalizes_words]
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
+ case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
# Defaults.
#
splits_text_on options[:splits_text_on] || /\s/
reject_token_if &(options[:reject_token_if] || :blank?)
end
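Note the .nil? guard on the new case_sensitive line: a plain truthiness check (if options[:case_sensitive]) would silently skip an explicit case_sensitive: false. A minimal sketch of the distinction, using a plain Ruby hash:

  { case_sensitive: false }[:case_sensitive].nil? # => false, so the writer is called
  {}[:case_sensitive].nil?                        # => true,  so the key was simply absent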
- # Hooks.
+ # Default preprocessing hook.
#
-
- # Preprocessing.
+ # Does:
+ # 1. Substitute characters.
+ # 2. Remove illegal expressions.
+ # 3. Remove non-single stopwords (stopwords that occur together with other words).
#
- def preprocess text; end
+ def preprocess text
+ text = substitute_characters text
+ remove_illegals text
+ # Single stopwords are not removed, though the indexer's reason
+ # differs entirely from the query tokenizer's: an indexed item
+ # whose name is just "UND" (a possible stopword) should not
+ # lose its name.
+ #
+ remove_non_single_stopwords text
+ text
+ end
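A sketch of the reworked hook, assuming the Index subclass forwards its options and a stopwords regexp is configured (the option name matches the initializer above; the regexp itself is illustrative):

  tokenizer = Internals::Tokenizers::Index.new stopwords: /\b(und|oder)\b/i
  tokenizer.preprocess "schön und gut" # "und" occurs alongside other words: removed
  tokenizer.preprocess "UND"           # => "UND", a lone stopword keeps the name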
# Pretokenizing.
#
- def pretokenize text; end
- # Postprocessing.
+ # Does:
+ # 1. Split the text into words.
+ # 2. Normalize each word.
#
+ def pretokenize text
+ words = split text
+ words.collect! do |word|
+ normalize_with_patterns word
+ word
+ end
+ end
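pretokenize likewise gains a concrete default body. With the default whitespace split (splits_text_on /\s/ from the initializer) and no normalization patterns, a sketch:

  tokenizer = Internals::Tokenizers::Index.new
  tokenizer.pretokenize "red balloon" # => ["red", "balloon"]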
+ # Basic postprocessing (overridden in both the query and index tokenizers).
+ #
def process tokens
reject tokens # Drop any tokens matching the reject condition (default: :blank?)
tokens
end
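process stays deliberately thin; the query and index tokenizers layer their own steps on top of it. With the default reject condition (:blank?, set in the initializer), reject drops blank tokens. A sketch, assuming ActiveSupport's blank? is loaded (symbolize below already relies on it):

  tokens = ["red", "", "balloon"]
  tokens.reject! &:blank?
  tokens # => ["red", "balloon"]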
- # Converts words into real tokens.
- #
- def tokens_for words
- Internals::Query::Tokens.new words.collect! { |word| token_for word }
- end
+ # # Converts words into real tokens.
+ # #
+ # def tokens_for words
+ # Internals::Query::Tokens.new words.collect! { |word| token_for word }
+ # end
# Turns non-blank text into a symbol, or nil if the text is blank.
#
def symbolize text
text.blank? ? nil : text.to_sym
- end
- # Returns a tokens object.
- #
- def empty_tokens
- Internals::Query::Tokens.new
end
end
end
\ No newline at end of file