module Internals

  module Tokenizers # :nodoc:all

    # Defines tokenizing processes used both in indexing and querying.
    #
    class Base

      # TODO Move EMPTY_STRING to the top level.
      #
      EMPTY_STRING = ''.freeze

      # Stopwords.
      #
      def stopwords regexp
        @remove_stopwords_regexp = regexp
      end
      def remove_stopwords text
        text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
        text
      end
      @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
      def remove_non_single_stopwords text
        return text if text.match @@non_single_stopword_regexp
        remove_stopwords text
      end

      # Illegals.
      #
      # TODO Should there be a legal?
      #
      def removes_characters regexp
        @removes_characters_regexp = regexp
      end
      def remove_illegals text
        text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
        text
      end

      # Splitting.
      #
      def splits_text_on regexp
        @splits_text_on_regexp = regexp
      end
      def split text
        text.split @splits_text_on_regexp
      end

      # Normalizing.
      #
      def normalizes_words regexp_replaces
        @normalizes_words_regexp_replaces = regexp_replaces
      end
      def normalize_with_patterns text
        return text unless @normalizes_words_regexp_replaces

        @normalizes_words_regexp_replaces.each do |regex, replace|
          # This should be sufficient.
          #
          text.gsub!(regex, replace) and break
        end
        remove_after_normalizing_illegals text
        text
      end

      # Illegals after normalizing.
      #
      def removes_characters_after_splitting regexp
        @removes_characters_after_splitting_regexp = regexp
      end
      def remove_after_normalizing_illegals text
        text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
      end

      # Substitutes characters with this substituter.
      #
      # Default is West European character substitution.
      #
      def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
        # TODO Raise if it doesn't quack substitute?
        @substituter = substituter
      end
      def substitute_characters text
        substituter? ? substituter.substitute(text) : text
      end

      # Rejects tokens after tokenizing based on the given criteria.
      #
      # Note: Currently only for indexing. TODO Redesign and write for both!
      #
      def reject_token_if &condition
        @reject_condition = condition
      end
      def reject tokens
        tokens.reject! &@reject_condition
      end

      # Returns a number of tokens, generated from the given text.
      #
      # Note:
      #   * preprocess, pretokenize are hooks
      #
      def tokenize text
        text   = preprocess text  # processing the text
        return empty_tokens if text.blank?

        words  = pretokenize text # splitting and preparations for tokenizing
        return empty_tokens if words.empty?

        tokens = tokens_for words # creating tokens / strings

        process tokens            # processing tokens / strings
      end

      attr_reader :substituter
      alias substituter? substituter

      def initialize options = {}
        removes_characters options[:removes_characters]                                 if options[:removes_characters]
        contracts_expressions *options[:contracts_expressions]                          if options[:contracts_expressions]
        stopwords options[:stopwords]                                                   if options[:stopwords]
        normalizes_words options[:normalizes_words]                                     if options[:normalizes_words]
        removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
        substitutes_characters_with options[:substitutes_characters_with]               if options[:substitutes_characters_with]

        # Defaults.
        #
        splits_text_on options[:splits_text_on] || /\s/
        reject_token_if &(options[:reject_token_if] || :blank?)
      end
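
      # A minimal configuration sketch (illustrative only; the option values
      # below are assumptions, not defaults of this class):
      #
      #   tokenizer = Internals::Tokenizers::Base.new(
      #     removes_characters: /[^a-z0-9\s]/i,      # strip illegal characters
      #     stopwords:          /\b(and|or|the)\b/i, # remove common stopwords
      #     splits_text_on:     /[\s\/\-]/,          # split on whitespace, / and -
      #     normalizes_words:   [[/\$(\w+)/i, '\1 dollars']]
      #   )
      #
      # Note that tokenizing also calls the #token_for hook (via #tokens_for),
      # which this base class does not define; subclasses are expected to
      # provide it.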

      # Hooks.
      #

      # Preprocessing.
      #
      def preprocess text; end
      # Pretokenizing.
      #
      def pretokenize text; end
      # Postprocessing.
      #
      def process tokens
        reject tokens # Reject any tokens that don't meet the criteria.
        tokens
      end

      # Converts words into real tokens.
      #
      def tokens_for words
        Internals::Query::Tokens.new words.collect! { |word| token_for word }
      end
      # Turns non-blank text into symbols.
      #
      def symbolize text
        text.blank? ? nil : text.to_sym
      end
      # Returns a tokens object.
      #
      def empty_tokens
        Internals::Query::Tokens.new
      end

    end

  end

end
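
# A usage sketch, not part of the file above: #preprocess, #pretokenize and
# #token_for are hooks, so Base is only useful through a subclass. The
# hypothetical ExampleTokenizer below shows one plausible way to wire the
# hooks together; all names and option values here are assumptions for
# illustration, not the library's own tokenizers.
#
#   class ExampleTokenizer < Internals::Tokenizers::Base
#     def preprocess text
#       text.downcase!        # hypothetical: case-fold before stopword removal
#       remove_stopwords text # strip stopwords configured via the stopwords option
#       text
#     end
#     def pretokenize text
#       split text            # split on the splits_text_on regexp
#     end
#     def token_for word
#       symbolize word        # turn each word into a symbol token
#     end
#   end
#
#   tokenizer = ExampleTokenizer.new stopwords: /\b(the|of)\b/i
#   tokenizer.tokenize 'The Count of Monte Cristo'
#   # => tokens for :count, :monte, :cristo (blanks rejected by the
#   #    default reject_token_if :blank? condition)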