Sha256: e5143681a2b4b74818a287c7c63580c0e0d9617ba9574723db4a71d977abc661

Contents?: true

Size: 1.73 KB

Versions: 6

Compression:

Stored size: 1.73 KB

Contents

module Tokenizers
  # Base tokenizer used for indexing.
  #
  # Subclass it to customise indexing behaviour and define the
  # subclass in the configuration.
  #
  class Index < Base

    include UmlautSubstituter

    # Default handling definitions. These are placeholders that are
    # meant to be overridden in the config.
    #
    illegal_characters(//)
    stopwords(//)
    contract_expressions(//, '')
    split_text_on(/\s/)
    normalize_words([])
    illegal_characters_after_splitting(//)

    # Default preprocessing hook for indexing.
    #
    # Steps, in order:
    #   1. Substitute umlauts.
    #   2. Downcase.
    #   3. Remove illegal expressions.
    #   4. Contract expressions.
    #   5. Remove non-single stopwords (stopwords occurring together
    #      with other words).
    #
    # Returns the string produced by the umlaut substitution after the
    # remaining steps have been applied to it.
    #
    # NOTE(review): the return values of steps 3-5 are discarded, so
    # these helpers are presumably expected to modify the string in
    # place - confirm against their definitions.
    #
    def preprocess(text)
      prepared = substitute_umlauts(text)
      prepared.downcase!
      remove_illegals(prepared)
      contract(prepared)
      # A stopword standing alone is kept, though for an entirely
      # different reason than in the query tokenizer: an indexed thing
      # whose whole name is "UND" (a stopword) should not lose its name.
      #
      remove_non_single_stopwords(prepared)
      prepared
    end

    # Default pretokenizing hook for indexing.
    #
    # Steps, in order:
    #   1. Split the text into words.
    #   2. Normalize each word.
    #
    # Returns the result of calling collect! on the split words.
    #
    # TODO: Rename into wordize? Or somesuch?
    #
    def pretokenize(text)
      split(text).collect! do |word|
        # The result of the normalization call is ignored on purpose;
        # the very same word object stays in the collection.
        normalize_with_patterns(word)
        word
      end
    end

    # Does not actually build a token object; it returns whatever
    # symbolize yields for the text (a symbol "token").
    #
    def token_for(text)
      symbolize(text)
    end

    # Rejects, in place, every token whose string form is shorter than
    # two characters (which covers blank ones as well).
    #
    # Override in subclasses to redefine behaviour.
    #
    # NOTE(review): the result of reject! is returned as is. If tokens
    # is a plain Array, that is nil whenever nothing was removed, so
    # callers should rely on the mutation rather than the return value
    # - confirm against the callers.
    #
    def reject(tokens)
      tokens.reject! { |token| token.to_s.length <= 1 }
    end

  end
end

Version data entries

6 entries across 6 versions & 1 rubygem

Version Path
picky-0.0.9 lib/picky/tokenizers/index.rb
picky-0.0.8 lib/picky/tokenizers/index.rb
picky-0.0.7 lib/picky/tokenizers/index.rb
picky-0.0.6 lib/picky/tokenizers/index.rb
picky-0.0.5 lib/picky/tokenizers/index.rb
picky-0.0.4 lib/picky/tokenizers/index.rb