Sha256: 56734ce489299b4eca0569174a8494ffc0fbf5aa9defedb07f0e6f825932a171
Contents?: true
Size: 1.43 KB
Versions: 5
Compression:
Stored size: 1.43 KB
Contents
module Internals

  module Tokenizers

    # The base indexing tokenizer.
    #
    # Override in indexing subclasses and define in configuration.
    #
    class Index < Base

      class << self

        # Replaces the configured default indexing tokenizer.
        #
        attr_writer :default

        # Returns the configured default indexing tokenizer,
        # lazily creating one if none has been set.
        #
        def default
          @default ||= new
        end

      end

      # Default indexing preprocessing hook.
      #
      # Does:
      #  1. Character substitution.
      #  2. Downcasing.
      #  3. Remove illegal expressions.
      #  4. Remove non-single stopwords. (Stopwords that occur with other words)
      #
      def preprocess text
        text = substitute_characters text
        text.downcase!
        remove_illegals text
        # We do not remove single stopwords for an entirely different
        # reason than in the query tokenizer.
        # An indexed thing with just name "UND" (a possible stopword)
        # should not lose its name.
        #
        remove_non_single_stopwords text
        text
      end

      # Default indexing pretokenizing hook.
      #
      # Does:
      #  1. Split the text into words.
      #  2. Normalize each word.
      #
      # normalize_with_patterns is called for its side effect only;
      # the split word array itself is returned unchanged.
      #
      def pretokenize text
        tokens = split text
        tokens.each do |token|
          normalize_with_patterns token
        end
      end

      # Does not actually return a token, but a
      # symbol "token".
      #
      def token_for text
        symbolize text
      end

    end

  end

end
Version data entries
5 entries across 5 versions & 1 rubygems