RubygemsResearch

Sha256: 727d8010f0fd561daf6a1bf4eb79520eef3b9b49003d4d1dcdb4a752a0eb153a

Contents?: true

Size: 1.86 KB

Versions: 6

Compression:

Stored size: 1.86 KB

# encoding: utf-8
#
module Tokenizers

  # Tokenizer used for incoming query text.
  #
  # Its behaviour is configured through class-level macros, called in the
  # class body below (presumably defined on Base, which is not visible in
  # this file):
  #
  #   illegal_characters regexp
  #   illegal_characters_after_splitting regexp
  #   stopwords regexp
  #   contract_expressions regexp, to_string
  #   split_text_on regexp
  #   normalize_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
  #
  # NOTE(review): earlier documentation also listed
  # "illegal_after_normalizing regexp"; it is not used in this class,
  # confirm whether Base still offers it.
  #
  class Query < Base

    include UmlautSubstituter

    # Defaults for the query tokenizer. Override these in the config.
    #
    illegal_characters(//)
    stopwords(//)
    contract_expressions(//, '')
    split_text_on(/\s/)
    normalize_words([])
    illegal_characters_after_splitting(//)

    # Cleans up the raw query text before it is split.
    #
    # Runs three steps in order: illegal character removal, stopword
    # removal (non-single stopwords), and contraction of expressions
    # (e.g. "st", "sankt"). The helpers' return values are ignored and
    # the given text object itself is returned, so the helpers are
    # expected to modify the text in place.
    #
    def preprocess(text)
      remove_illegals(text)
      remove_non_single_stopwords(text)
      contract(text)
      text
    end

    # Splits the preprocessed text; returns whatever #split returns.
    #
    def pretokenize(text)
      split(text)
    end

    # Hands this tokenizer to the tokens so each token can process itself,
    # then rejects, caps and partializes on the tokens collection.
    #
    # Returns the same tokens object that was passed in.
    #
    def process(tokens)
      tokens.tokenize_with(self) # Each token processes itself using this tokenizer
      tokens.reject              # Drop tokens that do not meet the criteria
      tokens.cap                 # Cut off superfluous tokens
      tokens.partialize_last     # Mark certain tokens as partial
      tokens
    end

    # Normalizes a single piece of text. Called by the token.
    #
    # Substitutes umlauts, downcases in place, applies the normalizing
    # patterns and returns the result as a symbol.
    #
    # TODO Perhaps move to Normalizer?
    #
    def normalize(text)
      normalized = substitute_umlauts(text) # TODO Move to subclass
      normalized.downcase!
      normalize_with_patterns(normalized)
      normalized.to_sym
    end

    # Returns a token for the given word.
    # The basic query tokenizer builds it via ::Query::Token.processed.
    #
    def token_for(word)
      ::Query::Token.processed(word)
    end

  end
end

Version data entries

6 entries across 6 versions & 1 rubygem

Version	Path
picky-0.0.9	lib/picky/tokenizers/query.rb
picky-0.0.8	lib/picky/tokenizers/query.rb
picky-0.0.7	lib/picky/tokenizers/query.rb
picky-0.0.6	lib/picky/tokenizers/query.rb
picky-0.0.5	lib/picky/tokenizers/query.rb
picky-0.0.4	lib/picky/tokenizers/query.rb