Sha256: a54ab6bb3e689e09cb744f3655c25d435417d1690065ed18188edb9ff6f618a4

Contents?: true

Size: 1.88 KB

Versions: 17

Stored size: 1.88 KB

Contents

# encoding: utf-8
#
module Tokenizers
  
  # There are a few class methods that you can use to configure how a query is tokenized.
  #
  # removes_characters regexp
  # illegal_after_normalizing regexp
  # stopwords regexp
  # contracts_expressions regexp, to_string
  # splits_text_on regexp
  # normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
  #
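  # As a hedged illustration (the regexps below are made up, not Picky's own),
  # a subclass might configure itself like so:
  #
  #   class MyQueryTokenizer < Query
  #     removes_characters /[^a-z0-9\s]/
  #     stopwords /\b(and|or|the)\b/i
  #     splits_text_on /[\s,]+/
  #     normalizes_words [[/ies$/, 'y']]
  #   end
  #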
  class Query < Base
    
    # The default query tokenizer.
    # Reassign via Query.default = some_tokenizer to replace it.
    #
    def self.default= new_default
      @default = new_default
    end
    def self.default
      @default ||= new
    end
    
    attr_reader :maximum_tokens
    
    def initialize options = {}
      super options
      @maximum_tokens = options[:maximum_tokens] || 5 # Default: at most 5 tokens per query
    end
    
    def preprocess text
      remove_illegals text             # Remove illegal characters
      remove_non_single_stopwords text # Remove stopwords, unless a stopword is the only word
      text
    end
    
    # Split the text and put some back together.
    #
    # TODO Make the same as in indexing?
    #
    def pretokenize text
      split text
    end
    
    # Let each token process itself.
    # Reject, limit, and partialize tokens.
    #
    def process tokens
      tokens.tokenize_with self
      tokens.reject              # Reject any tokens that don't meet criteria
      tokens.cap maximum_tokens  # Cut off superfluous tokens
      tokens.partialize_last     # Set certain tokens as partial
      tokens
    end
    
    # Called by the token.
    #
    # TODO Perhaps move to Normalizer?
    #
    def normalize text
      text = substitute_characters text # Substitute special characters TODO Move to subclass
      text.downcase!                    # Downcase all text
      normalize_with_patterns text      # Apply the normalizes_words patterns
      text.to_sym                       # Return the text as a symbol
    end
    
    # Returns a token for a word.
    # The basic query tokenizer returns a new, processed token for each word.
    #
    def token_for word
      ::Query::Token.processed word
    end
    
  end
end
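
For context, here is a minimal usage sketch. The default accessor and the :maximum_tokens option come from the file above; the tokenize entry point is an assumption about Tokenizers::Base, which is not shown here.

# A hedged sketch, not taken verbatim from Picky.
# Install a default query tokenizer capped at 3 tokens:
Tokenizers::Query.default = Tokenizers::Query.new :maximum_tokens => 3

# Tokenize a query string (assuming Base provides #tokenize, which
# would run preprocess, pretokenize, and process in turn):
tokens = Tokenizers::Query.default.tokenize 'alan turing machines'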

Version data entries

17 entries across 17 versions & 1 rubygem

Version        Path
picky-1.2.3    lib/picky/tokenizers/query.rb
picky-1.2.2    lib/picky/tokenizers/query.rb
picky-1.2.1    lib/picky/tokenizers/query.rb
picky-1.2.0    lib/picky/tokenizers/query.rb
picky-1.1.7    lib/picky/tokenizers/query.rb
picky-1.1.6    lib/picky/tokenizers/query.rb
picky-1.1.5    lib/picky/tokenizers/query.rb
picky-1.1.4    lib/picky/tokenizers/query.rb
picky-1.1.3    lib/picky/tokenizers/query.rb
picky-1.1.2    lib/picky/tokenizers/query.rb
picky-1.1.1    lib/picky/tokenizers/query.rb
picky-1.1.0    lib/picky/tokenizers/query.rb
picky-1.0.0    lib/picky/tokenizers/query.rb
picky-0.12.3   lib/picky/tokenizers/query.rb
picky-0.12.2   lib/picky/tokenizers/query.rb
picky-0.12.1   lib/picky/tokenizers/query.rb
picky-0.12.0   lib/picky/tokenizers/query.rb