lib/picky/internals/tokenizers/query.rb in picky-2.0.0 vs lib/picky/internals/tokenizers/query.rb in picky-2.1.0

- old
+ new

@@ -1,79 +1,59 @@
 # encoding: utf-8
 #
 module Internals

   module Tokenizers
-
+
     # There are a few class methods that you can use to configure how a query works.
     #
     # removes_characters regexp
     # illegal_after_normalizing regexp
     # stopwords regexp
     # contracts_expressions regexp, to_string
     # splits_text_on regexp
     # normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
     #
     class Query < Base
-
+
       def self.default= new_default
         @default = new_default
       end
       def self.default
         @default ||= new
       end
-
+
       attr_reader :maximum_tokens
-
+
       def initialize options = {}
         super options
         @maximum_tokens = options[:maximum_tokens] || 5
       end
-
-      def preprocess text
-        remove_illegals text             # Remove illegal characters
-        remove_non_single_stopwords text # remove stop words
-        text
-      end
-
-      # Split the text and put some back together.
-      #
-      # TODO Make the same as in indexing?
-      #
-      def pretokenize text
-        split text
-      end
-
+
       # Let each token process itself.
       # Reject, limit, and partialize tokens.
      #
+      # In querying we work with real tokens (in indexing it's just symbols).
+      #
       def process tokens
-        tokens.tokenize_with self
-        tokens.reject             # Reject any tokens that don't meet criteria
-        tokens.cap maximum_tokens # Cut off superfluous tokens
-        tokens.partialize_last    # Set certain tokens as partial
+        tokens.reject             # Reject any tokens that don't meet criteria.
+        tokens.cap maximum_tokens # Cut off superfluous tokens.
+        tokens.partialize_last    # Set certain tokens as partial.
         tokens
       end
-
-      # Called by the token.
+
+      # Converts words into real tokens.
       #
-      # TODO Perhaps move to Normalizer?
-      #
-      def normalize text
-        text = substitute_characters text # Substitute special characters
-        text.downcase!                    # Downcase all text
-        normalize_with_patterns text      # normalize
-        text.to_sym                       # symbolize
+      def tokens_for words
+        Internals::Query::Tokens.processed words, downcase?
       end
-
-      # Returns a token for a word.
-      # The basic query tokenizer uses new tokens.
+      # Returns a tokens object.
       #
-      def token_for word
-        Internals::Query::Token.processed word
+      def empty_tokens
+        Internals::Query::Tokens.new
       end
-
+
     end
-
+
   end
-
+
 end
\ No newline at end of file
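The class comment lists the configuration knobs (removes_characters, stopwords, splits_text_on, and so on), but the diff itself only shows one of them in action: initialize reads options[:maximum_tokens] from an options hash. A minimal configuration sketch under that assumption, using only what the diff confirms (new taking an options hash, and the default= writer):

    # Hypothetical setup; only :maximum_tokens and .default= are visible in
    # this diff. The other options listed in the class comment are assumed
    # to follow the same options-hash pattern.
    Internals::Tokenizers::Query.default = Internals::Tokenizers::Query.new(
      maximum_tokens: 3 # cap queries at three tokens instead of the default 5
    )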
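The behavioral change in 2.1.0: process no longer asks the tokens to tokenize themselves (tokens.tokenize_with self is gone), and preprocess, pretokenize, normalize, and token_for disappear from this class. Instead, words are converted into real token objects up front by tokens_for, and process only rejects, caps, and partializes. A minimal sketch of the new flow, assuming a default tokenizer and made-up query words; only tokens_for, process, empty_tokens, and maximum_tokens appear in the diff:

    tokenizer = Internals::Tokenizers::Query.default

    # Words already split from the query string become real token objects:
    tokens = tokenizer.tokens_for %w(alan turin)

    # process rejects unwanted tokens, caps them at maximum_tokens
    # (5 unless overridden), and marks the last one as partial:
    tokens = tokenizer.process tokens

    # When there is nothing to tokenize, an empty tokens object stands in:
    tokenizer.empty_tokens

This matches the comment added in the diff: in querying the tokenizer works with real token objects, while in indexing it works with plain symbols.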