lib/picky/internals/tokenizers/query.rb in picky-2.0.0 vs lib/picky/internals/tokenizers/query.rb in picky-2.1.0
- old
+ new
@@ -1,79 +1,59 @@
# encoding: utf-8
#
module Internals
module Tokenizers
-
+
# There are a few class methods that you can use to configure how a query works.
#
# removes_characters regexp
# illegal_after_normalizing regexp
# stopwords regexp
# contracts_expressions regexp, to_string
# splits_text_on regexp
# normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
#
class Query < Base
-
+
def self.default= new_default
@default = new_default
end
def self.default
@default ||= new
end
-
+
attr_reader :maximum_tokens
-
+
def initialize options = {}
super options
@maximum_tokens = options[:maximum_tokens] || 5
end
-
- def preprocess text
- remove_illegals text # Remove illegal characters
- remove_non_single_stopwords text # remove stop words
- text
- end
-
- # Split the text and put some back together.
- #
- # TODO Make the same as in indexing?
- #
- def pretokenize text
- split text
- end
-
+
# Let each token process itself.
# Reject, limit, and partialize tokens.
#
+ # In querying we work with real tokens (in indexing it's just symbols).
+ #
def process tokens
- tokens.tokenize_with self
- tokens.reject # Reject any tokens that don't meet criteria
- tokens.cap maximum_tokens # Cut off superfluous tokens
- tokens.partialize_last # Set certain tokens as partial
+ tokens.reject # Reject any tokens that don't meet criteria.
+ tokens.cap maximum_tokens # Cut off superfluous tokens.
+ tokens.partialize_last # Set certain tokens as partial.
tokens
end
-
- # Called by the token.
+
+ # Converts words into real tokens.
#
- # TODO Perhaps move to Normalizer?
- #
- def normalize text
- text = substitute_characters text # Substitute special characters
- text.downcase! # Downcase all text
- normalize_with_patterns text # normalize
- text.to_sym # symbolize
+ def tokens_for words
+ Internals::Query::Tokens.processed words, downcase?
end
-
- # Returns a token for a word.
- # The basic query tokenizer uses new tokens.
+ # Returns a tokens object.
#
- def token_for word
- Internals::Query::Token.processed word
+ def empty_tokens
+ Internals::Query::Tokens.new
end
-
+
end
-
+
end
-
+
end
\ No newline at end of file
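
A minimal usage sketch of the 2.1.0 side of this diff, for orientation only. It is not taken from the gem's own call sites: only maximum_tokens, tokens_for, empty_tokens and process appear in the diff above, and the assumption that tokens_for expects already preprocessed, split words (and that the per-token normalizing now lives in Internals::Query::Tokens.processed) is inferred from the removed preprocess/normalize methods, not confirmed by it.

    # Hypothetical wiring, assuming picky-2.1.0 internals as shown above.
    tokenizer = Internals::Tokenizers::Query.new maximum_tokens: 3

    words  = %w(search engine rub)      # assumed: already preprocessed and split
    tokens = tokenizer.tokens_for words # builds an Internals::Query::Tokens collection
    tokens = tokenizer.process tokens   # rejects, caps at 3, partializes the last token

In 2.0.0 the same class instead produced single tokens via token_for and let the tokens drive normalization through tokens.tokenize_with self in process; after this change the Query tokenizer only builds and trims the Tokens collection, while the per-token work happens inside the Tokens class.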