lib/picky/internals/tokenizers/base.rb in picky-2.0.0 vs lib/picky/internals/tokenizers/base.rb in picky-2.1.0
- old
+ new
@@ -18,10 +18,11 @@
Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
+Case sensitive? #{@case_sensitive ? "Yes." : "-"}
TOKENIZER
end
# Stopwords.
#
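The added interpolation above extends the tokenizer's self-description heredoc. With @case_sensitive set, the rendered summary gains a line like this (illustrative output):

  Case sensitive? Yes.

With the flag unset, it prints '-', matching the other optional settings.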
@@ -123,10 +124,17 @@
end
def reject tokens
tokens.reject! &@reject_condition
end
+ def case_sensitive case_sensitive
+ @case_sensitive = case_sensitive
+ end
+ def downcase?
+ !@case_sensitive
+ end
+
# Checks if the right argument type has been given.
#
def check_argument_in method, type, argument, &condition
raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
end
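The writer/predicate pair added above carries the new feature: case_sensitive stores the flag, downcase? inverts it for callers. A minimal usage sketch, assuming a concrete subclass such as Internals::Tokenizers::Index forwards its options to the initializer shown in the next hunk:

  tokenizer = Internals::Tokenizers::Index.new case_sensitive: true
  tokenizer.downcase? # => false

  tokenizer = Internals::Tokenizers::Index.new
  tokenizer.downcase? # => true (downcasing remains the default)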
@@ -154,46 +162,64 @@
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
stopwords options[:stopwords] if options[:stopwords]
normalizes_words options[:normalizes_words] if options[:normalizes_words]
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
+ case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
# Defaults.
#
splits_text_on options[:splits_text_on] || /\s/
reject_token_if &(options[:reject_token_if] || :blank?)
end
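Note the .nil? guard on the new case_sensitive line: a plain truthiness check (if options[:case_sensitive]) would silently skip an explicit case_sensitive: false. A minimal sketch of the distinction, using a plain Ruby hash:

  { case_sensitive: false }[:case_sensitive].nil? # => false, so the writer is called
  {}[:case_sensitive].nil?                        # => true,  so the key was simply absent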
- # Hooks.
+ # Default preprocessing hook.
#
-
- # Preprocessing.
+ # Does:
+ # 1. Substitute characters.
+ # 2. Remove illegal expressions.
+ # 3. Remove non-single stopwords (stopwords that occur together with other words).
#
- def preprocess text; end
+ def preprocess text
+ text = substitute_characters text
+ remove_illegals text
+ # Single stopwords are not removed, though the indexer's reason
+ # differs entirely from the query tokenizer's: an indexed item
+ # whose name is just "UND" (a possible stopword) should not
+ # lose its name.
+ #
+ remove_non_single_stopwords text
+ text
+ end
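A sketch of the reworked hook, assuming the Index subclass forwards its options and a stopwords regexp is configured (the option name matches the initializer above; the regexp itself is illustrative):

  tokenizer = Internals::Tokenizers::Index.new stopwords: /\b(und|oder)\b/i
  tokenizer.preprocess "schön und gut" # "und" occurs alongside other words: removed
  tokenizer.preprocess "UND"           # => "UND", a lone stopword keeps the name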
# Pretokenizing.
#
- def pretokenize text; end
- # Postprocessing.
+ # Does:
+ # 1. Split the text into words.
+ # 2. Normalize each word.
#
+ def pretokenize text
+ words = split text
+ words.collect! do |word|
+ normalize_with_patterns word
+ word
+ end
+ end
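pretokenize likewise gains a concrete default body. With the default whitespace split (splits_text_on /\s/ from the initializer) and no normalization patterns, a sketch:

  tokenizer = Internals::Tokenizers::Index.new
  tokenizer.pretokenize "red balloon" # => ["red", "balloon"]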
+ # Basic postprocessing (overridden in both the query and index tokenizers).
+ #
def process tokens
reject tokens # Drop any tokens matching the reject condition (default: :blank?)
tokens
end
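process stays deliberately thin; the query and index tokenizers layer their own steps on top of it. With the default reject condition (:blank?, set in the initializer), reject drops blank tokens. A sketch, assuming ActiveSupport's blank? is loaded (symbolize below already relies on it):

  tokens = ["red", "", "balloon"]
  tokens.reject! &:blank?
  tokens # => ["red", "balloon"]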
- # Converts words into real tokens.
- #
- def tokens_for words
- Internals::Query::Tokens.new words.collect! { |word| token_for word }
- end
+ # # Converts words into real tokens.
+ # #
+ # def tokens_for words
+ # Internals::Query::Tokens.new words.collect! { |word| token_for word }
+ # end
# Turns non-blank text into a symbol, or nil if the text is blank.
#
def symbolize text
text.blank? ? nil : text.to_sym
- end
- # Returns a tokens object.
- #
- def empty_tokens
- Internals::Query::Tokens.new
end
end
end
\ No newline at end of file