Diff of lib/picky/internals/tokenizers/base.rb between picky 2.0.0 (old) and picky 2.1.0 (new)

- old: line removed (present only in picky 2.0.0)
+ new: line added (present only in picky 2.1.0)

@@ -18,10 +18,11 @@ Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')} Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'} Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'} Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'} Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' } +Case sensitive? #{@case_sensitive ? "Yes." : "-"} TOKENIZER end # Stopwords. # @@ -123,10 +124,17 @@ end def reject tokens tokens.reject! &@reject_condition end + def case_sensitive case_sensitive + @case_sensitive = case_sensitive + end + def downcase? + !@case_sensitive + end + # Checks if the right argument type has been given. # def check_argument_in method, type, argument, &condition raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument end @@ -154,46 +162,64 @@ contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions] stopwords options[:stopwords] if options[:stopwords] normalizes_words options[:normalizes_words] if options[:normalizes_words] removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting] substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with] + case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil? # Defaults. # splits_text_on options[:splits_text_on] || /\s/ reject_token_if &(options[:reject_token_if] || :blank?) end - # Hooks. + # Default preprocessing hook. # - - # Preprocessing. + # Does: + # 1. Character substitution. + # 2. Remove illegal expressions. + # 3. Remove non-single stopwords. 
(Stopwords that occur with other words) # - def preprocess text; end + def preprocess text + text = substitute_characters text + remove_illegals text + # We do not remove single stopwords e.g. in the indexer for + # an entirely different reason than in the query tokenizer. + # An indexed thing with just name "UND" (a possible stopword) + # should not lose its name. + # + remove_non_single_stopwords text + text + end # Pretokenizing. # - def pretokenize text; end - # Postprocessing. + # Does: + # 1. Split the text into words. + # 2. Normalize each word. # + def pretokenize text + words = split text + words.collect! do |word| + normalize_with_patterns word + word + end + end + # Basic postprocessing (overridden in both query/index tokenizers). + # def process tokens reject tokens # Reject any tokens that don't meet criteria tokens end - # Converts words into real tokens. - # - def tokens_for words - Internals::Query::Tokens.new words.collect! { |word| token_for word } - end + # # Converts words into real tokens. + # # + # def tokens_for words + # Internals::Query::Tokens.new words.collect! { |word| token_for word } + # end # Turns non-blank text into symbols. # def symbolize text text.blank? ? nil : text.to_sym - end - # Returns a tokens object. - # - def empty_tokens - Internals::Query::Tokens.new end end end \ No newline at end of file