module Internals

  module Tokenizers # :nodoc:all

    # Defines the tokenizing process used both in indexing and querying.
    #
    class Base

      # TODO: Move EMPTY_STRING to the top level.
      #
      EMPTY_STRING = ''.freeze

      def to_s
        reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
        <<-TOKENIZER
Removes characters:        #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
Stopwords:                 #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
Splits text on:            #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
Normalizes words:          #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
Rejects tokens?            #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
Substitutes chars?         #{@substituter ? "Yes, using #{@substituter}." : '-'}
TOKENIZER
      end

      # Stopwords.
      #
      # We only allow regexps (a String would also work with gsub!,
      # but it is harder to understand).
      #
      def stopwords regexp
        check_argument_in __method__, Regexp, regexp
        @remove_stopwords_regexp = regexp
      end
      def remove_stopwords text
        text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
        text
      end

      @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
      def remove_non_single_stopwords text
        return text if text.match @@non_single_stopword_regexp
        remove_stopwords text
      end

      # Illegal characters.
      #
      # We only allow regexps (a String would also work with gsub!,
      # but it is harder to understand).
      #
      def removes_characters regexp
        check_argument_in __method__, Regexp, regexp
        @removes_characters_regexp = regexp
      end
      def remove_illegals text
        text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
        text
      end

      # Splitting.
      #
      # We allow Strings and Regexps.
      # Note: We do not check for to_str, since Symbols do not work with String#split.
      #
      def splits_text_on regexp_or_string
        raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
        @splits_text_on = regexp_or_string
      end
      def split text
        text.split @splits_text_on
      end

      # Normalizing.
      #
      # We only allow Arrays of [regexp, replacement] pairs.
      #
      def normalizes_words regexp_replaces
        raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
        @normalizes_words_regexp_replaces = regexp_replaces
      end
      def normalize_with_patterns text
        return text unless @normalizes_words_regexp_replaces

        @normalizes_words_regexp_replaces.each do |regex, replace|
          # Try the patterns in order, stopping after the first
          # one that actually replaces something.
          #
          text.gsub!(regex, replace) and break
        end
        remove_after_normalizing_illegals text
        text
      end

      # Illegal characters after normalizing.
      #
      # We only allow regexps (a String would also work with gsub!,
      # but it is harder to understand).
      #
      def removes_characters_after_splitting regexp
        check_argument_in __method__, Regexp, regexp
        @removes_characters_after_splitting_regexp = regexp
      end
      def remove_after_normalizing_illegals text
        text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
      end
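
      # A normalization sketch (hypothetical pattern, not a library
      # default; real patterns come from the application's tokenizer
      # options):
      #
      #   normalizes_words [
      #     [/\$(\w+)/, '\1 dollars'] # "$10" => "10 dollars"
      #   ]
      #   normalize_with_patterns '$10 worth'
      #   # => "10 dollars worth"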

      # Substitutes characters with the given substituter.
      #
      # The default is West European character substitution.
      #
      def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
        raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
        @substituter = substituter
      end
      def substitute_characters text
        substituter? ? substituter.substitute(text) : text
      end

      # Rejects tokens after tokenizing, based on the given criteria.
      #
      # Note: Currently only used for indexing. TODO: Redesign and use for both!
      #
      def reject_token_if &condition
        @reject_condition = condition
      end
      def reject tokens
        tokens.reject!(&@reject_condition)
      end

      # Checks that the right argument type has been given.
      #
      def check_argument_in method, type, argument, &condition
        raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
      end

      # Returns a number of tokens, generated from the given text.
      #
      # Note:
      # * preprocess and pretokenize are hooks.
      #
      def tokenize text
        text   = preprocess text  # processing the text
        return empty_tokens if text.blank?
        words  = pretokenize text # splitting and preparations for tokenizing
        return empty_tokens if words.empty?
        tokens = tokens_for words # creating tokens / strings
        process tokens            # processing tokens / strings
      end

      attr_reader :substituter
      alias substituter? substituter

      def initialize options = {}
        removes_characters options[:removes_characters]                                 if options[:removes_characters]
        contracts_expressions(*options[:contracts_expressions])                         if options[:contracts_expressions]
        stopwords options[:stopwords]                                                   if options[:stopwords]
        normalizes_words options[:normalizes_words]                                     if options[:normalizes_words]
        removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
        substitutes_characters_with options[:substitutes_characters_with]               if options[:substitutes_characters_with]

        # Defaults.
        #
        splits_text_on options[:splits_text_on] || /\s/
        reject_token_if(&(options[:reject_token_if] || :blank?))
      end

      # Hooks.

      # Preprocessing.
      #
      def preprocess text; end

      # Pretokenizing.
      #
      def pretokenize text; end

      # Postprocessing.
      #
      def process tokens
        reject tokens # Reject any tokens that do not meet the criteria.
        tokens
      end

      # Converts words into real tokens.
      #
      def tokens_for words
        Internals::Query::Tokens.new words.collect! { |word| token_for word }
      end

      # Turns non-blank text into a Symbol.
      #
      def symbolize text
        text.blank? ? nil : text.to_sym
      end

      # Returns an empty tokens object.
      #
      def empty_tokens
        Internals::Query::Tokens.new
      end

    end

  end

end
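
# A usage sketch (hypothetical: Base leaves the preprocess/pretokenize
# hooks empty and token_for lives elsewhere, so a concrete tokenizer
# subclass is assumed here):
#
#   tokenizer = SomeTokenizer.new(
#     :removes_characters => /[^a-z\s]/i,
#     :stopwords          => /\b(and|the|of)\b/i,
#     :splits_text_on     => /\s/,
#     :reject_token_if    => lambda { |token| token.to_s.size < 2 }
#   )
#   tokenizer.tokenize "The Art of Ruby"
#   # => a Query::Tokens containing roughly [:art, :ruby]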