# encoding: utf-8
#
module Picky

  # Defines tokenizing processes used both in indexing and querying.
  #
  class Tokenizer

    extend Picky::Helpers::Identification
    include API::Tokenizer::CharacterSubstituter

    def self.default_indexing_with options = {}
      @indexing = from options
    end
    def self.indexing
      @indexing ||= new
    end

    def self.default_searching_with options = {}
      @searching = from options
    end
    def self.searching
      @searching ||= new
    end

    def self.from thing, index_name = nil, category_name = nil
      return unless thing

      if thing.respond_to? :tokenize
        thing
      else
        if thing.respond_to? :[]
          Picky::Tokenizer.new thing
        else
          raise <<-ERROR
indexing options #{identifier_for(index_name, category_name)}should be either
* a Hash
or
* an object that responds to #tokenize(text) => [[token1, token2, ...], [original1, original2, ...]]
          ERROR
        end
      end
    end

    def to_s
      reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
      <<-TOKENIZER
Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
Stopwords:          #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
Splits text on:     #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
Normalizes words:   #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
Rejects tokens?     #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
Substitutes chars?  #{@substituter ? "Yes, using #{@substituter}." : '-'}
Case sensitive?     #{@case_sensitive ? "Yes." : "-"}
      TOKENIZER
    end

    # Stopwords.
    #
    # We only allow regexps (even if string would be okay
    # too for gsub! - it's too hard to understand)
    #
    def stopwords regexp
      check_argument_in __method__, Regexp, regexp
      @remove_stopwords_regexp = regexp
    end
    def remove_stopwords text
      text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
      text
    end

    @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
    def remove_non_single_stopwords text
      return text unless @remove_stopwords_regexp
      return text if text.match @@non_single_stopword_regexp
      remove_stopwords text
    end

    # Illegals.
    #
    # We only allow regexps (even if string would be okay
    # too for gsub! - it's too hard to understand)
    #
    def removes_characters regexp
      check_argument_in __method__, Regexp, regexp
      @removes_characters_regexp = regexp
    end
    def remove_illegals text
      text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
      text
    end

    # Splitting.
    #
    # We allow Strings and Regexps.
    # Note: We do not test against to_str since symbols do not work with String#split.
    #
    def splits_text_on regexp_or_string
      raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
      @splits_text_on = regexp_or_string
    end
    def split text
      text.split @splits_text_on
    end

    # Normalizing.
    #
    # We only allow arrays.
    #
    def normalizes_words regexp_replaces
      raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
      @normalizes_words_regexp_replaces = regexp_replaces
    end
    def normalize_with_patterns text
      return text unless @normalizes_words_regexp_replaces

      @normalizes_words_regexp_replaces.each do |regex, replace|
        # This should be sufficient
        #
        text.gsub!(regex, replace) and break
      end
      text
    end
    def normalize_with_patterns?
      @normalizes_words_regexp_replaces
    end
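
    # Illustrative sketch (not part of the original file): `normalizes_words`
    # expects an Array of [regexp, replacement] pairs. Each pair is tried per
    # word after splitting, and the first pattern that actually substitutes
    # wins (note the `and break` above). The patterns below are hypothetical:
    #
    #   Picky::Tokenizer.new normalizes_words: [
    #     [/\$(\w+)/,    'dollar \1'],  # "$100"     => "dollar 100"
    #     [/(\w+)str\b/, '\1strasse']   # "hauptstr" => "hauptstrasse"
    #   ]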
    # Substitute Characters with this substituter.
    #
    # Default is European Character substitution.
    #
    def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
      @substituter = extract_character_substituter substituter
    end
    def substitute_characters text
      substituter? ? substituter.substitute(text) : text
    end

    # Reject tokens after tokenizing based on the given criteria.
    #
    def rejects_token_if condition
      @reject_condition = condition
    end
    def reject tokens
      tokens.reject! &@reject_condition
    end

    # Case sensitivity.
    #
    # Note: If false, simply downcases the data/query.
    #
    def case_sensitive case_sensitive
      @case_sensitive = case_sensitive
    end
    def downcase?
      !@case_sensitive
    end

    # The maximum amount of words
    # to pass into the search engine.
    #
    def max_words amount
      @max_words = amount
    end
    def cap words
      words.slice!(@max_words..-1) if cap?(words)
    end
    def cap? words
      @max_words && words.size > @max_words
    end

    # Checks if the right argument type has been given.
    #
    def check_argument_in method, type, argument, &condition
      raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
    end

    attr_reader :substituter
    alias substituter? substituter

    def initialize options = {}
      options = default_options.merge options
      options.each do |method_name, value|
        send method_name, value unless value.nil?
      end
    rescue NoMethodError => e
      # TODO Print out valid options.
      #
      raise <<-ERROR
The option "#{e.name}" is not a valid option for a Picky tokenizer.
Please see https://github.com/floere/picky/wiki/Indexing-configuration for valid options.
A short overview:
  removes_characters            /regexp/
  stopwords                     /regexp/
  splits_text_on                /regexp/ or "String", default /\s/
  normalizes_words              [[/replace (this)/, 'with this \\1'], ...]
  rejects_token_if              Proc/lambda, default :blank?.to_proc
  substitutes_characters_with   Picky::CharacterSubstituter or responds to #substitute(String)
  case_sensitive                true/false
      ERROR
    end

    def default_options
      {
        splits_text_on:   /\s/,
        rejects_token_if: :blank?.to_proc
      }
    end

    # Returns a number of tokens, generated from the given text,
    # based on the parameters given.
    #
    # Returns:
    #   [[:token1, :token2], ["Original1", "Original2"]]
    #
    def tokenize text
      text   = preprocess text.to_s  # processing the text
      return empty_tokens if text.blank?
      words  = pretokenize text      # splitting and preparations for tokenizing
      return empty_tokens if words.empty?
      tokens = tokens_for words      # creating tokens / strings
      [tokens, words]
    end

    # Default preprocessing hook.
    #
    # Does:
    #   1. Character substitution.
    #   2. Remove illegal expressions.
    #   3. Remove non-single stopwords. (Stopwords that occur with other words)
    #
    def preprocess text
      text = substitute_characters text
      remove_illegals text
      # We do not remove single stopwords e.g. in the indexer for
      # an entirely different reason than in the query tokenizer.
      # An indexed thing with just name "UND" (a possible stopword)
      # should not lose its name.
      #
      remove_non_single_stopwords text
      text
    end

    # Pretokenizing.
    #
    # Does:
    #   * Split the text into words.
    #   * Cap the amount of tokens if max_words is set.
    #
    def pretokenize text
      words = split text
      words.collect! { |word| normalize_with_patterns word } if normalize_with_patterns?
      reject words
      cap words if cap?(words)
      words
    end

    # Downcases.
    #
    def tokens_for words
      words.collect! { |word| word.downcase!; word } if downcase?
      words
    end

    # Returns empty tokens.
    #
    def empty_tokens
      [[], []]
    end

  end

end
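
# Illustrative usage sketch (not part of the original file). It assumes the
# full Picky gem is loaded; the option names are the ones accepted by
# Tokenizer#initialize above, and the sample regexps are made up:
#
#   require 'picky'
#
#   tokenizer = Picky::Tokenizer.new(
#     removes_characters: /[^a-z0-9\s]/i,          # strip punctuation
#     stopwords:          /\b(and|the|of|und)\b/i, # drop common words
#     splits_text_on:     /[\s\/\-]/,              # split on whitespace, slash, dash
#     case_sensitive:     false                    # downcase tokens
#   )
#
#   tokens, words = tokenizer.tokenize "The Art of Tokenizing"
#   # tokens => the processed tokens, e.g. ["art", "tokenizing"]
#   # words  => the words the tokens were generated from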