lib/picky/internals/tokenizers/base.rb in picky-1.5.2 vs lib/picky/internals/tokenizers/base.rb in picky-1.5.3

- old (picky-1.5.2)
+ new (picky-1.5.3)

(Hunk lines that appear below as a bare "-"/"+" pair are apparently whitespace-only changes: trailing spaces stripped from blank lines.)

@@ -1,20 +1,37 @@
 module Internals
-  
+
   module Tokenizers # :nodoc:all
-    
+
     # Defines tokenizing processes used both in indexing and querying.
     #
     class Base
-      
+
       # TODO Move EMPTY_STRING top level.
       #
       EMPTY_STRING = ''.freeze
-      
+
+      def to_s
+        reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
+        <<-TOKENIZER
+Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
+Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
+Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
+Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
+Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
+Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
+Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
+TOKENIZER
+      end
+
       # Stopwords.
       #
+      # We only allow regexps (even if string would be okay
+      # too for gsub! - it's too hard to understand)
+      #
       def stopwords regexp
+        check_argument_in __method__, Regexp, regexp
        @remove_stopwords_regexp = regexp
       end
       def remove_stopwords text
         text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
         text
@@ -22,82 +39,101 @@
       @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
       def remove_non_single_stopwords text
         return text if text.match @@non_single_stopword_regexp
         remove_stopwords text
       end
-      
+
       # Illegals.
       #
-      # TODO Should there be a legal?
+      # We only allow regexps (even if string would be okay
+      # too for gsub! - it's too hard to understand)
       #
       def removes_characters regexp
+        check_argument_in __method__, Regexp, regexp
         @removes_characters_regexp = regexp
       end
       def remove_illegals text
         text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
         text
       end
-      
+
       # Splitting.
       #
-      def splits_text_on regexp
-        @splits_text_on_regexp = regexp
+      # We allow Strings and Regexps.
+      # Note: We do not test against to_str since symbols do not work with String#split.
+      #
+      def splits_text_on regexp_or_string
+        raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
+        @splits_text_on = regexp_or_string
       end
       def split text
-        text.split @splits_text_on_regexp
+        text.split @splits_text_on
       end
-      
+
       # Normalizing.
       #
+      # We only allow arrays.
+      #
       def normalizes_words regexp_replaces
+        raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
         @normalizes_words_regexp_replaces = regexp_replaces
       end
       def normalize_with_patterns text
         return text unless @normalizes_words_regexp_replaces
-        
+
         @normalizes_words_regexp_replaces.each do |regex, replace|
           # This should be sufficient
           #
           text.gsub!(regex, replace) and break
         end
         remove_after_normalizing_illegals text
         text
       end
-      
+
       # Illegal after normalizing.
       #
+      # We only allow regexps (even if string would be okay
+      # too for gsub! - it's too hard to understand)
+      #
       def removes_characters_after_splitting regexp
+        check_argument_in __method__, Regexp, regexp
         @removes_characters_after_splitting_regexp = regexp
       end
       def remove_after_normalizing_illegals text
         text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
       end
-      
+
       # Substitute Characters with this substituter.
       #
       # Default is European Character substitution.
       #
       def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
-        # TODO Raise if it doesn't quack substitute?
+        raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
         @substituter = substituter
       end
       def substitute_characters text
-        substituter?? substituter.substitute(text) : text
+        substituter?? substituter.substitute(text) : text
       end
-      
+
       # Reject tokens after tokenizing based on the given criteria.
       #
       # Note: Currently only for indexing. TODO Redesign and write for both!
       #
       def reject_token_if &condition
         @reject_condition = condition
       end
       def reject tokens
         tokens.reject! &@reject_condition
       end
-      
-      
+
+      # Checks if the right argument type has been given.
+      #
+      def check_argument_in method, type, argument, &condition
+        raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
+      end
+
+
       # Returns a number of tokens, generated from the given text.
       #
       # Note:
       #  * preprocess, pretokenize are hooks
       #
@@ -107,31 +143,31 @@
         words = pretokenize text # splitting and preparations for tokenizing
         return empty_tokens if words.empty?
         tokens = tokens_for words # creating tokens / strings
         process tokens # processing tokens / strings
       end
-      
+
       attr_reader :substituter
       alias substituter? substituter
-      
+
       def initialize options = {}
         removes_characters options[:removes_characters] if options[:removes_characters]
         contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
         stopwords options[:stopwords] if options[:stopwords]
         normalizes_words options[:normalizes_words] if options[:normalizes_words]
         removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
         substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
-        
+
         # Defaults.
         #
         splits_text_on options[:splits_text_on] || /\s/
         reject_token_if &(options[:reject_token_if] || :blank?)
       end
-      
+
       # Hooks.
       #
-      
+
       # Preprocessing.
       #
       def preprocess text; end
       # Pretokenizing.
       #
@@ -140,11 +176,11 @@
       #
       def process tokens
         reject tokens # Reject any tokens that don't meet criteria
         tokens
       end
-      
+
       # Converts words into real tokens.
       #
       def tokens_for words
         Internals::Query::Tokens.new words.collect! { |word| token_for word }
       end
@@ -156,11 +192,11 @@
       # Returns a tokens object.
       #
       def empty_tokens
         Internals::Query::Tokens.new
       end
-      
+
     end
-    
+
   end
-  
+
 end
\ No newline at end of file
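
A few illustrative notes on the changes above. The common thread in 1.5.3 is that the option methods now validate their arguments up front instead of letting a bad value surface later, mid-tokenizing. A minimal standalone sketch of that behaviour (TokenizerSketch is a hypothetical class for illustration, not part of picky; the helper mirrors check_argument_in from the diff):

    class TokenizerSketch
      # Mirrors the new check_argument_in helper: `===` lets a Class
      # such as Regexp act as the type test for the given argument.
      def check_argument_in method, type, argument
        raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
      end

      # As of 1.5.3, stopwords accepts only a Regexp. A String would work
      # with gsub!, but is rejected for clarity (see the comment in the diff).
      def stopwords regexp
        check_argument_in __method__, Regexp, regexp
        @remove_stopwords_regexp = regexp
      end
    end

    sketch = TokenizerSketch.new
    sketch.stopwords(/\b(and|or|the)\b/)   # fine
    begin
      sketch.stopwords('and|or|the')
    rescue ArgumentError => e
      puts e.message  # Application#stopwords takes a Regexp as argument, not a String.
    end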
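
splits_text_on is the one option that now takes either a Regexp or a String, since String#split handles both. The guard deliberately tests Regexp === / String === rather than to_str, because a Symbol would not work with split (per the note in the diff). Both accepted forms, for illustration:

    'red, green,blue'.split(/,\s*/)   # => ["red", "green", "blue"]
    'red green blue'.split(' ')       # => ["red", "green", "blue"]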
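
substitutes_characters_with replaces its old "TODO Raise if it doesn't quack substitute?" with exactly that duck check: anything responding to #substitute passes the guard. A hypothetical no-op substituter that satisfies it:

    # Hypothetical stand-in satisfying the #substitute duck type; the
    # default CharacterSubstituters::WestEuropean substitutes Western
    # European characters instead of passing text through.
    class IdentitySubstituter
      def substitute text
        text
      end
    end

    # tokenizer.substitutes_characters_with IdentitySubstituter.new  # passes respond_to?(:substitute)
    # tokenizer.substitutes_characters_with Object.new               # raises ArgumentError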
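
Token rejection itself is unchanged: #initialize still defaults to reject_token_if &:blank?, so blank tokens are dropped. A sketch of the mechanism, assuming an ActiveSupport-style blank? (picky expects one to be defined; a minimal stand-in is included here so the snippet runs on its own):

    class String
      def blank?
        strip.empty?  # minimal stand-in for the blank? picky expects
      end
    end

    reject_condition = :blank?.to_proc        # what reject_token_if &:blank? captures
    tokens = ['picky', '', '   ', 'search']
    tokens.reject! &reject_condition          # as the reject method in the diff does
    p tokens                                  # => ["picky", "search"]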
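
The other addition, to_s, makes a tokenizer print its own configuration; the "Rejects tokens?" line even digs the definition line number out of the reject condition's to_s when it was given as a lambda. For a tokenizer built with removes_characters(/[^a-z\s]/) and stopwords(/\b(and|the)\b/) and otherwise default settings, the heredoc would render roughly like this (illustrative, not captured output):

    Removes characters: /[^a-z\s]/
    Stopwords: /\b(and|the)\b/
    Splits text on: /\s/
    Removes chars after split: -
    Normalizes words: -
    Rejects tokens? -
    Substitutes chars? -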