lib/picky/tokenizer.rb in picky-4.6.5 vs lib/picky/tokenizer.rb in picky-4.6.6

- old
+ new

@@ -6,10 +6,11 @@ # class Tokenizer extend Picky::Helpers::Identification include API::Tokenizer::CharacterSubstituter + include API::Tokenizer::Stemmer def self.default_indexing_with options = {} @indexing = from options end def self.indexing @@ -49,10 +50,11 @@ Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'} Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')} Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'} Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'} Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' } +Stems? #{@stemmer ? "Yes, using #{@stemmer}." : '-' } Case sensitive? #{@case_sensitive ? "Yes." : "-"} TOKENIZER end # Stopwords. @@ -133,10 +135,19 @@ @substituter = extract_character_substituter substituter end def substitute_characters text substituter?? substituter.substitute(text) : text end + + # Stems tokens with this stemmer. + # + def stems_with stemmer + @stemmer = extract_stemmer stemmer + end + def stem text + stemmer?? stemmer.stem(text) : text + end # Reject tokens after tokenizing based on the given criteria. # def rejects_token_if condition @reject_condition = condition @@ -173,12 +184,13 @@ # def check_argument_in method, type, argument, &condition raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument end - attr_reader :substituter + attr_reader :substituter, :stemmer alias substituter? substituter + alias stemmer? stemmer def initialize options = {} options = default_options.merge options options.each do |method_name, value| send method_name, value unless value.nil? @@ -194,10 +206,11 @@ stopwords /regexp/ splits_text_on /regexp/ or "String", default /\s/ normalizes_words [[/replace (this)/, 'with this \\1'], ...] rejects_token_if Proc/lambda, default :blank?.to_proc substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String) + stems_with Instance responds to #stem(String) case_sensitive true/false ERROR end def default_options @@ -257,9 +270,10 @@ # Downcases. # def tokens_for words words.collect! { |word| word.downcase!; word } if downcase? + words.collect! { |word| stem word } if stemmer? words end # Returns empty tokens. # \ No newline at end of file