lib/picky/tokenizer.rb in picky-4.6.5 vs lib/picky/tokenizer.rb in picky-4.6.6
- old
+ new
@@ -6,10 +6,11 @@
#
class Tokenizer
extend Picky::Helpers::Identification
include API::Tokenizer::CharacterSubstituter
+ include API::Tokenizer::Stemmer
# Stores the result of `from(options)` in the class-level @indexing
# and returns it.
#
def self.default_indexing_with(options = {})
  @indexing = from(options)
end
def self.indexing
@@ -49,10 +50,11 @@
Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
+Stems? #{@stemmer ? "Yes, using #{@stemmer}." : '-' }
Case sensitive? #{@case_sensitive ? "Yes." : "-"}
TOKENIZER
end
# Stopwords.
@@ -133,10 +135,19 @@
@substituter = extract_character_substituter substituter
end
# Passes text through the substituter's #substitute when a substituter
# is set; otherwise hands text back unchanged.
#
def substitute_characters(text)
  return text unless substituter?

  substituter.substitute(text)
end
+
+ # Stems tokens with this stemmer.
+ #
+ def stems_with stemmer
# Whatever extract_stemmer returns for the argument is kept in @stemmer,
# which the stemmer / stemmer? readers expose.
+ @stemmer = extract_stemmer stemmer
+ end
# Returns stemmer.stem(text) when a stemmer is set; otherwise returns
# text itself, untouched.
#
+ def stem text
# Reads as `stemmer? ? stemmer.stem(text) : text` (predicate + ternary).
+ stemmer?? stemmer.stem(text) : text
+ end
# Reject tokens after tokenizing based on the given criteria.
#
def rejects_token_if condition
@reject_condition = condition
@@ -173,12 +184,13 @@
#
# Raises an ArgumentError unless `type === argument` holds.
#
# method   - name interpolated into the message after "Application#".
# type     - anything responding to #===; also shown in the message.
# argument - the value being checked; its class is shown in the message.
#
# The block parameter is accepted but not used here.
#
def check_argument_in(method, type, argument, &condition)
  return if type === argument

  message = "Application##{method} takes a #{type} as argument, not a #{argument.class}."
  raise ArgumentError, message
end
# Readers for the configured substituter and stemmer.
- attr_reader :substituter
+ attr_reader :substituter, :stemmer
# The ?-suffixed names are plain aliases of the readers, so they return
# the stored object itself (or nil when unset) rather than true/false.
alias substituter? substituter
+ alias stemmer? stemmer
def initialize options = {}
options = default_options.merge options
options.each do |method_name, value|
send method_name, value unless value.nil?
@@ -194,10 +206,11 @@
stopwords /regexp/
splits_text_on /regexp/ or "String", default /\s/
normalizes_words [[/replace (this)/, 'with this \\1'], ...]
rejects_token_if Proc/lambda, default :blank?.to_proc
substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
+ stems_with Instance responds to #stem(String)
case_sensitive true/false
ERROR
end
def default_options
@@ -257,9 +270,10 @@
# Downcases the words (if downcase?) and stems them (if a stemmer is set).
#
def tokens_for words
# Downcases every word in place; only done when downcase? is truthy.
words.collect! { |word| word.downcase!; word } if downcase?
# Replaces every word with the result of #stem; only done when a stemmer is set.
+ words.collect! { |word| stem word } if stemmer?
# The array passed in (mutated by the steps above) is the return value.
words
end
# Returns empty tokens.
#
\ No newline at end of file