lib/picky/tokenizer.rb in picky-4.6.3 vs lib/picky/tokenizer.rb in picky-4.6.4
- old
+ new
@@ -4,27 +4,45 @@
# Defines tokenizing processes used both in indexing and querying.
#
class Tokenizer
- extend API::Tokenizer
-
+ extend Picky::Helpers::Identification
include API::Tokenizer::CharacterSubstituter
# Configures the default tokenizer used during indexing.
#
# Accepts a Hash of tokenizer options, or any object responding
# to #tokenize — see .from for the normalization rules.
def self.default_indexing_with options = {}
  @indexing = from(options)
end
# The default indexing tokenizer; lazily built with no options
# when none has been configured via .default_indexing_with.
def self.indexing
  @indexing = new unless @indexing
  @indexing
end
# Configures the default tokenizer used during searching.
#
# Accepts a Hash of tokenizer options, or any object responding
# to #tokenize — see .from for the normalization rules.
def self.default_searching_with options = {}
  @searching = from(options)
end
# The default searching tokenizer; lazily built with no options
# when none has been configured via .default_searching_with.
def self.searching
  @searching = new unless @searching
  @searching
end
+
# Normalizes +thing+ into something usable as a tokenizer.
#
# * nil — returns nil (no tokenizer configured).
# * an object responding to #tokenize — passed through unchanged.
# * anything responding to #[] (typically an options Hash) — wrapped
#   in a new Picky::Tokenizer.
# * anything else — raises with a descriptive configuration error;
#   index_name/category_name only feed that message via identifier_for.
#
# NOTE(review): the message interpolation has no space after
# identifier_for(...) — presumably identifier_for returns a trailing
# space or an empty string; confirm against Helpers::Identification.
def self.from thing, index_name = nil, category_name = nil
  return unless thing
  return thing if thing.respond_to? :tokenize
  unless thing.respond_to? :[]
    raise <<-ERROR
indexing options #{identifier_for(index_name, category_name)}should be either
* a Hash
or
* an object that responds to #tokenize(text) => [[token1, token2, ...], [original1, original2, ...]]
    ERROR
  end
  Picky::Tokenizer.new thing
end
def to_s
reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
<<-TOKENIZER
Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
@@ -118,11 +136,11 @@
substituter?? substituter.substitute(text) : text
end
# Reject tokens after tokenizing based on the given criteria.
#
# Installs the predicate used by #reject to filter tokens after
# tokenizing.
#
# Takes a callable (Proc, lambda, or a Symbol#to_proc result) as a
# plain argument — note the 4.6.4 API change from a block parameter.
def rejects_token_if condition
  @reject_condition = condition
end
# Destructively removes tokens matching the configured reject
# condition (see #rejects_token_if).
#
# Mutates +tokens+ in place; per Array#reject! semantics the return
# value is nil when nothing was removed.
def reject tokens
  tokens.reject!(&@reject_condition)
end
@@ -159,17 +177,21 @@
# The configured character substituter (nil when none was set).
attr_reader :substituter
# Predicate-style alias: truthy exactly when a substituter is configured.
alias substituter? substituter
# Builds a tokenizer from an options Hash, e.g.
#   Picky::Tokenizer.new removes_characters: /[-]/
#
# Every option key is dispatched as a same-named method call carrying
# its value; nil values are skipped, so an explicit +false+ (e.g. for
# case_sensitive) still gets through. Defaults (see #default_options)
# are merged underneath the user's options.
#
# Unknown option keys raise with a pointer to the wiki documentation.
# NOTE(review): the rescue also catches a NoMethodError raised *inside*
# a valid option method, which would then be misreported as an invalid
# option — confirm that is acceptable.
def initialize options = {}
  effective = default_options.merge options
  effective.each_pair do |option_name, option_value|
    send option_name, option_value unless option_value.nil?
  end
rescue NoMethodError => e
  raise %Q{The option "#{e.name}" is not a valid option for a Picky tokenizer.\nPlease see https://github.com/floere/picky/wiki/Indexing-configuration for valid options.}
end
# Baseline configuration merged under any user-supplied options:
# split text on whitespace, drop blank tokens.
#
# NOTE(review): :blank?.to_proc only builds the proc here — it relies
# on #blank? (ActiveSupport-style) existing on tokens when it is
# eventually called; presumably provided elsewhere in Picky.
def default_options
  defaults = {}
  defaults[:splits_text_on]   = /\s/
  defaults[:rejects_token_if] = :blank?.to_proc
  defaults
end
# Returns a number of tokens, generated from the given text,
# based on the parameters given.
#
\ No newline at end of file