lib/picky/internals/tokenizers/base.rb in picky-1.5.2 vs lib/picky/internals/tokenizers/base.rb in picky-1.5.3
- old
+ new
@@ -1,20 +1,37 @@
module Internals
-
+
module Tokenizers # :nodoc:all
-
+
# Defines tokenizing processes used both in indexing and querying.
#
class Base
-
+
# TODO Move EMPTY_STRING top level.
#
EMPTY_STRING = ''.freeze
-
+
+ def to_s
+ reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
+ <<-TOKENIZER
+Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
+Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
+Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
+Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
+Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
+Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
+Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
+TOKENIZER
+ end
+
# Stopwords.
#
+ # We only allow regexps (even if a string would also
+ # work for gsub! - it is harder to understand).
+ #
def stopwords regexp
+ check_argument_in __method__, Regexp, regexp
@remove_stopwords_regexp = regexp
end
def remove_stopwords text
text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
text
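The new check_argument_in guard (defined further down in this diff) turns a
misconfigured tokenizer into an immediate ArgumentError instead of a later
gsub! type error. A minimal sketch of the effect, assuming the public API as
shown here (the failing call is deliberately wrong):

    tokenizer = Internals::Tokenizers::Base.new
    tokenizer.stopwords(/\b(and|the|of|it)\b/i)  # ok: a Regexp
    tokenizer.stopwords('and')                   # 1.5.3: raises ArgumentError,
    # "Application#stopwords takes a Regexp as argument, not a String."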
@@ -22,82 +39,101 @@
@@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
def remove_non_single_stopwords text
return text if text.match @@non_single_stopword_regexp
remove_stopwords text
end
-
+
# Illegals.
#
- # TODO Should there be a legal?
+ # We only allow regexps (even if a string would also
+ # work for gsub! - it is harder to understand).
#
def removes_characters regexp
+ check_argument_in __method__, Regexp, regexp
@removes_characters_regexp = regexp
end
def remove_illegals text
text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
text
end
-
+
# Splitting.
#
- def splits_text_on regexp
- @splits_text_on_regexp = regexp
+ # We allow Strings and Regexps.
+ # Note: We do not test against to_str since symbols do not work with String#split.
+ #
+ def splits_text_on regexp_or_string
+ raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
+ @splits_text_on = regexp_or_string
end
def split text
- text.split @splits_text_on_regexp
+ text.split @splits_text_on
end
-
+
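With the rename from @splits_text_on_regexp to @splits_text_on, the splitter
now explicitly accepts either a Regexp or a String (both are valid arguments
to String#split); anything else raises. A sketch, reusing the hypothetical
tokenizer instance from above:

    tokenizer.splits_text_on(/[\s,]+/)  # Regexp: ok
    tokenizer.splits_text_on(', ')      # String: ok
    tokenizer.splits_text_on(:comma)    # ArgumentError: "splits_text_on takes
                                        # a Regexp or String as argument, not a Symbol."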
# Normalizing.
#
+ # We only allow arrays.
+ #
def normalizes_words regexp_replaces
+ raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
@normalizes_words_regexp_replaces = regexp_replaces
end
def normalize_with_patterns text
return text unless @normalizes_words_regexp_replaces
-
+
@normalizes_words_regexp_replaces.each do |regex, replace|
# This should be sufficient
#
text.gsub!(regex, replace) and break
end
remove_after_normalizing_illegals text
text
end
-
+
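normalizes_words expects an Array of [regexp, replacement] pairs. Note that
String#gsub! returns nil when nothing was substituted, so the
"text.gsub!(regex, replace) and break" line applies at most the first pattern
that actually matches. A hypothetical configuration:

    tokenizer.normalizes_words([
      [/\$(\d+)/,    'dollar_\1'],  # "$50" => "dollar_50"
      [/(\w+)str\./, '\1strasse']   # only tried if the first pattern missed
    ])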
# Illegal after normalizing.
#
+ # We only allow regexps (even if a string would also
+ # work for gsub! - it is harder to understand).
+ #
def removes_characters_after_splitting regexp
+ check_argument_in __method__, Regexp, regexp
@removes_characters_after_splitting_regexp = regexp
end
def remove_after_normalizing_illegals text
text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
end
-
+
# Substitute Characters with this substituter.
#
# Default is European Character substitution.
#
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
- # TODO Raise if it doesn't quack substitute?
+ raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
@substituter = substituter
end
def substitute_characters text
- substituter?? substituter.substitute(text) : text
+ substituter?? substituter.substitute(text) : text
end
-
+
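substitutes_characters_with now duck-types: any object responding to
#substitute is accepted, not just the bundled
CharacterSubstituters::WestEuropean. A sketch with a hypothetical custom
substituter class:

    class Asciifier # hypothetical - anything responding to #substitute works
      def substitute text
        text.tr 'äöü', 'aou'
      end
    end
    tokenizer.substitutes_characters_with Asciifier.new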
# Reject tokens after tokenizing based on the given criteria.
#
# Note: Currently only for indexing. TODO Redesign and write for both!
#
def reject_token_if &condition
@reject_condition = condition
end
def reject tokens
tokens.reject! &@reject_condition
end
-
-
+
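reject_token_if stores the block and later hands it to Array#reject!. The
default wired up in initialize below is :blank?, converted to a proc by the
& operator. A hypothetical stricter condition:

    tokenizer.reject_token_if { |token| token.to_s.size < 2 }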
+ # Checks if the right argument type has been given.
+ #
+ def check_argument_in method, type, argument, &condition
+ raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
+ end
+
+
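check_argument_in relies on case equality (the &condition parameter is
accepted but unused here): for a class or module, type === argument is
equivalent to argument.is_a?(type), so:

    Regexp === /foo/  # => true
    Regexp === 'foo'  # => false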
# Returns a number of tokens, generated from the given text.
#
# Note:
# * preprocess, pretokenize are hooks
#
@@ -107,31 +143,31 @@
words = pretokenize text # splitting and preparations for tokenizing
return empty_tokens if words.empty?
tokens = tokens_for words # creating tokens / strings
process tokens # processing tokens / strings
end
-
+
attr_reader :substituter
alias substituter? substituter
-
+
def initialize options = {}
removes_characters options[:removes_characters] if options[:removes_characters]
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
stopwords options[:stopwords] if options[:stopwords]
normalizes_words options[:normalizes_words] if options[:normalizes_words]
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
-
+
# Defaults.
#
splits_text_on options[:splits_text_on] || /\s/
reject_token_if &(options[:reject_token_if] || :blank?)
end
-
+
# Hooks.
#
-
+
# Preprocessing.
#
def preprocess text; end
# Pretokenizing.
#
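Taken together, initialize lets all of the guarded setters above be driven
from one options hash, with /\s/ splitting and :blank? rejection as defaults.
A hypothetical full configuration:

    tokenizer = Internals::Tokenizers::Base.new(
      removes_characters: /[^a-z0-9\s]/i,
      stopwords:          /\b(and|the|of|it)\b/i,
      splits_text_on:     /[\s\/-]/,
      normalizes_words:   [[/(\w+)\./, '\1']],
      reject_token_if:    lambda { |token| token.to_s.size < 2 }
    )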
@@ -140,11 +176,11 @@
#
def process tokens
reject tokens # Reject any tokens that don't meet criteria
tokens
end
-
+
# Converts words into real tokens.
#
def tokens_for words
Internals::Query::Tokens.new words.collect! { |word| token_for word }
end
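Per the Note above, preprocess and pretokenize are hooks; the visible body of
tokenize runs pretokenize -> tokens_for -> process (which applies reject). A
hypothetical subclass customizing the pretokenize hook:

    class Downcasing < Internals::Tokenizers::Base # hypothetical subclass
      def pretokenize text
        split text.downcase  # reuse Base#split after lowercasing
      end
    end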
@@ -156,11 +192,11 @@
# Returns a tokens object.
#
def empty_tokens
Internals::Query::Tokens.new
end
-
+
end
-
+
end
-
+
end
\ No newline at end of file
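For reference, the new #to_s renders each setting on its own line. For a
tokenizer configured with the regexps sketched above but left with the default
:blank? rejection and no substituter, the output would look roughly like this
(illustrative, not captured from a real run; note that Regexp#source drops
flags, and the reject line appears only when the condition's Proc#to_s matches
the "(lambda)" pattern, with the app/application.rb file name hard-coded):

    Removes characters: /[^a-z0-9\s]/
    Stopwords: /\b(and|the|of|it)\b/
    Splits text on: /[\s\/-]/
    Removes chars after split: -
    Normalizes words: [[/(\w+)\./, "\\1"]]
    Rejects tokens? -
    Substitutes chars? -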