require 'set' # Set is not autoloaded on older Rubies

module BowTfidf
  # Splits raw text into a set of normalized tokens for the bag-of-words / TF-IDF pipeline.
  class Tokenizer
    SPLIT_REGEX = /[\s\n\t\.,\-\!:()\/%\\+\|@^<«>*'~;=»\?—•$”\"’\[£“■‘\{#®♦°™€¥\]©§\}–]/
    TOKEN_MIN_LENGTH = 3
    TOKEN_MAX_LENGTH = 15

    attr_reader :tokens

    def initialize
      @tokens = Set[]
    end

    def call(text)
      raise(ArgumentError, 'String instance expected') unless text.is_a?(String)

      raw_tokens = split(text)
      raw_tokens.each do |token|
        process_token(token)
      end

      tokens
    end

    private

    def split(text)
      text.split(SPLIT_REGEX)
    end

    def process_token(token)
      return if token.length < TOKEN_MIN_LENGTH
      return if token.length > TOKEN_MAX_LENGTH
      return if token.scan(/\D/).empty? # skip tokens that contain only digits

      tokens << token.downcase
    end
  end
end
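A quick usage sketch, assuming the class above is already loaded; the sample sentence and the result shown in the comment are illustrative, traced from the filtering rules, not output from the original project. Tokens shorter than TOKEN_MIN_LENGTH, longer than TOKEN_MAX_LENGTH, or made up only of digits are dropped, and everything kept is downcased:

tokenizer = BowTfidf::Tokenizer.new
tokenizer.call('Ruby tokenizers split on punctuation: 2024, a, BOW!')
# "on" and "a" are shorter than TOKEN_MIN_LENGTH and "2024" contains only digits,
# so they are dropped; the remaining tokens are stored downcased:
# => #<Set: {"ruby", "tokenizers", "split", "punctuation", "bow"}>

Note that @tokens lives on the instance, so repeated calls to #call accumulate tokens into the same set; create a fresh Tokenizer per document if separate per-document token sets are needed.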