require File.expand_path('stopwords', File.dirname(__FILE__))

module Wukong
  module Helper
    module Tokenize
      #
      # Split a string into its constituent words.
      #
      # This is pretty simpleminded:
      # * Downcase the text.
      # * Split at any character that is not an ASCII letter or digit
      #   (this includes '_' and '-', so hyphenated words are split).
      # * However, preserve an apostrophe that introduces one of the
      #   contraction endings 's, 't, 'd, 'm, 're, 've or 'll.
      # * Drop every word shorter than 3 characters and every word found in
      #   Wukong::Corpus::STOPWORDS_3.
      #
      #   tokenize("Ability is a poor man's wealth #johnwoodenquote")
      #   # "is" and "a" are dropped for being too short; assuming none of the
      #   # remaining words is listed in STOPWORDS_3, the result is
      #   # => ["ability", "poor", "man's", "wealth", "johnwoodenquote"]
      #
      # @param str [String, nil] text to tokenize; nil and false yield [],
      #   any other object is converted with #to_s first
      # @return [Array<String>] the surviving lowercase words, in order
      #
      def self.tokenize(str)
        # nil/false carry no words. Empty or whitespace-only input falls
        # through and naturally produces [] below, so no dependency on the
        # non-core `blank?` extension is needed.
        return [] unless str

        text = str.to_s.downcase

        # Kill off all punctuation except apostrophes. The text is already
        # lowercase, so only a-z and 0-9 need to be kept.
        text = text.gsub(/[^a-z0-9']+/, ' ')

        # Protect contraction apostrophes by swapping them for '!', which
        # cannot occur in the text any more after the pass above; blank out
        # every other apostrophe, then restore the protected ones.
        text = text.gsub(/(\w)'([stdm]|re|ve|ll)\b/, '\1!\2').tr("'", ' ').tr('!', "'")

        # Busticate at whitespace, then discard short words and stopwords.
        # (A leading empty field from split is removed by the length check.)
        text.split(/\s+/).reject do |word|
          word.length < 3 || Wukong::Corpus::STOPWORDS_3.include?(word)
        end
      end
    end
  end
end