module ActsAsTokenizable
  module StringExtensions
    # converts accented letters into ascii equivalents (e.g. ñ becomes n)
    def self.normalize(str)
      str.mb_chars.normalize(:d).gsub(/[^\x00-\x7F]/, '').to_s
    end

    # returns true if the string is numeric, false otherwise
    def self.numeric?(str)
      true if Float(str) rescue false
    end

    # returns an array with the words in this string, dropping spaces, strange chars, etc.
    def self.words(str)
      str.gsub(/[^\w|-]/, ' ').split
    end

    # removes certain words from a string.
    # As a side effect, all word separators are converted to the separator char.
    def self.remove_words(str, words_array, separator = ' ')
      (words(str) - words_array).join separator
    end

    # replaces certain words in a string.
    # As a side effect, all word separators are converted to the separator char.
    def self.replace_words(str, replacements, separator = ' ')
      replaced_words = words(str)
      replacements.each do |candidates, replacement|
        candidates.each do |candidate|
          replaced_words = replaced_words.collect { |w| w == candidate ? replacement : w }
        end
      end
      replaced_words.join separator
    end

    # returns an array that contains, in order:
    # * the numeric parts, converted to numbers
    # * the non-numeric parts, as text
    # This is useful for sorting alphanumerically. For example:
    #   ["A1", "A12", "A2"].sort_by { |x| alphanumerics(x) } => ["A1", "A2", "A12"]
    #
    # Inspired by: http://blog.labnotes.org/2007/12/13/rounded-corners-173-beautiful-code/
    def self.alphanumerics(str)
      str.split(/(\d+)/).map { |v| v =~ /\d/ ? v.to_i : v }
    end

    # converts the string into something that can be used as an indexing key
    def self.to_token(str, max_length = 255)
      str = normalize(str).strip.downcase.gsub(/[^\w|-]/, '') # keep only word chars and hyphens (-)
      str = str.squeeze unless numeric?(str) # collapse consecutive duplicate chars, except on pure numbers
      str[0..(max_length - 1)]
    end

    # converts the string into something that can be used in links
    def self.to_slug(str, separator = '-')
      words(normalize(str.strip.downcase)).join(separator)
    end

    # tokenizes each word individually and joins the resulting tokens with the separator char
    def self.words_to_token(str, max_length = 255, separator = ' ')
      words(str).collect { |w| to_token(w) }.uniq.join(separator)[0..(max_length - 1)]
    end
  end
end
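
# A minimal usage sketch (illustrative only, not part of the library). It assumes
# ActiveSupport is loaded and still provides mb_chars.normalize, which .normalize
# above relies on; the inputs and the "# =>" results are assumed examples, not
# output captured from this gem.
#
#   require 'active_support/core_ext/string/multibyte'
#
#   S = ActsAsTokenizable::StringExtensions
#   S.to_slug("Café con Leche")    # => "cafe-con-leche"
#   S.to_token("Año Nuevo!!")      # => "anonuevo"
#   ["A1", "A12", "A2"].sort_by { |s| S.alphanumerics(s) }  # => ["A1", "A2", "A12"]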