Sha256: d67d2d38335949d78e8e867e03fbc97124a6439a2bf1a52a54a7c88a622acecf
Contents?: true
Size: 1.48 KB
Versions: 3
Compression:
Stored size: 1.48 KB
Contents
module Classifier class Base def initialize(options = {}) options.reverse_merge!(:language => 'en') options.reverse_merge!(:encoding => 'UTF_8') @options = options end def prepare_category_name val val.to_s.gsub("_"," ").capitalize.intern end # Removes common punctuation symbols, returning a new string. # E.g., # "Hello (greeting's), with {braces} < >...?".without_punctuation # => "Hello greetings with braces " def without_punctuation str str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "") end # Return a Hash of strings => ints. Each word in the string is stemmed, # interned, and indexes to its frequency in the document. def word_hash str word_hash_for_words(str.gsub(/[^\w\s]/,"").split + str.gsub(/[\w]/," ").split) end # Return a word hash without extra punctuation or short symbols, just stemmed words def clean_word_hash str word_hash_for_words str.gsub(/[^\w\s]/,"").split end private def stemmer @stemmer ||= Lingua::Stemmer.new(@options) end def word_hash_for_words(words) d = Hash.new skip_words = SkipWords.for(@options[:language]) words.each do |word| word = word.mb_chars.downcase.to_s if word =~ /[\w]+/ key = stemmer.stem(word).intern if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2 d[key] ||= 0 d[key] += 1 end end return d end end end
Version data entries
3 entries across 3 versions & 1 rubygems
Version | Path |
---|---|
luisparravicini-classifier-1.3.9 | lib/classifier/base.rb |
luisparravicini-classifier-1.3.8 | lib/classifier/base.rb |
luisparravicini-classifier-1.3.7 | lib/classifier/base.rb |