Sha256: 3687f6e45f6656b86bcc71c117bfaee196176eb607abd0ac3252697aed61a4b8
Contents?: true
Size: 1.84 KB
Versions: 1
Compression:
Stored size: 1.84 KB
Contents
module Classifier class Base def initialize(options = {}) options.reverse_merge!(:language => 'en') options.reverse_merge!(:encoding => 'UTF_8') @options = options end def prepare_category_name val val.to_s.gsub("_"," ").capitalize.intern end # Removes common punctuation symbols, returning a new string. # E.g., # "Hello (greeting's), with {braces} < >...?".without_punctuation # => "Hello greetings with braces " def without_punctuation str str.tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "") end # Return a Hash of strings => ints. Each word in the string is stemmed, # interned, and indexes to its frequency in the document. def word_hash str word_hash_for_words(str.gsub(/[^\w\s]/,"").split + str.gsub(/[\w]/," ").split) end # Return a word hash without extra punctuation or short symbols, just stemmed words def clean_word_hash str word_hash_for_words str.gsub(/[^\w\s]/,"").split end # When a Classifier instance is serialized, it is saved with an instance # of Lingua::Stemmer that may not be initialized when deserialized later, # raising a "RuntimeError: Stemmer is not initialized". # # You can run remove_stemmer to force a new Stemmer to be initialized. def remove_stemmer @stemmer = nil end private def stemmer @stemmer ||= Lingua::Stemmer.new(@options) end def word_hash_for_words(words) d = Hash.new skip_words = StopWords.for(@options[:language], @options[:lang_dir]) words.each do |word| word = word.mb_chars.downcase.to_s if word =~ /[\w]+/ key = stemmer.stem(word).intern if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2 d[key] ||= 0 d[key] += 1 end end return d end end end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
luisparravicini-classifier-1.4.1 | lib/classifier/base.rb |