Sha256: 414b9ea7c3a284b1d19ce28ff766c3f2148823abcd6fbd9c5f59b44b78b9db02
Contents?: true
Size: 1.98 KB
Versions: 1
Compression:
Stored size: 1.98 KB
Contents
module Classifier
  # Base class providing the text helpers defined below: category-name
  # formatting, punctuation stripping and stemmed word-frequency hashes.
  #
  # Relies on two collaborators that are defined elsewhere:
  # Lingua::Stemmer (stemming) and StopWords (per-language stop-word lists).
  class Base
    # Defaults applied to every option the caller did not supply.
    DEFAULT_OPTIONS = { :language => 'en', :encoding => 'UTF_8' }.freeze

    # Word-character patterns. POSIX bracket classes are used instead of \w
    # because, on Ruby 1.9 and later, \w only matches [a-zA-Z0-9_]; with \w,
    # accented or non-Latin letters were stripped out of words ("café"
    # became "caf") and then counted as stray symbols. For pure-ASCII text
    # [[:word:]] matches exactly what \w matches.
    WORD_CHARACTER     = /[[:word:]]/
    NON_WORD_CHARACTER = /[^[:word:]]/
    NON_WORD_NON_SPACE = /[^[:word:]\s]/

    # options - Hash of settings. :language defaults to 'en' and :encoding
    #           to 'UTF_8'; :lang_dir is read later when stop words are
    #           looked up. The whole hash is handed to Lingua::Stemmer.new.
    #
    # The hash is copied before the defaults are filled in, so the caller's
    # hash is never modified (and may be frozen).
    def initialize(options = {})
      merged = (options || {}).dup
      DEFAULT_OPTIONS.each do |name, default|
        merged[name] = default unless merged.key?(name)
      end
      @options = merged
    end

    # Turns an identifier-like value into a display name: underscores
    # become spaces and the first letter is capitalised.
    #   prepare_category_name(:very_interesting)  # => "Very interesting"
    def prepare_category_name(val)
      val.to_s.tr('_', ' ').capitalize
    end

    # Removes common punctuation symbols, returning a new string.
    # Most symbols are replaced by a space; apostrophes and hyphens are
    # deleted outright so the two halves of a word are joined.
    # E.g.,
    #   without_punctuation("it's a co-op, ok?")
    #   => "its a coop  ok "
    def without_punctuation(str)
      # The backslash in the set only escapes the pipe character for #tr;
      # backslashes in the input are left untouched.
      spaced = str.tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " ")
      spaced.delete("'-")
    end

    # Return a Hash of strings => ints. Each word in the string is stemmed,
    # and indexes to its frequency in the document. Runs of non-word
    # symbols (e.g. "!!") are counted as tokens of their own.
    def word_hash(str)
      words   = str.gsub(NON_WORD_NON_SPACE, "").split
      symbols = str.gsub(WORD_CHARACTER, " ").split
      word_hash_for_words(words + symbols)
    end

    # Return a word hash without extra punctuation or short symbols, just
    # stemmed words.
    def clean_word_hash(str)
      word_hash_for_words(str.gsub(NON_WORD_NON_SPACE, "").split)
    end

    # When a Classifier instance is serialized, it is saved with an instance
    # of Lingua::Stemmer that may not be initialized when deserialized later,
    # raising a "RuntimeError: Stemmer is not initialized".
    #
    # You can run remove_stemmer to force a new Stemmer to be initialized
    # the next time one is needed.
    def remove_stemmer
      @stemmer = nil
    end

    private

    # Lazily builds the stemmer from the stored options and memoizes it.
    def stemmer
      @stemmer ||= Lingua::Stemmer.new(@options)
    end

    # Counts the stemmed form of every token worth counting.
    #
    # words - Array of String tokens.
    #
    # Returns a Hash of stem => occurrence count. Keys that are absent
    # read as nil (the hash has no default value).
    def word_hash_for_words(words)
      counts = Hash.new
      skip_words = StopWords.for(@options[:language], @options[:lang_dir])
      # Options use "UTF_8"; Ruby's encoding names use "UTF-8".
      encoding_name = @options[:encoding].to_s.tr('_', '-')

      words.each do |word|
        word = downcase_word(word) if word =~ WORD_CHARACTER
        # Decide first, stem second: skipped tokens are never stemmed.
        next unless countable?(word, skip_words)

        key = stemmer.stem(word)
        if defined?(Encoding) && key.respond_to?(:force_encoding)
          key.force_encoding(encoding_name)
        end
        counts[key] = counts.fetch(key, 0) + 1
      end

      counts
    end

    # A token is counted when it contains any non-word character (symbol
    # tokens are always kept), or when it is a word longer than two
    # characters that is not a stop word.
    def countable?(word, skip_words)
      return true if word =~ NON_WORD_CHARACTER

      !skip_words.include?(word) && word.length > 2
    end

    # Lower-cases a word. Uses ActiveSupport's multibyte proxy when it is
    # loaded and falls back to String#downcase otherwise.
    def downcase_word(word)
      if word.respond_to?(:mb_chars)
        word.mb_chars.downcase.to_s
      else
        word.downcase
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
logankoester-classifier-1.4.3 | lib/classifier/base.rb |