Sha256: ef8537e09d0a81ed536d75161d853aecf8f2559e740c15c994a1f2e80e0ab33f
Contents?: true
Size: 1.36 KB
Versions: 1
Compression:
Stored size: 1.36 KB
Contents
module Frekwenza
  # Computes term frequency (TF), inverse document frequency (IDF) and the
  # TF-IDF score of every term in a collection of text documents.
  #
  # Documents are lower-cased and every character outside a-z / 0-9 is
  # treated as a separator, so only ASCII alphanumeric tokens are scored.
  class TfIdf
    # @return [Array<Hash{String => Integer}>] per-document term counts, in
    #   input order; each hash yields 0 for terms absent from the document
    # @return [Hash{String => Float}] log10(document count / number of
    #   documents containing the term), keyed by term
    # @return [Array<Hash{String => Float}>] per-document TF-IDF scores,
    #   highest first, truncated to +limit+ entries
    attr_reader :tf, :idf, :tf_idf

    # @param docs [Enumerable<String>] the documents to analyse
    # @param limit [Integer] maximum number of terms kept per document;
    #   0 keeps none
    # @param stop_words_file [Object, nil] when given, it is handed to
    #   StopWords.new and the resulting stop words are dropped from the
    #   TF-IDF output (they still appear in +tf+ and +idf+)
    # @raise [ArgumentError] if +limit+ is negative
    def initialize(docs, limit, stop_words_file = nil)
      @docs = split_docs(docs)
      @tf = []
      @idf = {}
      @tf_idf = []
      @docs_size = @docs.size
      calculate_tf_and_idf
      calculate_tf_idf(limit, stop_words_file)
    end

    private

    # Tokenises each document into an array of lower-case ASCII
    # alphanumeric words.
    def split_docs(docs)
      docs.map { |doc| doc.downcase.gsub(/[^a-z0-9]/, ' ').split(' ') }
    end

    # Fills @tf with per-document term counts and @idf with each term's
    # inverse document frequency.
    def calculate_tf_and_idf
      doc_freq = Hash.new(0)

      @docs.each do |words|
        term_freq = words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
        @tf.push(term_freq)
        # Each distinct term counts once per document it appears in.
        term_freq.each_key { |word| doc_freq[word] += 1 }
      end

      # Document frequencies are counted as integers and the logarithm is
      # taken once per term, so a term present in every document gets an
      # IDF of exactly 0.0 instead of accumulating floating-point drift.
      doc_freq.each do |word, count|
        @idf[word] = Math.log10(@docs_size / count.to_f)
      end
    end

    # Fills @tf_idf with, for each document, the +limit+ highest-scoring
    # terms (count * IDF), excluding stop words when a file is supplied.
    def calculate_tf_idf(limit, stop_words_file)
      # The stop-word list does not depend on the document, so it is
      # loaded once rather than once per document.
      stop_words = stop_words_file ? StopWords.new(stop_words_file).stop_words : nil

      @tf.each do |term_freq|
        scores = {}
        term_freq.each { |term, count| scores[term] = @idf[term] * count }
        scores.reject! { |term, _score| stop_words.include?(term) } if stop_words_file

        # first(limit) yields no entries for a limit of 0; a 0..limit-1
        # range would turn into 0..-1 and return every entry.
        ranked = scores.sort_by { |_term, score| -score }
        @tf_idf.push(Hash[ranked.first(limit)])
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
frekwenza-0.0.1 | lib/frekwenza/tf_idf.rb |