lib/epitome/corpus.rb in epitome-0.2.0 vs lib/epitome/corpus.rb in epitome-0.2.1
- old
+ new
@@ -1,7 +1,8 @@
require 'matrix'
require 'stopwords'
+require 'pry'
module Epitome
class Corpus
attr_reader :original_corpus
def initialize(document_collection, lang="en")
@@ -93,10 +94,13 @@
end
def idf(word)
  # Inverse document frequency of +word+: log(N / n_w), where N is @n_docs
  # and n_w is the value returned by n_docs_including_w(word), i.e. the
  # number of documents in which the word appears.
  #
  # Returns a Float. Raises ZeroDivisionError when n_w is zero.
  docs_with_word = n_docs_including_w(word)

  # Fail loudly for a word found in no document: the float division below
  # would otherwise produce Infinity/NaN instead of raising.
  raise ZeroDivisionError, 'divided by 0' if docs_with_word.zero?

  # fdiv forces float division, so counts are not truncated before the
  # logarithm is taken (assuming both are Integers, 10 / 6 would collapse
  # to 1 and the word would wrongly look like it occurs in every document).
  result = Math.log(@n_docs.fdiv(docs_with_word))

  # A word present in every document gives log(1) == 0. Return 1.0 instead
  # so that its tf-idf is not wiped out by a multiplication by 0.
  result.zero? ? 1.0 : result
end
def tf(sentence, word)
# Number of occurrences of word in sentence
sentence.scan(word).count