lib/company/mapping/tfidf/tfidf.rb in company-mapping-0.1.0 vs lib/company/mapping/tfidf/tfidf.rb in company-mapping-0.2.0
- old
+ new
@@ -1,9 +1,10 @@
module Company
module Mapping
-#TFIDF class implements Term Frequency Inverse Document Frequency statistic.
+#TFIDF class implements the Term Frequency–Inverse Document Frequency (tf-idf) statistic: a numerical
+# statistic that is intended to reflect how important a word is to a document in a collection or corpus.
class TFIDF
attr_accessor :tf, :idf
def initialize(corpus)
@corpus = corpus
@@ -11,70 +12,41 @@
#Calculates the tf-idf weights in the given corpus
def calculate
@tfidf = Hash.new
- if (@idf==nil)
- @idf = InverseDocumentFrequency.new(@corpus)
- end
-
- if (@tf==nil)
- _tokenizer = BasicTokenizer.new
- @tf = NormalizedTermFrequency.new(_tokenizer)
- end
-
+ @idf ||= InverseDocumentFrequency.new(@corpus)
+ @tf ||= NormalizedTermFrequency.new(BasicTokenizer.new)
@idf_weights = @idf.calculate
- @corpus.each {
- |doc|
+ @corpus.each do |doc|
+ termfreq = @tf.calculate(doc.contents)
- _termfreq = @tf.calculate(doc.contents)
-
- _tfidf_weights = Hash.new
-
- _termfreq.each do |term, tf|
- _weight = tf * @idf_weights[term]
- _tfidf_weights[term] = _weight
- end
-
- @tfidf[doc.id] = _tfidf_weights
- }
- return @tfidf
+ @tfidf[doc.id] =
+ termfreq.each_with_object({}) do |(term, tf), tfidf_weights|
+ weight = tf * @idf_weights[term]
+ tfidf_weights[term] = weight
+ end
+ end
+ @tfidf
end
#Calculates the tf-idf weights of a new incoming document without adding the document to the corpus or re-calculating the tf-idf weights for the entire corpus
def calculate_tfidf_weights_of_new_document(new_doc)
- _termfreq = @tf.calculate(new_doc.contents)
+ termfreq = @tf.calculate(new_doc.contents)
- _tfidf_weights = Hash.new
-
- _termfreq.each do |term, tf|
- if (@idf_weights.has_key? term)
- _weight = tf * @idf_weights[term]
- else
- _weight = tf * @idf.maxIDF
- end
- _tfidf_weights[term] = _weight
+ @tfidf[new_doc.id] = termfreq.each_with_object({}) do |(term, tf), tfidf_weights|
+ weight = tf * (@idf_weights[term] || @idf.maxIDF)
+ tfidf_weights[term] = weight
end
- @tfidf[new_doc.id] = _tfidf_weights
- return @tfidf
+ @tfidf
end
#Calculates the tf-idf similarity between two given documents, i.e. the
#Cosine Similarity computed from their tf*idf weights.
def similarity(doc1_id, doc2_id)
- if (@tfidf==nil)
- calculate
- end
-
- _cosine_similarity = CosineSimilarity.new
- return _cosine_similarity.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
- end
-
- def info
- " term frequency–inverse document frequency, is a numerical "
- +"statistic that is intended to reflect how important a word "
- +"is to a document in a collection or corpus"
+ @tfidf ||= calculate
+ CosineSimilarity.new.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
end
end
end
end
\ No newline at end of file