lib/company/mapping/tfidf/tfidf.rb in company-mapping-0.1.0 vs lib/company/mapping/tfidf/tfidf.rb in company-mapping-0.2.0

- old
+ new

@@ -1,9 +1,10 @@ module Company module Mapping -#TFIDF class implements Term Frequency Inverse Document Frequency statistic. +#TFIDF class implements Term Frequency Inverse Document Frequency statistic. Term frequency–inverse document frequency, +# is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. class TFIDF attr_accessor :tf, :idf def initialize(corpus) @corpus = corpus @@ -11,70 +12,41 @@ #Calculates the tf-idf weights in the given corpus def calculate @tfidf = Hash.new - if (@idf==nil) - @idf = InverseDocumentFrequency.new(@corpus) - end - - if (@tf==nil) - _tokenizer = BasicTokenizer.new - @tf = NormalizedTermFrequency.new(_tokenizer) - end - + @idf ||= InverseDocumentFrequency.new(@corpus) + @tf ||= NormalizedTermFrequency.new(BasicTokenizer.new) @idf_weights = @idf.calculate - @corpus.each { - |doc| + @corpus.each do |doc| + termfreq = @tf.calculate(doc.contents) - _termfreq = @tf.calculate(doc.contents) - - _tfidf_weights = Hash.new - - _termfreq.each do |term, tf| - _weight = tf * @idf_weights[term] - _tfidf_weights[term] = _weight - end - - @tfidf[doc.id] = _tfidf_weights - } - return @tfidf + @tfidf[doc.id] = + termfreq.each_with_object({}) do |(term, tf), tfidf_weights| + weight = tf * @idf_weights[term] + tfidf_weights[term] = weight + end + end + @tfidf end #Calculates tfidf weights of new incoming document without importing the document in the corpus and re-calculating the tf-idf weights for the entire corpus def calculate_tfidf_weights_of_new_document(new_doc) - _termfreq = @tf.calculate(new_doc.contents) + termfreq = @tf.calculate(new_doc.contents) - _tfidf_weights = Hash.new - - _termfreq.each do |term, tf| - if (@idf_weights.has_key? term) - _weight = tf * @idf_weights[term] - else - _weight = tf * @idf.maxIDF - end - _tfidf_weights[term] = _weight + @tfidf[new_doc.id] = termfreq.each_with_object({}) do |(term, tf), tfidf_weights| + weight = tf * (@idf_weights[term] || @idf.maxIDF) + tfidf_weights[term] = weight end - @tfidf[new_doc.id] = _tfidf_weights - return @tfidf + @tfidf end #Calculates tf-idf similarity between two given documents. It is actually #the calculated Cosine Similarity by using tf*idf weights. def similarity(doc1_id, doc2_id) - if (@tfidf==nil) - calculate - end - - _cosine_similarity = CosineSimilarity.new - return _cosine_similarity.calculate(@tfidf[doc1_id], @tfidf[doc2_id]) - end - - def info - " term frequency–inverse document frequency, is a numerical " - +"statistic that is intended to reflect how important a word " - +"is to a document in a collection or corpus" + @tfidf ||= calculate + CosineSimilarity.new.calculate(@tfidf[doc1_id], @tfidf[doc2_id]) end end end end \ No newline at end of file