Sha256: 2603ac1dec3eb22229f05ca723cfa2e9d96e76f619fa5a28e0d146b2d8ab65c5

Contents?: true

Size: 1.8 KB

Versions: 4

Compression:

Stored size: 1.8 KB

Contents

module Company
  module Mapping

    # TFIDF implements the term frequency-inverse document frequency (tf-idf) statistic,
    # a numerical statistic intended to reflect how important a word is to a document
    # in a collection or corpus.
    #
    # The term-frequency and inverse-document-frequency calculators can be replaced
    # through the +tf+ and +idf+ accessors before the first calculation; when left
    # unset, NormalizedTermFrequency (with a BasicTokenizer) and
    # InverseDocumentFrequency are used.
    class TFIDF
      attr_accessor :tf, :idf

      # @param corpus [#each] collection of documents; every document is read
      #   through its +id+ and +contents+ methods
      def initialize(corpus)
        @corpus = corpus
      end

      # Calculates the tf-idf weights for every document in the corpus.
      #
      # Any weights stored by an earlier call (including documents added through
      # #calculate_tfidf_weights_of_new_document) are replaced.
      #
      # @return [Hash] document id => { term => tf-idf weight }
      def calculate
        @idf ||= InverseDocumentFrequency.new(@corpus)
        @tf ||= NormalizedTermFrequency.new(BasicTokenizer.new)
        @idf_weights = @idf.calculate

        # Build into a local and publish only once every document succeeded, so a
        # failure half-way through never leaves a partial table in @tfidf.
        weights_by_doc = {}
        @corpus.each do |doc|
          term_frequencies = @tf.calculate(doc.contents)

          weights_by_doc[doc.id] =
            term_frequencies.each_with_object({}) do |(term, frequency), doc_weights|
              doc_weights[term] = frequency * @idf_weights[term]
            end
        end
        @tfidf = weights_by_doc
      end

      # Calculates the tf-idf weights of a new incoming document without importing
      # the document in the corpus and re-calculating the weights for the entire
      # corpus. Terms that have no idf weight in the corpus are weighted with
      # +@idf.maxIDF+.
      #
      # The corpus weights are calculated first if that has not happened yet.
      #
      # @param new_doc [#id, #contents] the document to weight
      # @return [Hash] the full table (document id => { term => weight }),
      #   now including the entry for +new_doc+
      def calculate_tfidf_weights_of_new_document(new_doc)
        # @tf, @idf and @idf_weights only exist after #calculate has run.
        calculate unless @tfidf

        term_frequencies = @tf.calculate(new_doc.contents)

        @tfidf[new_doc.id] =
          term_frequencies.each_with_object({}) do |(term, frequency), doc_weights|
            doc_weights[term] = frequency * (@idf_weights[term] || @idf.maxIDF)
          end
        @tfidf
      end

      # Calculates the tf-idf similarity between two documents by handing their
      # tf-idf weight hashes to CosineSimilarity. The corpus weights are
      # calculated first if that has not happened yet.
      #
      # @param doc1_id [Object] id of the first document
      # @param doc2_id [Object] id of the second document
      # @return [Object] whatever CosineSimilarity#calculate returns
      def similarity(doc1_id, doc2_id)
        @tfidf ||= calculate
        CosineSimilarity.new.calculate(@tfidf[doc1_id], @tfidf[doc2_id])
      end
    end

  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
company-mapping-0.2.3 lib/company/mapping/tfidf/tfidf.rb
company-mapping-0.2.2 lib/company/mapping/tfidf/tfidf.rb
company-mapping-0.2.1 lib/company/mapping/tfidf/tfidf.rb
company-mapping-0.2.0 lib/company/mapping/tfidf/tfidf.rb