Sha256: 940d7f4f907f8776f0ea26e0d3ec9c9deb64bcaebc208f26ddf6dcc6aeb225de

Contents?: true

Size: 937 Bytes

Versions: 2

Compression:

Stored size: 937 Bytes

Contents

module Semantic
  module Transform
    # Re-weights a term/document matrix in place using a TF-IDF scheme.
    #
    # The code treats each matrix column as a document (the document count is
    # matrix.size2) and each row as a term. Every non-zero cell is replaced by
    #
    #   sqrt(weight) * (1 + log(documents / (documents_with_term + 1)))
    #
    # Zero cells are left untouched.
    class TFIDF

      # Applies the TF-IDF weighting to +matrix+, mutating it in place.
      #
      # matrix - a GSL-style matrix responding to size2, transpose, each_row,
      #          row, [] and []=.
      #
      # The value returned is whatever the row iteration over the transposed
      # matrix returns; callers should rely on the in-place mutation of
      # +matrix+ rather than on the return value.
      def self.transform!(matrix)
        number_of_documents = matrix.size2
        # Per-call memo of "documents containing term", indexed by row. Keeping
        # it local (instead of in class-level state) means counts from one
        # matrix can never leak into a later call made with another matrix.
        documents_with_term = []

        # Iterate over a transposed copy so each yielded row is one document
        # (a column of the original); writes go to the original matrix.
        matrix.transpose.enum_for(:each_row).with_index do |document, column_index|
          document.enum_for(:each).with_index do |term_weight, row_index|
            next if term_weight == 0.0

            documents_with_term[row_index] ||= number_of_documents_with_term(row_index, matrix)
            inverse_document_frequency = 1 + GSL::Sf.log(number_of_documents / (documents_with_term[row_index].to_f + 1))
            term_frequency = Math.sqrt(term_weight)

            matrix[row_index, column_index] = term_frequency * inverse_document_frequency
          end
        end
      end

      # Counts the non-zero entries of row +row_index+ in +matrix+, i.e. the
      # number of documents in which that term occurs.
      #
      # The count is always computed from the matrix passed in. Returns 0 when
      # +where+ yields nil (NOTE(review): rb-gsl is understood to return nil
      # from Vector#where when no element is non-zero -- confirm).
      def self.number_of_documents_with_term(row_index, matrix)
        non_zero_indices = matrix.row(row_index).where
        non_zero_indices ? non_zero_indices.size : 0
      end

    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
rsemantic-0.2.1 lib/semantic/transform/tf_idf_transform.rb
rsemantic-0.2.0 lib/semantic/transform/tf_idf_transform.rb