lib/semantic/transform/tf_idf_transform.rb in rsemantic-0.1.3 vs lib/semantic/transform/tf_idf_transform.rb in rsemantic-0.1.4

- old
+ new

@@ -1,41 +1,30 @@ module Semantic module Transform class TFIDF @@number_of_documents_with_term = [] - - def self.transform(matrix) - number_of_documents = matrix.num_columns + def self.transform!(matrix) + number_of_documents = matrix.size2 @@number_of_documents_with_term = [] - matrix.columns.each_with_index do |document, column_index| - document_term_total = document.rows.inject(0.0) {|word_sum, word_count| word_sum + word_count.to_f } + matrix.transpose.enum_for(:each_row).with_index do |document, column_index| + document_term_total = document.sum - document.rows.each_with_index do |term_weight, row_index| - unless term_weight.to_f == 0.0 - matrix[row_index, column_index] = (term_weight / document_term_total) * - Math.log((number_of_documents / number_of_documents_with_term(row_index, matrix).to_f).abs) + document.enum_for(:each).with_index do |term_weight, row_index| + unless term_weight == 0.0 + inverse_document_frequency = GSL::Sf.log((number_of_documents / + number_of_documents_with_term(row_index, matrix).to_f).abs) + term_frequency = (term_weight / document_term_total) + + matrix[row_index, column_index] = term_frequency * inverse_document_frequency end end end - matrix end def self.number_of_documents_with_term(row_index, matrix) - return @@number_of_documents_with_term[row_index] unless @@number_of_documents_with_term[row_index].nil? - - term_document_occurences = 0 - - rows,cols = matrix.dimensions - - for n in (0...cols) - if matrix[row_index, n] > 0 #Term appears in document - term_document_occurences += 1 - end - end - @@number_of_documents_with_term[row_index] = term_document_occurences - @@number_of_documents_with_term[row_index] + @@number_of_documents_with_term[row_index] ||= matrix.row(row_index).where.size end end end end