lib/semantic/transform/tf_idf_transform.rb in rsemantic-0.1.4 vs lib/semantic/transform/tf_idf_transform.rb in rsemantic-0.2.0
- old
+ new
@@ -6,16 +6,13 @@
def self.transform!(matrix)
number_of_documents = matrix.size2
@@number_of_documents_with_term = []
matrix.transpose.enum_for(:each_row).with_index do |document, column_index|
- document_term_total = document.sum
-
document.enum_for(:each).with_index do |term_weight, row_index|
unless term_weight == 0.0
- inverse_document_frequency = GSL::Sf.log((number_of_documents /
- number_of_documents_with_term(row_index, matrix).to_f).abs)
- term_frequency = (term_weight / document_term_total)
+ inverse_document_frequency = 1 + GSL::Sf.log(number_of_documents / (number_of_documents_with_term(row_index, matrix).to_f + 1))
+ term_frequency = Math.sqrt(term_weight)
matrix[row_index, column_index] = term_frequency * inverse_document_frequency
end
end
end