lib/semantic/transform/lsa_transform.rb in rsemantic-0.1.3 vs lib/semantic/transform/lsa_transform.rb in rsemantic-0.1.4
- old
+ new
@@ -2,41 +2,66 @@
module Transform
class LSA
class << self
- def transform(matrix, number_of_dimensions_to_reduce = 1) # old API: returned the reduced matrix (matrix_prime) or raised
- columns = matrix.num_columns
+ def transform!(matrix, rank = nil) # new API: rank-reduces `matrix` via SVD; rank = nil lets determine_rank choose
+ # TODO configurable rank -- NOTE(review): `rank` is already a parameter; unclear what is still open, confirm
+ columns = matrix.size2 # NOTE(review): `columns` is not referenced again in the added lines of this method
- if number_of_dimensions_to_reduce <= columns # It's a valid reduction
- u, sigma, vt = matrix.singular_value_decomposition
+ u, v, sigma = matrix.SV_decomp_mod # decomposition into u, v and sigma; sigma is handled as a vector below
+ reduce_dimensions!(sigma, rank) # zeroes the unwanted trailing entries of sigma in place
+ sigma = GSL::Matrix.diagonal(sigma) # turn the truncated vector back into a diagonal matrix
- sigma_prime = reduce_dimensions(number_of_dimensions_to_reduce, sigma)
+ GSL::Matrix.swap(matrix, u * sigma * v.transpose) # recombine and swap with `matrix`; presumably overwrites it in place -- confirm against rb-gsl
+ end
- matrix_prime = u * sigma_prime * vt
+ private
+ def reduce_dimensions!(vector, rank) # zeroes every entry of `vector` from index `rank` onwards
+ # the vector is assumed to be sorted already (biggest to smallest), so we
+ # only have to zero the elements we do not want
+
+ if rank.nil?
+ rank = determine_rank(vector) # no rank given: derive one from the vector size
else
- raise Exception, "dimension reduction cannot be greater than %s" % columns
+ rank = valid_rank(vector, rank) # caller-supplied rank: map non-positive and oversized values
end
-
- matrix_prime
+
+ num_to_zero_out = vector.size - rank
+ vector[rank, num_to_zero_out] = 0 # presumably (start, length) indexing; NOTE(review): length is 0 when rank == vector.size -- confirm GSL accepts that
end
-
- private
- def reduce_dimensions(number_of_dimensions_to_reduce, matrix) # old helper: zeroed the selected diagonal entries and returned the matrix
- for diagonal_index in dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
- matrix[diagonal_index, diagonal_index] = 0
+
+ def determine_rank(vector) # heuristic default rank based on vector.size; the chosen branch's value is the return value
+ if vector.size <= 15
+ # for 15 or fewer entries (small document sets), n-1 is usually
+ # the best we can do. LSA generally works better with bigger data
+ # sets.
+ rank = vector.size - 1 # NOTE(review): yields 0 for a single entry and -1 for an empty vector
+ elsif vector.size <= 1000
+ # ~500 is a value known to work well for really big data sets,
+ # but for up to 1000 entries it is probably too big, so we
+ # go for n/3 in this case.
+ rank = vector.size / 3
+ else
+ # if we have more than 1000 entries, using the magic
+ # number 500 (which can be found in various documents)
+ # seems to be the best guess for now.
+ rank = 500
end
- matrix
end
- def dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce) # old helper: exclusive range covering the last N diagonal indices
- (diagonal_matrix_length(matrix) - number_of_dimensions_to_reduce)...diagonal_matrix_length(matrix)
+ def valid_rank(vector, rank) # maps a caller-supplied rank onto a value derived from vector.size
+ if rank <= 0
+ # for zero or negative ranks, count back from the full size, i.e. drop |rank| dimensions (rank 0 keeps all of them)
+ rank = vector.size + rank # NOTE(review): still negative when -rank > vector.size; not clamped here
+ elsif rank > vector.size
+ # a rank above the vector size is capped at the vector size
+ rank = vector.size
+ else
+ rank # already within 1..vector.size, used as given
+ end
end
- def diagonal_matrix_length(matrix) # old helper: the smaller of the matrix's column and row counts
- matrix.num_columns < matrix.num_rows ? matrix.num_columns : matrix.num_rows
- end
-
end
end
end
-end
\ No newline at end of file
+end