lib/semantic/vector_space/builder.rb in rsemantic-0.1.3 vs lib/semantic/vector_space/builder.rb in rsemantic-0.1.4
- old
+ new
@@ -1,24 +1,28 @@
module Semantic
module VectorSpace
- #A algebraic model for representing text documents as vectors of identifiers.
- #A document is represented as a vector. Each dimension of the vector corresponds to a
- #separate term. If a term occurs in the document, then the value in the vector is non-zero.
+ # An algebraic model for representing text documents as vectors of identifiers.
+ # A document is represented as a vector. Each dimension of the vector corresponds to a
+ # separate term. If a term occurs in the document, then the value in the vector is non-zero.
class Builder
# Sets up the builder with a parser and an empty cache of parsed documents.
# Old version (lines marked "-"): created the parser without arguments and kept the
# options hash in @options.
# New version (lines marked "+"): passes options[:filter_stop_words] on to Parser.new
# and no longer stores the options hash.
- def initialize(options={})
- @parser = Parser.new
- @options = options
+ def initialize(options = {})
+ @parser = Parser.new(:filter_stop_words => options[:filter_stop_words])
@parsed_document_cache = []
end
# Builds the keyword index for +documents+, turns every document into a vector via
# build_vector, combines those vectors into one matrix and returns a Model wrapping the
# matrix together with the keyword index.
# Lines marked "-" are the old Linalg-based code; lines marked "+" are the new GSL-based code.
def build_document_matrix(documents)
# Must run first: it fills @parsed_document_cache, which build_vector reads by document_id.
@vector_keyword_index = build_vector_keyword_index(documents)
-
+
document_vectors = documents.enum_for(:each_with_index).map{|document,document_id| build_vector(document, document_id)}
- document_matrix = Linalg::DMatrix.join_columns(document_vectors)
-
+
# NOTE(review): n and m are assigned but not read again in this method. Also,
# document_vectors.first is nil when documents is empty, so .size would raise there --
# confirm whether an empty document list can reach this point.
+ n = document_vectors.size
+ m = document_vectors.first.size
+
+ # TODO check where else we use document_vectors and if we can directly use column based ones
# NOTE(review): presumably each transposed vector becomes one row of the matrix, whereas the
# old code joined the vectors as columns -- confirm against the GSL::Matrix.alloc documentation.
+ document_matrix = GSL::Matrix.alloc(*document_vectors.map {|v| v.transpose})
+
Model.new(document_matrix, @vector_keyword_index)
end
def build_query_vector(term_list)
build_vector(term_list.join(" "))
@@ -26,44 +30,47 @@
private
# Parses and caches +documents+, collects the unique vocabulary from that cache and
# returns the word => vector position mapping built from it.
def build_vector_keyword_index(documents)
# Fills @parsed_document_cache; find_unique_vocabulary reads from that cache.
parse_and_cache(documents)
vocabulary_list = find_unique_vocabulary
# NOTE(review): the "-" and "+" lines below show the same call; the change looks
# whitespace-only -- confirm against the raw diff.
- map_vocabulary_to_vector_positions(vocabulary_list)
+ map_vocabulary_to_vector_positions(vocabulary_list)
end
# Runs every document through the parser's tokenise_and_filter and stores the result in
# @parsed_document_cache at the document's position within +documents+.
def parse_and_cache(documents)
  documents.each_with_index do |text, position|
    tokens = @parser.tokenise_and_filter(text)
    @parsed_document_cache[position] = tokens
  end
end
# Returns the distinct tokens found across all cached parsed documents.
# Old version (lines marked "-"): concatenated the cached token lists with inject and
# then removed duplicates with uniq.
# New version (line marked "+"): flattens the cache, reverses the token order and then
# calls uniq, which keeps the first occurrence of each token in that reversed order.
def find_unique_vocabulary
- vocabulary_list = @parsed_document_cache.inject([]) { |parsed_document, vocabulary_list| vocabulary_list + parsed_document }
- vocabulary_list.uniq
+ @parsed_document_cache.flatten.reverse.uniq
end
-
+
# Assigns each word in vocabulary_list a vector position: the first word gets 0 and every
# following word the next integer. Returns the word => position hash.
# If a word occurs more than once in the list, its later position overwrites the earlier one.
def map_vocabulary_to_vector_positions(vocabulary_list)
vector_index={}
column = 0
- vocabulary_list.each do |word|
+ vocabulary_list.each do |word|
vector_index[word] = column
column += 1
end
vector_index
- end
-
# The "-" line below is the old signature of the next method, build_vector; it shows up
# here only because the diff interleaves removed and added lines.
- def build_vector(word_string, document_id=nil)
+ end
+
# Builds a term-count vector with one slot per entry of @vector_keyword_index.
# Without a document_id, word_string is tokenised and filtered by the parser; with a
# document_id, the already parsed tokens are taken from @parsed_document_cache and
# word_string itself is not read.
# Lines marked "-" are the old Linalg-based code; lines marked "+" are the new GSL-based code.
+ def build_vector(word_string, document_id = nil)
if document_id.nil?
word_list = @parser.tokenise_and_filter(word_string)
else
word_list = @parsed_document_cache[document_id]
end
-
- vector = Linalg::DMatrix.new(@vector_keyword_index.length, 1)
- word_list.each { |word| vector[@vector_keyword_index[word] , 0] += 1 if @vector_keyword_index.has_key?(word) }
+
# NOTE(review): the += 1 counting below relies on the newly allocated vector starting at
# zero -- confirm for GSL::Vector.alloc.
+ vector = GSL::Vector.alloc(@vector_keyword_index.length)
# Each occurrence of an indexed word adds 1 to that word's slot; words that are not in
# the keyword index are skipped.
+ word_list.each { |word|
+ if @vector_keyword_index.has_key?(word)
+ vector[@vector_keyword_index[word]] += 1
+ end
+ }
+
vector
end
-
end
end
-end
\ No newline at end of file
+end