Sha256: 5cff4e64dcfe204e0165fde68e06f29f455bd59ef8801decd95048ca8b8f4861
Contents?: true
Size: 994 Bytes
Versions: 2
Compression:
Stored size: 994 Bytes
Contents
require 'set'
require 'yaml'

module Lda
  # A collection of Document objects plus the bookkeeping derived from them:
  # a running document count, the number of distinct entries seen across all
  # documents' +words+, a Vocabulary fed from the documents' +tokens+, and a
  # stop word list loaded from a YAML file.
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords

    # Builds an empty corpus and loads the stop word list.
    #
    # stop_word_list - path to a YAML file holding the stop words. When nil,
    #                  config/stopwords.yml relative to this file's parent
    #                  directory is used.
    #
    # The loaded entries are stripped of surrounding whitespace in place, so
    # the YAML document is expected to be a list of strings.
    #
    # NOTE(review): the file is read with YAML.load_file; only point this at
    # trusted files.
    def initialize(stop_word_list = nil)
      @documents = []
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new

      stopword_path =
        if stop_word_list.nil?
          File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml')
        else
          stop_word_list
        end

      @stopwords = YAML.load_file(stopword_path)
      @stopwords.map!(&:strip)
    end

    # Appends +doc+ to the corpus and refreshes the derived counters and the
    # vocabulary.
    #
    # doc - a Document; anything else is rejected.
    #
    # Returns nil.
    # Raises RuntimeError when +doc+ is not a Document.
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)

      @documents << doc
      # Merge in place rather than `+=`, which would copy the whole set on
      # every call.
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size
      update_vocabulary(doc)
      nil
    end

    # Deletes +word+ from the vocabulary's word collection and returns
    # whatever that collection's +delete+ returns.
    def remove_word(word)
      @vocabulary.words.delete word
    end

    protected

    # Hands every token of +doc+ to the vocabulary's +check_word+.
    def update_vocabulary(doc)
      doc.tokens.each { |token| @vocabulary.check_word(token) }
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
lda-ruby-0.3.9 | lib/lda-ruby/corpus/corpus.rb |
lda-ruby-0.3.8 | lib/lda-ruby/corpus/corpus.rb |