Sha256: d9d6a2cbe42ba464e3ca3d78d212eca364c9d14ca017f67a89e6932d397a16c4

Contents?: true

Size: 864 Bytes

Versions: 2

Compression:

Stored size: 864 Bytes

Contents

require 'set'
require 'yaml'

module Lda
  # An in-memory collection of Document objects plus the aggregate
  # bookkeeping derived from them: the number of documents, the number of
  # distinct terms seen so far, and a shared Vocabulary.
  class Corpus
    attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords

    # Creates an empty corpus and loads the stopword list from
    # ../config/stopwords.yml (relative to this file), stripping surrounding
    # whitespace from every entry.
    def initialize
      @documents = []
      @all_terms = Set.new
      @num_terms = @num_docs = 0
      @vocabulary = Vocabulary.new
      @stopwords = YAML.load_file(File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml'))
      @stopwords.map!(&:strip)
    end

    # Appends +doc+ to the corpus and refreshes the document/term counters
    # and the vocabulary.
    #
    # doc - a Document; it must respond to +words+ and +tokens+.
    #
    # Raises RuntimeError when +doc+ is not a Document.
    # Always returns nil.
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.is_a?(Document)

      @documents << doc

      # Merge in place: `@all_terms += doc.words` would copy the whole
      # accumulated set on every call, making corpus construction quadratic.
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)
      nil
    end

    # Deletes +word+ from the vocabulary's word collection and returns
    # whatever that collection's +delete+ returns.
    #
    # NOTE(review): only @vocabulary.words is modified here; the stored
    # documents and the num_terms counter are left untouched.
    def remove_word(word)
      @vocabulary.words.delete(word)
    end

    protected

    # Passes every token of +doc+ to the vocabulary's +check_word+.
    def update_vocabulary(doc)
      doc.tokens.each { |token| @vocabulary.check_word(token) }
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
lda-ruby-0.3.7 lib/lda-ruby/corpus/corpus.rb
lda-ruby-0.3.6 lib/lda-ruby/corpus/corpus.rb