Sha256: 5cff4e64dcfe204e0165fde68e06f29f455bd59ef8801decd95048ca8b8f4861

Contents?: true

Size: 994 Bytes

Versions: 2

Compression:

Stored size: 994 Bytes

Contents

require 'set'

module Lda
  # An in-memory collection of Document objects together with the running
  # statistics derived from them: how many documents were added, how many
  # distinct words they contain, and the Vocabulary built from their tokens.
  #
  # NOTE(review): this class calls YAML.load_file, but the only library this
  # file requires is 'set' -- presumably 'yaml' is loaded by whoever requires
  # this file; confirm.
  class Corpus
    # Stop word file loaded when no explicit path is handed to #initialize.
    DEFAULT_STOPWORDS_PATH =
      File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml').freeze

    attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords

    # Creates an empty corpus and loads its stop word list.
    #
    # @param stop_word_list [String, nil] path to a YAML file holding the stop
    #   words; when nil, DEFAULT_STOPWORDS_PATH is used. Every loaded entry has
    #   +strip+ called on it, so the file is expected to yield an array of
    #   strings.
    def initialize(stop_word_list = nil)
      @documents = []
      @all_terms = Set.new
      @num_terms = 0
      @num_docs = 0
      @vocabulary = Vocabulary.new

      stopwords_path = stop_word_list.nil? ? DEFAULT_STOPWORDS_PATH : stop_word_list
      @stopwords = YAML.load_file(stopwords_path)
      @stopwords.map!(&:strip)
    end

    # Appends +doc+ to the corpus and refreshes the document count, the
    # distinct-term count and the vocabulary.
    #
    # @param doc [Document] the document to add; its +words+ feed the
    #   distinct-term count and its +tokens+ feed the vocabulary.
    # @return [nil]
    # @raise [RuntimeError] if +doc+ is not a Document.
    def add_document(doc)
      raise 'Parameter +doc+ must be of type Document' unless doc.kind_of?(Document)

      @documents << doc

      # Union in place: `@all_terms += ...` would copy the entire accumulated
      # set into a brand-new Set for every document added.
      @all_terms.merge(doc.words)
      @num_docs += 1
      @num_terms = @all_terms.size

      update_vocabulary(doc)
      nil
    end

    # Deletes +word+ from the vocabulary's word collection. The documents and
    # the distinct-term count (+num_terms+) are left untouched.
    #
    # @param word [Object] the entry to delete from +vocabulary.words+.
    # @return whatever +delete+ returns for the vocabulary's word collection.
    def remove_word(word)
      @vocabulary.words.delete(word)
    end

    protected

    # Passes every token of +doc+ to the vocabulary's +check_word+.
    def update_vocabulary(doc)
      doc.tokens.each { |token| @vocabulary.check_word(token) }
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygem

Version Path
lda-ruby-0.3.9 lib/lda-ruby/corpus/corpus.rb
lda-ruby-0.3.8 lib/lda-ruby/corpus/corpus.rb