Sha256: 1b3fd1caf8c0c4a8013377cf3cc7312b813240308fb22ed1dcf12f9044aa5ae6

Contents?: true

Size: 756 Bytes

Versions: 3

Compression:

Stored size: 756 Bytes

Contents

module Lda
  # A Document built from raw text. On construction the text is passed to
  # +tokenize+, every corpus stopword is removed from the resulting tokens,
  # and the remaining tokens are folded into parallel word-index / count
  # lists.
  class TextDocument < Document
    # Path of the file this document was read from, or nil when the document
    # was built directly from a string.
    attr_reader :filename

    # corpus   - the owning corpus; this class calls +stopwords+ and
    #            +vocabulary+ on it.
    # text     - raw text handed to +tokenize+.
    # filename - optional source path to expose through #filename
    #            (defaults to nil, preserving the two-argument call form).
    def initialize(corpus, text, filename = nil)
      super(corpus)
      @filename = filename

      # NOTE(review): +tokenize+ is defined outside this file; it is assumed
      # to populate @tokens -- confirm against Document.
      tokenize(text)
      @corpus.stopwords.each { |w| @tokens.delete(w) }
      build_from_tokens
    end

    # Always true for this class.
    def has_text?
      true
    end

    # Reads +filename+ and builds a document from its contents.
    #
    # The path is passed into the constructor so that it lands on the new
    # instance. Assigning @filename here would set an instance variable on
    # the class object itself (self is the class inside a class method),
    # leaving every document's #filename nil.
    #
    # Raises whatever File.read raises when the file cannot be read.
    def self.build_from_file(corpus, filename)
      path = filename.dup.freeze
      text = File.read(path)
      new(corpus, text, path)
    end

    protected

    # Counts occurrences of each token, then appends one entry per distinct
    # token to @words (vocabulary index) and @counts (occurrence count), and
    # finally calls +recompute+.
    def build_from_tokens
      vocab = Hash.new(0)
      @tokens.each { |t| vocab[t] += 1 }

      vocab.each_pair do |word, count|
        # NOTE(review): the "- 1" suggests check_word returns a 1-based
        # index that is stored 0-based here -- confirm against Vocabulary.
        @words << @corpus.vocabulary.check_word(word) - 1
        @counts << count
      end

      recompute
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
lda-ruby-0.3.9 lib/lda-ruby/document/text_document.rb
lda-ruby-0.3.8 lib/lda-ruby/document/text_document.rb
lda-ruby-0.3.7 lib/lda-ruby/document/text_document.rb