Sha256: 132c268c08dac1a57bc02a0df6adcf53247a310134997484560ec09ccc3b94b3

Contents?: true

Size: 697 Bytes

Versions: 2

Compression:

Stored size: 697 Bytes

Contents

module Lda
  # A Document whose word/count data is derived from a raw text string.
  #
  # Construction tokenizes the text (via +tokenize+, defined outside this
  # file) and then turns the resulting tokens into the parallel +@words+ /
  # +@counts+ arrays through #build_from_tokens.
  class TextDocument < Document
    # Path of the file this document was loaded from, or +nil+ when the
    # document was created directly from a string with +new+.
    attr_reader :filename

    # Builds a document from an in-memory string.
    #
    # corpus:: the corpus this document belongs to; forwarded to the
    #          superclass initializer.
    # text::   the raw text to tokenize.
    def initialize(corpus, text)
      super(corpus)
      @filename = nil

      tokenize(text)
      build_from_tokens
    end

    # Text documents always report that they carry text.
    def has_text?
      true
    end

    # Reads +filename+ from disk and builds a document from its contents.
    #
    # corpus::   the corpus the new document belongs to.
    # filename:: path of the file to read.
    #
    # Returns the new document, whose #filename is a frozen copy of
    # +filename+. Errors raised while reading the file propagate to the
    # caller.
    def self.build_from_file(corpus, filename)
      path = filename.dup.freeze
      document = new(corpus, File.read(path))

      # The path has to be stored on the document itself. Assigning
      # @filename directly in this class method would set a variable on the
      # TextDocument class object and leave the document's #filename nil.
      document.instance_variable_set(:@filename, path)
      document
    end

    protected

    # Collapses +@tokens+ into per-word occurrence counts and appends one
    # entry per distinct token to +@words+ and +@counts+, then calls
    # +recompute+.
    #
    # NOTE(review): +@tokens+, +@words+, +@counts+ and +recompute+ are not
    # defined in this file; presumably they come from Document / tokenize --
    # confirm against the superclass.
    def build_from_tokens
      vocab = Hash.new(0)
      @tokens.each { |token| vocab[token] += 1 }

      vocab.each_pair do |word, count|
        # Stores whatever the corpus vocabulary returns for the word
        # (presumably its index -- TODO confirm against Vocabulary#check_word).
        @words << @corpus.vocabulary.check_word(word)
        @counts << count
      end

      recompute
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
lda-ruby-0.3.4 lib/lda-ruby/document/text_document.rb
lda-ruby-0.3.1 lib/lda-ruby/document/text_document.rb