Sha256: 6824ceb8adc38124586fc42f9ed74f8a2ee6cd064cbec8b6ba29232e09f0ad90

Contents?: true

Size: 701 Bytes

Versions: 1

Compression:

Stored size: 701 Bytes

Contents

module Lda
  # A Document whose word/count arrays are derived from a raw text string.
  #
  # NOTE(review): +tokenize+, +recompute+ and the @tokens, @words, @counts and
  # @corpus state are not defined in this class; they are presumably provided
  # by Document -- confirm against the superclass.
  class TextDocument < Document
    # Path the text was read from when the document was created through
    # TextDocument.build_from_file; nil when built directly from a string.
    attr_reader :filename

    # Builds a document from an in-memory string.
    #
    # corpus:: forwarded to Document#initialize
    # text::   raw text handed to +tokenize+
    def initialize(corpus, text)
      super(corpus)
      @filename = nil

      tokenize(text)
      build_from_tokens
    end

    # Always true for this class.
    def has_text?
      true
    end

    # Reads +filename+ and builds a document from its contents.
    #
    # corpus::   forwarded to TextDocument.new
    # filename:: path of the file to read
    #
    # Returns the new document, whose +filename+ is a frozen copy of the
    # given path. Errors raised by File.read (e.g. a missing file) propagate
    # to the caller.
    def self.build_from_file(corpus, filename)
      path = filename.dup.freeze
      text = File.read(path)

      document = new(corpus, text)
      # Inside a class method, a bare @filename would belong to the class
      # object rather than the document, and #initialize resets the instance
      # value to nil -- so the path is assigned on the instance afterwards.
      document.instance_variable_set(:@filename, path)
      document
    end

    protected

    # Counts the occurrences of each distinct token in @tokens, then appends
    # one entry per distinct token to @words and @counts before calling
    # +recompute+.
    def build_from_tokens
      vocab = Hash.new(0)
      @tokens.each { |token| vocab[token] += 1 }

      vocab.each_pair do |word, count|
        # NOTE(review): the "- 1" suggests check_word returns a 1-based index
        # that is stored 0-based here -- confirm against the vocabulary class.
        @words << (@corpus.vocabulary.check_word(word) - 1)
        @counts << count
      end

      recompute
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
lda-ruby-0.3.5 lib/lda-ruby/document/text_document.rb