Sha256: 6866d2079f0997deef6cc9cbbb94f6f06d2be41462c36be33b19311e1f390fd0

Contents?: true

Size: 761 Bytes

Versions: 1

Compression:

Stored size: 761 Bytes

Contents

module Lda
  # A Document whose word counts are derived from a raw text string.
  #
  # Construction tokenizes the text, drops every token found in the corpus'
  # stopword list, and converts the remaining tokens into the parallel
  # @words / @counts arrays before calling +recompute+.
  class TextDocument < Document
    # Path of the file this document was read from, or nil when the document
    # was built directly from a string.
    attr_reader :filename

    # Builds a document from +text+.
    #
    # corpus - the owning corpus; must respond to +stopwords+ and +vocabulary+.
    # text   - the raw text to tokenize.
    def initialize(corpus, text)
      super(corpus)
      @filename = nil

      # NOTE(review): tokenize is not defined in this class; presumably
      # inherited from Document and responsible for populating @tokens.
      tokenize(text)
      @tokens.reject! { |w| @corpus.stopwords.include?(w) }
      build_from_tokens
    end

    # Always true for this class.
    def has_text?
      true
    end

    # Reads +filename+ and builds a TextDocument from its contents.
    #
    # corpus   - the owning corpus (forwarded to +new+).
    # filename - path of the text file to read.
    #
    # Returns the new document, whose +filename+ reader reports a frozen copy
    # of the given path.
    def self.build_from_file(corpus, filename)
      path = filename.dup.freeze
      document = new(corpus, File.read(path))
      # Inside a class method a bare `@filename = ...` would set an instance
      # variable on the class object itself, leaving the document's own
      # @filename at nil. Store the path on the instance that was just built.
      document.instance_variable_set(:@filename, path)
      document
    end

    protected

    # Counts the occurrences of each token in @tokens and appends one
    # (word index, count) pair per distinct token to @words / @counts,
    # then calls +recompute+.
    def build_from_tokens
      vocab = Hash.new(0)
      @tokens.each { |t| vocab[t] += 1 }

      vocab.each_pair do |word, count|
        # NOTE(review): the "- 1" suggests check_word returns a 1-based
        # index while @words stores 0-based ones -- confirm against the
        # vocabulary implementation.
        @words << @corpus.vocabulary.check_word(word) - 1
        @counts << count
      end

      recompute
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
lda-ruby-0.3.6 lib/lda-ruby/document/text_document.rb