Sha256: 155bc2fd847f9597a4c02efd35247c888abcc4ad80a7edaa570a663e4c1b5ca2
Contents?: true
Size: 981 Bytes
Versions: 1
Compression:
Stored size: 981 Bytes
Contents
# coding: utf-8

require 'yaml'

module Lda
  # Base document type: holds parallel +words+ / +counts+ arrays, the raw
  # +tokens+ produced by #tokenize, and two derived figures (+length+ and
  # +total+) that are refreshed by #recompute.
  class Document
    # Matches every run of characters that is NOT an ASCII letter, a German
    # umlaut / sharp s, an apostrophe or a hyphen (case-insensitive).
    NON_WORD_CHARS = /[^a-zäöüß'-]+/i

    attr_reader :corpus, :words, :counts, :length, :total, :tokens

    # @param corpus [Object] the corpus this document is attached to; it is
    #   stored as-is and not inspected here.
    def initialize(corpus)
      @corpus = corpus
      @words  = []
      @counts = []
      @tokens = []
      @length = 0
      @total  = 0
    end

    #
    # Recompute the total and length values.
    #
    # +total+ becomes the sum of all entries in +counts+; +length+ becomes the
    # number of entries in +words+.
    #
    def recompute
      @total  = @counts.inject(0, :+)
      @length = @words.size
    end

    # Always false in this base class.
    # NOTE(review): presumably overridden by text-backed subclasses — confirm.
    #
    # @return [Boolean]
    def has_text?
      false
    end

    # Post-processing hook applied to the token list inside #tokenize.
    # This implementation returns its argument unchanged.
    #
    # @param tokens [Array<String>]
    # @return [Array<String>]
    def handle(tokens)
      tokens
    end

    # Splits +text+ into lowercase tokens and stores the result in +tokens+.
    #
    # Every run of characters outside NON_WORD_CHARS' allowed set (letters,
    # umlauts, ß, apostrophe, hyphen) is replaced by a single space, the text
    # is lowercased and then split on spaces. Because whitespace is itself
    # outside the allowed set, each such run already collapses to one space,
    # so no separate whitespace squeeze is needed.
    #
    # NOTE: apostrophes and hyphens are kept, so a free-standing "-" or "'"
    # in the input survives as its own token.
    #
    # @param text [String, nil] raw text; +nil+ is treated as empty input
    #   instead of raising NoMethodError.
    # @return [nil]
    def tokenize(text)
      clean_text = text.to_s.gsub(NON_WORD_CHARS, ' ').downcase
      @tokens = handle(clean_text.split(' '))
      nil
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
lda-ruby-0.3.8 | lib/lda-ruby/document/document.rb |