lib/lda-ruby/document/document.rb in lda-ruby-0.3.7 vs lib/lda-ruby/document/document.rb in lda-ruby-0.3.8

- old
+ new

# coding: utf-8
require 'yaml'

module Lda
  class Document
    attr_reader :corpus, :words, :counts, :length, :total, :tokens

    # NOTE: the part of the class body that lies between the two diff hunks is
    # not visible in this view and is intentionally not reproduced here.

    # Receives the freshly split token list and returns it unchanged.
    #
    # @param tokens [Array<String>] tokens produced by #tokenize
    # @return [Array<String>] the same list
    def handle(tokens)
      tokens
    end

    # Splits +text+ into lowercase word tokens and stores them in @tokens.
    #
    # A token may contain ASCII letters, the German letters ä, ö, ü and ß,
    # apostrophes and hyphens. Every run of any other character (digits,
    # punctuation, whitespace, ...) acts as a single separator.
    #
    # @param text [String] raw document text
    # @return [nil] the result is exposed through the +tokens+ reader
    def tokenize(text)
      # One pass is enough: whitespace is not in the allowed set, so each
      # separator run (including repeated blanks) collapses to one space.
      clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ')

      # String#downcase only maps ASCII letters on Ruby < 2.4, so uppercase
      # Umlaute are lowered explicitly; on newer Rubies the tr is a no-op.
      clean_text = clean_text.downcase.tr('ÄÖÜ', 'äöü')

      @tokens = handle(clean_text.split(' '))
      nil
    end
  end
end