lib/lda-ruby/document/document.rb in lda-ruby-0.3.7 vs lib/lda-ruby/document/document.rb in lda-ruby-0.3.8
- old
+ new
@@ -1,5 +1,6 @@
+# coding: utf-8
require 'yaml'
module Lda
class Document
attr_reader :corpus, :words, :counts, :length, :total, :tokens
@@ -29,10 +30,12 @@
# Pass-through step applied to the word list produced by #tokenize.
# Returns the given tokens unchanged.
# NOTE(review): looks like an extension point for subclasses to filter or
# transform tokens -- confirm against callers.
def handle(tokens)
  tokens
end
# Splits +text+ into lowercase word tokens and stores them in @tokens.
#
# Every run of characters other than ASCII letters, the German letters
# ä/ö/ü/ß (matched case-insensitively), apostrophes and hyphens is treated
# as a separator. The resulting words are passed through #handle before
# being assigned to @tokens.
#
# text - the String to tokenize.
#
# Returns nil; the tokens are stored in @tokens.
def tokenize(text)
  # The negated character class contains no whitespace, so whitespace and
  # punctuation runs collapse into a single space in one pass.
  clean_text = text.gsub(/[^a-zäöüß'\-]+/i, ' ').downcase
  # String#downcase only folds ASCII before Ruby 2.4; fold the umlauts
  # explicitly so "Ä" and "ä" yield the same token on every Ruby version.
  clean_text = clean_text.tr('ÄÖÜ', 'äöü')
  @tokens = handle(clean_text.split(' '))
  nil
end
end
end