document.rb in lda-ruby-0.3.8

- old
+ new

@@ -1,5 +1,6 @@
+# coding: utf-8
 require 'yaml'
 
 module Lda
   class Document
     attr_reader :corpus, :words, :counts, :length, :total, :tokens
@@ -29,10 +30,12 @@
     def handle(tokens)  # returns tokens unchanged; tokenize passes its split word list through here (presumably an override point for subclasses -- confirm)
       tokens
     end
 
     def tokenize(text)
-      clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase        # remove everything but letters and ' and leave only single spaces
+      # now respects Umlaute
+      clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase  # remove everything but letters and ' and leave only single spaces
+      # clean_text = text.gsub(/[^A-Za-z'\s]+/, ' ').gsub(/\s+/, ' ').downcase        # remove everything but letters and ' and leave only single spaces
       @tokens = handle(clean_text.split(' '))
       nil
     end
   end
 end