lib/boilerpipe/util/unicode_tokenizer.rb in boilerpipe-ruby-0.0.1 vs lib/boilerpipe/util/unicode_tokenizer.rb in boilerpipe-ruby-0.1.0

- old
+ new

@@ -1,11 +1,11 @@
 module Boilerpipe
   class UnicodeTokenizer
     INVISIBLE_SEPARATOR = "\u2063"
     WORD_BOUNDARY = Regexp.new('\b')
-    NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*")
+    NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)\/])[\u2063]*")
 
-    # replace word boundaries with 'invisible separator'
+    # replace word boundaries with 'invisible separator'
     # strip invisible separators from non-word boundaries
     # replace spaces or invisible separators with a single space
     # trim
     # split words on single space