lib/boilerpipe/util/unicode_tokenizer.rb in boilerpipe-ruby-0.0.1 vs lib/boilerpipe/util/unicode_tokenizer.rb in boilerpipe-ruby-0.1.0
- old
+ new
@@ -1,11 +1,11 @@
module Boilerpipe
class UnicodeTokenizer
INVISIBLE_SEPARATOR = "\u2063"
WORD_BOUNDARY = Regexp.new('\b')
- NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*")
+ NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)\/])[\u2063]*")
- # replace word boundaries with 'invisible separator'
+ # replace word boundaries with 'invisible separator'
# strip invisible separators from non-word boundaries
# replace spaces or invisible separators with a single space
# trim
# split words on single space