Sha256: f575e57c80b7eceb0177b2bceeb5b0f96b13ecabc2557e013ec7da91192a8de4
Contents?: true
Size: 649 Bytes
Versions: 10
Compression:
Stored size: 649 Bytes
Contents
module Boilerpipe
  # Breaks a string into an array of tokens.
  #
  # The approach is marker based: every regex word boundary is tagged with
  # the INVISIBLE_SEPARATOR character, tags that sit next to one of the
  # punctuation characters listed in NOT_WORD_BOUNDARY are removed again
  # (so that punctuation stays glued to the adjacent word characters), and
  # whatever tags remain are treated exactly like spaces.
  class UnicodeTokenizer
    # Marker character temporarily inserted at word boundaries.
    INVISIBLE_SEPARATOR = "\u2063"

    # Zero-width match at every transition between word and non-word characters.
    WORD_BOUNDARY = Regexp.new('\b')

    # One of the characters " ' . , ! @ - : ; $ ? ( ) / together with any
    # markers directly before or after it; group 1 captures the character alone.
    NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)\/])[\u2063]*")

    class << self
      # Tokenizes +text+.
      #
      # text - the String to split.
      #
      # Returns an Array of String tokens; an empty string yields an empty
      # Array. Only the plain space and the marker character act as token
      # delimiters, so other whitespace (e.g. a tab) survives as its own token.
      def tokenize(text)
        # 1. Tag every word boundary with the marker.
        marked = text.gsub(WORD_BOUNDARY, INVISIBLE_SEPARATOR)

        # 2. Drop the markers surrounding the listed punctuation, keeping
        #    only the punctuation character itself.
        rejoined = marked.gsub(NOT_WORD_BOUNDARY, '\1')

        # 3. Collapse each run of spaces and/or markers into one space.
        spaced = rejoined.gsub(/[ \u2063]+/, ' ')

        # 4. Trim the ends and cut on spaces.
        spaced.strip.split(/[ ]+/)
      end
    end
  end
end
Version data entries
10 entries across 10 versions & 1 rubygems