module Boilerpipe
  class UnicodeTokenizer
    INVISIBLE_SEPARATOR = "\u2063"
    WORD_BOUNDARY = Regexp.new('\b')
    NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)\/])[\u2063]*")

    # replace word boundaries with 'invisible separator'
    # strip invisible separators from non-word boundaries
    # replace spaces or invisible separators with a single space
    # trim
    # split words on single space

    def self.tokenize(text)
      text.gsub(WORD_BOUNDARY, INVISIBLE_SEPARATOR)
        .gsub(NOT_WORD_BOUNDARY, '\1')
        .gsub(/[ \u2063]+/, ' ')
        .strip
        .split(/[ ]+/)
    end
  end
end