Sha256: 0a824c4251d6c1387fbe3abcdb5a89bb6f43d072bd678d87ccc206faa59fc438
Contents?: true
Size: 659 Bytes
Versions: 16
Compression:
Stored size: 659 Bytes
Contents
require 'rbbt/segment'

# Text tokens that carry the offset at which they were found.
#
# NOTE(review): Entity, Segment, `annotation` and `Token.setup` are defined
# outside this file (presumably by the rbbt gem loaded above); their exact
# semantics cannot be confirmed from here.
module Token
  extend Entity
  include Segment

  self.annotation :original

  # Splits +text+ into tokens, recording the offset of each one.
  #
  # The text is consumed left to right. At every match of +split_at+:
  # * the non-empty text before the match becomes a token;
  # * the first capture group, when it matched something non-empty, becomes
  #   a token of its own (with the default pattern: parentheses and the
  #   punctuation characters <tt>- . " ' : ,</tt>);
  # * matched text outside the first capture group (with the default
  #   pattern: whitespace) is dropped.
  # Whatever remains after the last match becomes the final token.
  #
  # text::     String to tokenize.
  # split_at:: Pattern marking token boundaries; only its first capture
  #            group is ever emitted as a token.
  # start::    Offset assigned to the first character of +text+.
  #
  # Returns an Array of the values returned by Token.setup, each created with
  # an +:offset+ option equal to +start+ plus the token's position in +text+.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    while (matchdata = text.match(split_at))
      # A zero-width match at the very start of the remaining text consumes
      # nothing, so looping on it would never terminate. Look for the next
      # match one character further on instead; if there is none, the rest
      # of the text is emitted as the final token below.
      matchdata = text.match(split_at, 1) if matchdata.end(0).zero?
      break unless matchdata

      preceding = matchdata.pre_match
      tokens << Token.setup(preceding, :offset => start) unless preceding.empty?

      # The first group is nil when the match came from a branch of the
      # pattern outside that group (or the pattern has no groups at all).
      separator = matchdata.captures.first
      if separator && !separator.empty?
        tokens << Token.setup(separator, :offset => start + matchdata.begin(1))
      end

      # Continue with the text after the match, keeping +start+ aligned with
      # the position of that remainder in the original text.
      start += matchdata.end(0)
      text = matchdata.post_match
    end

    tokens << Token.setup(text, :offset => start) unless text.empty?
    tokens
  end
end
Version data entries
16 entries across 16 versions & 1 rubygems