require 'rbbt/annotations'
require 'rbbt/text/segment'

# Mixin that tags a string-like object with its position inside a larger text.
#
# Objects are extended in place through Token.setup; Token.tokenize builds an
# array of such objects out of a text.
module Token
  # offset:   position of the token's first character in the source text.
  # original: optional value handed to Token.setup (nil when omitted).
  attr_accessor :offset, :original

  # Names of the attributes this module adds to an extended object.
  def self.all_annotations
    [:offset, :original]
  end

  # Extends +text+ in place with Token and records its position.
  #
  # text     - object to extend; it is mutated, so it must not be frozen.
  # start    - value stored in #offset.
  # original - value stored in #original (defaults to nil).
  #
  # Returns +text+ itself.
  def self.setup(text, start, original = nil)
    text.extend Token
    text.offset = start
    text.original = original
    text
  end

  # Returns a Hash holding the token's :original and :offset values.
  def info
    { :original => original, :offset => offset }
  end

  # Identifier derived from #info plus the token itself.
  #
  # NOTE(review): relies on Misc.hash2md5, which is not defined in this file;
  # presumably it is provided by the rbbt requires above - confirm.
  def id
    Misc.hash2md5(info.merge(:self => self))
  end

  # Position of the token's last character (inclusive): offset + length - 1.
  def end
    offset + length - 1
  end

  # Inclusive Range running from #offset to #end.
  def range
    (offset..self.end)
  end

  # Splits +text+ into tokens, each annotated with its offset.
  #
  # text     - text to tokenize. It is never modified.
  # split_at - pattern marking token boundaries. Text matched by the pattern
  #            is dropped, except for the part matched by its first capture
  #            group, which is emitted as a token of its own (with the
  #            default pattern: whitespace is dropped, punctuation is kept).
  # start    - offset assigned to the first character of +text+.
  #
  # Returns an Array of strings extended with Token, in order of appearance.
  # Raises ArgumentError if +split_at+ matches an empty string at the very
  # beginning of the remaining text, since no progress could be made.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []
    remaining = text

    while (matchdata = remaining.match(split_at))
      # A zero-width match at position 0 consumes nothing; without this
      # guard the loop would spin forever on the same text.
      if matchdata.end(0).zero?
        raise ArgumentError,
              "split_at matched an empty string at offset #{start}: #{split_at.inspect}"
      end

      before = matchdata.pre_match
      tokens << Token.setup(before, start) unless before.empty?

      # The first capture is nil when the match did not go through that
      # group (e.g. plain whitespace with the default pattern).
      delimiter = matchdata.captures.first
      if delimiter && !delimiter.empty?
        tokens << Token.setup(delimiter, start + matchdata.begin(1))
      end

      start += matchdata.end(0)
      remaining = matchdata.post_match
    end

    # When nothing matched at all, +remaining+ is still the caller's object.
    # Copy it so Token.setup neither overwrites annotations on the input nor
    # fails on a frozen string.
    tokens << Token.setup(remaining.dup, start) unless remaining.empty?
    tokens
  end
end