Sha256: 0a824c4251d6c1387fbe3abcdb5a89bb6f43d072bd678d87ccc206faa59fc438

Contents?: true

Size: 659 Bytes

Versions: 16

Compression:

Stored size: 659 Bytes

Contents

require 'rbbt/segment'

# Regexp-driven tokenizer that yields text pieces tagged with their offsets.
#
# NOTE(review): +Entity+, +Segment+ and +Token.setup+ are defined outside this
# file; presumably +setup+ annotates the given string with the supplied
# options and returns it -- confirm against the rbbt sources.
module Token
  extend Entity
  include Segment

  # Declares an :original annotation (presumably through the Entity DSL --
  # TODO confirm what it is used for; nothing in this file reads it).
  self.annotation :original

  # Splits +text+ into tokens, recording where each token starts.
  #
  # The text is consumed from left to right. On every match of +split_at+ the
  # text preceding the match becomes a token (when non-empty), and the first
  # capture group becomes a token of its own (when it took part in the match
  # and is non-empty). Anything matched outside the first capture group is
  # dropped. With the default pattern that means whitespace is discarded,
  # while parentheses and the characters <tt>- . " ' : ,</tt> are kept as
  # single-character tokens. Whatever is left after the last match becomes the
  # final token.
  #
  # @param text [String] the text to tokenize
  # @param split_at [Regexp] separator pattern; only its first capture group
  #   is ever emitted as a token
  # @param start [Integer] offset of the first character of +text+, added to
  #   every reported token offset
  # @return [Array] the results of +Token.setup+, in order of appearance, each
  #   set up with an +:offset+ option holding its starting position
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    loop do
      matchdata = text.match(split_at)

      # A zero-width match at index 0 consumes nothing, so the remaining text
      # would never shrink and the scan would spin forever. Look for the next
      # match one character further on instead; MatchData positions are still
      # relative to the beginning of +text+, so the arithmetic below holds.
      matchdata = text.match(split_at, 1) if matchdata && matchdata.end(0).zero?
      break unless matchdata

      preceding = matchdata.pre_match
      tokens << Token.setup(preceding, :offset => start) unless preceding.empty?

      # The first group is nil when it did not participate in the match (e.g.
      # the whitespace branch of the default pattern) or when the pattern has
      # no groups at all; check it directly so that patterns with further
      # groups cannot trigger a call on nil.
      separator = matchdata.captures.first
      if separator && !separator.empty?
        tokens << Token.setup(separator, :offset => start + matchdata.begin(1))
      end

      # Continue with the text after the match, keeping +start+ aligned with
      # the position of that remainder.
      start += matchdata.end(0)
      text = matchdata.post_match
    end

    tokens << Token.setup(text, :offset => start) unless text.empty?

    tokens
  end
end

Version data entries

16 entries across 16 versions & 1 rubygem

Version Path
rbbt-text-1.5.2 lib/rbbt/segment/token.rb
rbbt-text-1.5.1 lib/rbbt/segment/token.rb
rbbt-text-1.5.0 lib/rbbt/segment/token.rb
rbbt-text-1.4.0 lib/rbbt/segment/token.rb
rbbt-text-1.3.11 lib/rbbt/segment/token.rb
rbbt-text-1.3.10 lib/rbbt/segment/token.rb
rbbt-text-1.3.9 lib/rbbt/segment/token.rb
rbbt-text-1.3.8 lib/rbbt/segment/token.rb
rbbt-text-1.3.7 lib/rbbt/segment/token.rb
rbbt-text-1.3.6 lib/rbbt/segment/token.rb
rbbt-text-1.3.5 lib/rbbt/segment/token.rb
rbbt-text-1.3.4 lib/rbbt/segment/token.rb
rbbt-text-1.3.3 lib/rbbt/segment/token.rb
rbbt-text-1.3.2 lib/rbbt/segment/token.rb
rbbt-text-1.3.1 lib/rbbt/segment/token.rb
rbbt-text-1.3.0 lib/rbbt/segment/token.rb