Sha256: a5fc4851d71f76e3632e0d51153d38f2a7bcd80c393998e79d0097256621455b
Contents?: true
Size: 890 Bytes
Versions: 19
Compression:
Stored size: 890 Bytes
Contents
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Tokenizers
    # Tokenizes segmentation rule patterns. Delegates the actual tokenizing
    # to a UnicodeRegexTokenizer that has been given two extra recognizers,
    # one for the break marker (÷) and one for the no-break marker (×).
    class SegmentationTokenizer
      # Tokenizes +pattern+ and discards every token whose value is empty
      # once surrounding whitespace has been stripped.
      #
      # @param pattern the pattern handed to the underlying tokenizer
      # @return the tokens produced by the underlying tokenizer, minus the
      #   whitespace-only ones
      def tokenize(pattern)
        all_tokens = tokenizer.tokenize(pattern)

        # according to the spec, whitespace should be ignored
        all_tokens.reject { |tok| tok.value.strip.empty? }
      end

      private

      # Lazily builds the underlying tokenizer and caches it in @tokenizer
      # so later calls reuse the same instance.
      def tokenizer
        @tokenizer ||= build_tokenizer
      end

      # Creates a UnicodeRegexTokenizer and registers the :break and
      # :no_break recognizers via insert_before(:string, ...). Each
      # recognizer's block strips whitespace from the matched value.
      def build_tokenizer
        # ÷ character
        break_recognizer = TokenRecognizer.new(:break, /\303\267/u) { |text| text.strip }
        # × character
        no_break_recognizer = TokenRecognizer.new(:no_break, /\303\227/u) { |text| text.strip }

        UnicodeRegexTokenizer.new.tap do |ur_tokenizer|
          ur_tokenizer.insert_before(:string, break_recognizer, no_break_recognizer)
        end
      end
    end
  end
end
Version data entries
19 entries across 19 versions & 2 rubygems