Sha256: 43d04e3b1c99fce659d657d5e543ff5cf801c8dde7c343fc891ba3c97c052c1a
Contents?: true
Size: 1.64 KB
Versions: 2
Compression:
Stored size: 1.64 KB
Contents
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Tokenizers
    # Tokenizer for RBNF rule patterns (presumably ICU-style rule-based
    # number formatting rules such as "%spellout-numbering" -- confirm
    # against the callers of this class).
    #
    # The class wires a fixed list of TokenRecognizer objects into a
    # Tokenizer and exposes a single public entry point, #tokenize.
    class RbnfTokenizer

      # Tokenizes +pattern+ by delegating to a PatternTokenizer that wraps
      # this object's memoized Tokenizer.
      #
      # @param pattern the rule pattern to split (presumably a String)
      # @return whatever PatternTokenizer#tokenize returns for +pattern+
      def tokenize(pattern)
        pattern_tokenizer = PatternTokenizer.new(nil, tokenizer)
        pattern_tokenizer.tokenize(pattern)
      end

      private

      # Returns the Tokenizer, building it on first use and caching it in
      # @tokenizer for subsequent calls on the same instance.
      def tokenizer
        @tokenizer ||= build_tokenizer
      end

      # Assembles the Tokenizer: the descriptor recognizers followed by a
      # plaintext recognizer with an empty regex, plus a splitter regex that
      # is the alternation of every descriptor regex wrapped in one capture
      # group.
      def build_tokenizer
        recognizers = descriptor_recognizers

        alternation = recognizers.map { |recognizer| recognizer.regex.source }.join("|")
        splitter = Regexp.new("(#{alternation})")

        # catch-all
        fallback = TokenRecognizer.new(:plaintext, //)

        Tokenizer.new(recognizers + [fallback], splitter)
      end

      # Builds one TokenRecognizer per [type, regex] pair below. The order of
      # the pairs is significant for the splitter alternation and is kept
      # exactly as listed.
      def descriptor_recognizers
        descriptors = [
          # special rule descriptors
          [:negative,          /-x/],
          [:improper_fraction, /x\.x/],
          [:proper_fraction,   /0\.x/],
          [:master,            /x\.0/],

          # normal rule descriptors
          [:equals,            /=/],
          [:rule,              rule_name_regex],
          [:right_arrow,       />/],
          [:left_arrow,        /</],
          [:open_bracket,      /\[/],
          [:close_bracket,     /\]/],
          [:decimal,           /[0#][0#,\.]+/],
          [:plural,            /\$\(.*\)\$/],

          # ending
          [:semicolon,         /;/]
        ]

        descriptors.map { |type, regex| TokenRecognizer.new(type, regex) }
      end

      # Regex for rule references, i.e. %spellout-numbering, %%2d-year.
      #
      # Ruby 1.8.7 and older get a \w-based literal with the /u flag; newer
      # Rubies get a [[:word:]] POSIX bracket compiled at runtime through
      # Regexp.new.
      def rule_name_regex
        return /%%?[\w\-]+/u if RUBY_VERSION <= "1.8.7"

        Regexp.new("%%?[[:word:]\-]+")
      end

    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
twitter_cldr-3.1.1 | lib/twitter_cldr/tokenizers/numbers/rbnf_tokenizer.rb |
twitter_cldr-3.1.0 | lib/twitter_cldr/tokenizers/numbers/rbnf_tokenizer.rb |