Sha256: 43d04e3b1c99fce659d657d5e543ff5cf801c8dde7c343fc891ba3c97c052c1a

Contents?: true

Size: 1.64 KB

Versions: 2

Compression:

Stored size: 1.64 KB

Contents

# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Tokenizers
    # Breaks rule-based number format (RBNF) rule text into typed tokens.
    #
    # This class only supplies the list of token recognizers and the splitter
    # regex derived from them; the actual tokenization is delegated to
    # PatternTokenizer / Tokenizer.
    class RbnfTokenizer

      # Tokenizes a single RBNF rule pattern.
      #
      # pattern - the rule text to tokenize.
      #
      # Returns whatever PatternTokenizer#tokenize returns for the pattern.
      def tokenize(pattern)
        PatternTokenizer.new(nil, tokenizer).tokenize(pattern)
      end

      private

      # Builds the underlying Tokenizer on first use and memoizes it.
      def tokenizer
        @tokenizer ||= begin
          descriptors = recognizers

          # The splitter is the alternation of every recognizer's regex, wrapped
          # in a capture group. The plaintext catch-all is deliberately left out
          # of the splitter: its empty regex would match everywhere.
          splitter_source = descriptors.map { |r| r.regex.source }.join("|")
          splitter = Regexp.new("(#{splitter_source})")

          Tokenizer.new(
            descriptors + [
              TokenRecognizer.new(:plaintext, //)  # catch-all
            ], splitter
          )
        end
      end

      # Returns the ordered list of recognizers (without the plaintext
      # catch-all). Order matters because it is also the order of the
      # alternatives in the splitter regex.
      def recognizers
        [
          # special rule descriptors
          TokenRecognizer.new(:negative, /-x/),
          TokenRecognizer.new(:improper_fraction, /x\.x/),
          TokenRecognizer.new(:proper_fraction, /0\.x/),
          TokenRecognizer.new(:master, /x\.0/),

          # normal rule descriptors
          TokenRecognizer.new(:equals, /=/),
          TokenRecognizer.new(:rule, rule_regex),
          TokenRecognizer.new(:right_arrow, />/),
          TokenRecognizer.new(:left_arrow, /</),
          TokenRecognizer.new(:open_bracket, /\[/),
          TokenRecognizer.new(:close_bracket, /\]/),
          TokenRecognizer.new(:decimal, /[0#][0#,\.]+/),
          # Non-greedy on purpose: the token must end at the FIRST ")$" so that
          # each "$(...)$" substitution becomes its own token. A greedy ".*"
          # would swallow everything up to the last ")$" in the rule text.
          TokenRecognizer.new(:plural, /\$\(.*?\)\$/),

          # ending
          TokenRecognizer.new(:semicolon, /;/)
        ]
      end

      # Regex for rule set references, e.g. %spellout-numbering, %%2d-year
      def rule_regex
        if RUBY_VERSION <= "1.8.7"
          /%%?[\w\-]+/u
        else
          # Single-quoted so the "\-" escape actually reaches the regex engine
          # (a double-quoted string would silently drop the backslash).
          Regexp.new('%%?[[:word:]\-]+')
        end
      end

    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
twitter_cldr-3.1.1 lib/twitter_cldr/tokenizers/numbers/rbnf_tokenizer.rb
twitter_cldr-3.1.0 lib/twitter_cldr/tokenizers/numbers/rbnf_tokenizer.rb