# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'spec_helper'

describe TwitterCldr::Tokenizers::UnicodeRegexTokenizer do
  describe "#tokenize" do
    let(:tokenizer) { described_class.new }

    def tokenize(str)
      tokenizer.tokenize(str)
    end

    it "should tokenize a regular regex" do
      got = tokenize("^(ab)xy$")
      expected = [
        { value: "^", type: :negate },
        { value: "(", type: :special_char },
        { value: "a", type: :string },
        { value: "b", type: :string },
        { value: ")", type: :special_char },
        { value: "x", type: :string },
        { value: "y", type: :string },
        { value: "$", type: :special_char }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing a basic character class" do
      got = tokenize("a[bc]d")
      expected = [
        { value: "a", type: :string },
        { value: "[", type: :open_bracket },
        { value: "b", type: :string },
        { value: "c", type: :string },
        { value: "]", type: :close_bracket },
        { value: "d", type: :string }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing unicode character sets" do
      got = tokenize("\\p{Zs}[:Lu:]")
      expected = [
        { value: "\\p{Zs}", type: :character_set },
        { value: "[:Lu:]", type: :character_set }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing escaped characters" do
      got = tokenize("^[a\\b]\\$")
      expected = [
        { value: "^", type: :negate },
        { value: "[", type: :open_bracket },
        { value: "a", type: :string },
        { value: "\\b", type: :escaped_character },
        { value: "]", type: :close_bracket },
        { value: "\\$", type: :escaped_character }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing basic character ranges" do
      got = tokenize("[a-z0-9]|[ab]")
      expected = [
        { value: "[", type: :open_bracket },
        { value: "a", type: :string },
        { value: "-", type: :dash },
        { value: "z", type: :string },
        { value: "0", type: :string },
        { value: "-", type: :dash },
        { value: "9", type: :string },
        { value: "]", type: :close_bracket },
        { value: "|", type: :pipe },
        { value: "[", type: :open_bracket },
        { value: "a", type: :string },
        { value: "b", type: :string },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing escaped unicode characters" do
      got = tokenize("\\u0020[\\u0123-\\u0155]")
      expected = [
        { value: "\\u0020", type: :unicode_char },
        { value: "[", type: :open_bracket },
        { value: "\\u0123", type: :unicode_char },
        { value: "-", type: :dash },
        { value: "\\u0155", type: :unicode_char },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing variable substitutions" do
      got = tokenize("$CR(?:ab)[$LF]")
      expected = [
        { value: "$CR", type: :variable },
        { value: "(", type: :special_char },
        { value: "?", type: :special_char },
        { value: ":", type: :special_char },
        { value: "a", type: :string },
        { value: "b", type: :string },
        { value: ")", type: :special_char },
        { value: "[", type: :open_bracket },
        { value: "$LF", type: :variable },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing multichar strings" do
      got = tokenize("[{foo}bar]")
      expected = [
        { value: "[", type: :open_bracket },
        { value: "{foo}", type: :multichar_string },
        { value: "b", type: :string },
        { value: "a", type: :string },
        { value: "r", type: :string },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing negated character sets" do
      got = tokenize("[[:^N:]\\P{L}]")
      expected = [
        { value: "[", type: :open_bracket },
        { value: "[:^N:]", type: :negated_character_set },
        { value: "\\P{L}", type: :negated_character_set },
        { value: "]", type: :close_bracket }
      ]
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing some of everything" do
      got = tokenize("^[a-zb]?[^[\\p{Z}\\u0020-\\u007f]-[\\P{L}]-[[:N:]\\u0123]][:^CC:]*[{foo}]+$")
      expected = [
        { value: "^", type: :negate },
        { value: "[", type: :open_bracket },
        { value: "a", type: :string },
        { value: "-", type: :dash },
        { value: "z", type: :string },
        { value: "b", type: :string },
        { value: "]", type: :close_bracket },
        { value: "?", type: :special_char },
        { value: "[", type: :open_bracket },
        { value: "^", type: :negate },
        { value: "[", type: :open_bracket },
        { value: "\\p{Z}", type: :character_set },
        { value: "\\u0020", type: :unicode_char },
        { value: "-", type: :dash },
        { value: "\\u007f", type: :unicode_char },
        { value: "]", type: :close_bracket },
        { value: "-", type: :dash },
        { value: "[", type: :open_bracket },
        { value: "\\P{L}", type: :negated_character_set },
        { value: "]", type: :close_bracket },
        { value: "-", type: :dash },
        { value: "[", type: :open_bracket },
        { value: "[:N:]", type: :character_set },
        { value: "\\u0123", type: :unicode_char },
        { value: "]", type: :close_bracket },
        { value: "]", type: :close_bracket },
        { value: "[:^CC:]", type: :negated_character_set },
        { value: "*", type: :special_char },
        { value: "[", type: :open_bracket },
        { value: "{foo}", type: :multichar_string },
        { value: "]", type: :close_bracket },
        { value: "+", type: :special_char },
        { value: "$", type: :special_char }
      ]
      check_token_list(got, expected)
    end
  end
end