# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'spec_helper'

include TwitterCldr::Tokenizers

# Specs for UnicodeRegexTokenizer#tokenize.
#
# Each example feeds a regex source string to the tokenizer and compares the
# resulting token list against a list of { :value, :type } hashes using
# check_token_list.
#
# NOTE(review): check_token_list is not defined in this file; presumably it is
# provided through spec_helper -- confirm before relying on its exact
# comparison semantics.
describe UnicodeRegexTokenizer do
  describe "#tokenize" do
    let(:tokenizer) { UnicodeRegexTokenizer.new }

    # Shorthand so each example can call tokenize(str) directly on the
    # tokenizer instance under test.
    def tokenize(str)
      tokenizer.tokenize(str)
    end

    it "should tokenize a regular regex" do
      got = tokenize("^(ab)xy$")
      expected = [
        { :value => "^", :type => :negate },
        { :value => "(", :type => :special_char },
        { :value => "a", :type => :string },
        { :value => "b", :type => :string },
        { :value => ")", :type => :special_char },
        { :value => "x", :type => :string },
        { :value => "y", :type => :string },
        { :value => "$", :type => :special_char }
      ]

      check_token_list(got, expected)
    end

    it "should tokenize a regex containing a basic character class" do
      got = tokenize("a[bc]d")
      expected = [
        { :value => "a", :type => :string },
        { :value => "[", :type => :open_bracket },
        { :value => "b", :type => :string },
        { :value => "c", :type => :string },
        { :value => "]", :type => :close_bracket },
        { :value => "d", :type => :string }
      ]

      check_token_list(got, expected)
    end

    it "should tokenize a regex containing unicode character sets" do
      got = tokenize("\\p{Zs}[:Lu:]")
      expected = [
        { :value => "\\p{Zs}", :type => :character_set },
        { :value => "[:Lu:]", :type => :character_set }
      ]

      check_token_list(got, expected)
    end

    it "should tokenize a regex containing escaped characters" do
      got = tokenize("^[a\\b]\\$")
      expected = [
        { :value => "^", :type => :negate },
        { :value => "[", :type => :open_bracket },
        { :value => "a", :type => :string },
        { :value => "\\b", :type => :escaped_character },
        { :value => "]", :type => :close_bracket },
        { :value => "\\$", :type => :escaped_character }
      ]

      check_token_list(got, expected)
    end

    it "should tokenize a regex containing basic character ranges" do
      got = tokenize("[a-z0-9]|[ab]")
      expected = [
        { :value => "[", :type => :open_bracket },
        { :value => "a", :type => :string },
        { :value => "-", :type => :dash },
        { :value => "z", :type => :string },
        { :value => "0", :type => :string },
        { :value => "-", :type => :dash },
        { :value => "9", :type => :string },
        { :value => "]", :type => :close_bracket },
        { :value => "|", :type => :pipe },
        { :value => "[", :type => :open_bracket },
        { :value => "a", :type => :string },
        { :value => "b", :type => :string },
        { :value => "]", :type => :close_bracket }
      ]

      check_token_list(got, expected)
    end

    it "should tokenize a regex containing escaped unicode characters" do
      got = tokenize("\\u0020[\\u0123-\\u0155]")
      expected = [
        { :value => "\\u0020", :type => :unicode_char },
        { :value => "[", :type => :open_bracket },
        { :value => "\\u0123", :type => :unicode_char },
        { :value => "-", :type => :dash },
        { :value => "\\u0155", :type => :unicode_char },
        { :value => "]", :type => :close_bracket }
      ]

      check_token_list(got, expected)
    end

    it "should tokenize a regex containing variable substitutions" do
      got = tokenize("$CR(?:ab)[$LF]")
      expected = [
        { :value => "$CR", :type => :variable },
        { :value => "(", :type => :special_char },
        { :value => "?", :type => :special_char },
        { :value => ":", :type => :special_char },
        { :value => "a", :type => :string },
        { :value => "b", :type => :string },
        { :value => ")", :type => :special_char },
        { :value => "[", :type => :open_bracket },
        { :value => "$LF", :type => :variable },
        { :value => "]", :type => :close_bracket }
      ]

      check_token_list(got, expected)
    end

    it "should tokenize a regex containing multichar strings" do
      got = tokenize("[{foo}bar]")
      expected = [
        { :value => "[", :type => :open_bracket },
        { :value => "{foo}", :type => :multichar_string },
        { :value => "b", :type => :string },
        { :value => "a", :type => :string },
        { :value => "r", :type => :string },
        { :value => "]", :type => :close_bracket }
      ]

      # Without this comparison the example would pass no matter what the
      # tokenizer returned.
      check_token_list(got, expected)
    end

    it "should tokenize a regex containing negated character sets" do
      got = tokenize("[[:^N:]\\P{L}]")
      expected = [
        { :value => "[", :type => :open_bracket },
        { :value => "[:^N:]", :type => :negated_character_set },
        { :value => "\\P{L}", :type => :negated_character_set },
        { :value => "]", :type => :close_bracket }
      ]

      check_token_list(got, expected)
    end

    it "should tokenize a regex containing some of everything" do
      got = tokenize("^[a-zb]?[^[\\p{Z}\\u0020-\\u007f]-[\\P{L}]-[[:N:]\\u0123]][:^CC:]*[{foo}]+$")
      expected = [
        { :value => "^", :type => :negate },
        { :value => "[", :type => :open_bracket },
        { :value => "a", :type => :string },
        { :value => "-", :type => :dash },
        { :value => "z", :type => :string },
        { :value => "b", :type => :string },
        { :value => "]", :type => :close_bracket },
        { :value => "?", :type => :special_char },
        { :value => "[", :type => :open_bracket },
        { :value => "^", :type => :negate },
        { :value => "[", :type => :open_bracket },
        { :value => "\\p{Z}", :type => :character_set },
        { :value => "\\u0020", :type => :unicode_char },
        { :value => "-", :type => :dash },
        { :value => "\\u007f", :type => :unicode_char },
        { :value => "]", :type => :close_bracket },
        { :value => "-", :type => :dash },
        { :value => "[", :type => :open_bracket },
        { :value => "\\P{L}", :type => :negated_character_set },
        { :value => "]", :type => :close_bracket },
        { :value => "-", :type => :dash },
        { :value => "[", :type => :open_bracket },
        { :value => "[:N:]", :type => :character_set },
        { :value => "\\u0123", :type => :unicode_char },
        { :value => "]", :type => :close_bracket },
        { :value => "]", :type => :close_bracket },
        { :value => "[:^CC:]", :type => :negated_character_set },
        { :value => "*", :type => :special_char },
        { :value => "[", :type => :open_bracket },
        { :value => "{foo}", :type => :multichar_string },
        { :value => "]", :type => :close_bracket },
        { :value => "+", :type => :special_char },
        { :value => "$", :type => :special_char }
      ]

      check_token_list(got, expected)
    end
  end
end