# encoding: UTF-8 # Copyright 2012 Twitter, Inc # http://www.apache.org/licenses/LICENSE-2.0 require 'spec_helper' include TwitterCldr::Parsers include TwitterCldr::Tokenizers describe UnicodeRegexParser do let(:tokenizer) { UnicodeRegexTokenizer.new } let(:parser) { UnicodeRegexParser.new } def tokenize(str) tokenizer.tokenize(str) end def parse(tokens, options = {}) parser.parse(tokens, options) end describe "#parse" do it "identifies ranges" do elements = parse(tokenize("[a-z]")) elements.first.should be_a(UnicodeRegexParser::CharacterClass) root = elements.first.send(:root) root.should be_a(UnicodeRegexParser::CharacterRange) root.initial.codepoints.should == "a".unpack("U*") root.final.codepoints.should == "z".unpack("U*") end it "replaces variables" do symbol_table = SymbolTable.new("$VAR" => tokenize("\\p{L}")) elements = parse(tokenize("($VAR)?"), :symbol_table => symbol_table) elements[1].should be_a(UnicodeRegexParser::CharacterSet) elements[1].property_value.should == "L" end it "handles character and negated character sets" do elements = parse(tokenize("\\p{L}[:^P:]\\P{L}[:P:]")) element = elements[0] element.should be_a(UnicodeRegexParser::CharacterSet) element.property_value.should == "L" element = elements[1] element.should be_a(UnicodeRegexParser::CharacterClass) element.send(:root).child.property_value.should == "P" element.send(:root).operator.should == :negate element = elements[2] element.should be_a(UnicodeRegexParser::CharacterClass) element.send(:root).child.property_value.should == "L" element = elements[3] element.should be_a(UnicodeRegexParser::CharacterSet) element.property_value.should == "P" end it "handles unicode characters" do elements = parse(tokenize("\\u0123")) elements[0].should be_a(UnicodeRegexParser::UnicodeString) elements[0].codepoints.should == [291] end it "handles multichar and escaped unicode strings" do elements = parse(tokenize("\\g{abc}")) elements[0].should be_a(UnicodeRegexParser::Literal) elements[0].text.should == "\\g" elements[1].should be_a(UnicodeRegexParser::UnicodeString) elements[1].codepoints.should == [97, 98, 99] end it "handles special chars" do elements = parse(tokenize("^(?:)$")) elements.each { |elem| elem.should be_a(UnicodeRegexParser::Literal) } elements[0].text.should == "^" elements[1].text.should == "(" elements[2].text.should == "?" elements[3].text.should == ":" elements[4].text.should == ")" elements[5].text.should == "$" end end end