# -*- encoding: utf-8 -*-

require File.expand_path("../../helpers", __FILE__)

# Checks the tokens RS.scan returns for patterns that mix ASCII with
# multibyte UTF-8 characters.
#
# Each entry maps a pattern to the tokens expected at given positions in the
# scan result. Judging by the values, a token is laid out as
# [type, token, text, start, end], with start/end counted in bytes rather
# than characters (a 2-byte character advances the offset by 2, and so on).
class ScannerUTF8 < Test::Unit::TestCase
  expectations = {
    # Plain ASCII: one byte per character.
    'a' => {
      0 => [:literal, :literal, 'a', 0, 1],
    },

    'ab+' => {
      0 => [:literal,    :literal,     'ab', 0, 2],
      1 => [:quantifier, :one_or_more, '+',  2, 3],
    },

    # Arabic: two bytes per character.
    'aاbبcت' => {
      0 => [:literal, :literal, 'aاbبcت', 0, 9],
    },

    'aاbبت?' => {
      0 => [:literal,    :literal,     'aاbبت', 0, 8],
      1 => [:quantifier, :zero_or_one, '?',     8, 9],
    },

    'aا?bبcت+' => {
      0 => [:literal,    :literal,     'aا',   0,  3],
      1 => [:quantifier, :zero_or_one, '?',    3,  4],
      2 => [:literal,    :literal,     'bبcت', 4,  10],
      3 => [:quantifier, :one_or_more, '+',    10, 11],
    },

    'a(اbب+)cت?' => {
      0 => [:literal,    :literal,     'a',   0,  1],
      1 => [:group,      :capture,     '(',   1,  2],
      2 => [:literal,    :literal,     'اbب', 2,  7],
      3 => [:quantifier, :one_or_more, '+',   7,  8],
      4 => [:group,      :close,       ')',   8,  9],
      5 => [:literal,    :literal,     'cت',  9,  12],
      6 => [:quantifier, :zero_or_one, '?',   12, 13],
    },

    # Japanese: three bytes per character.
    'ab?れます+cd' => {
      0 => [:literal,    :literal,     'ab',     0,  2],
      1 => [:quantifier, :zero_or_one, '?',      2,  3],
      2 => [:literal,    :literal,     'れます', 3,  12],
      3 => [:quantifier, :one_or_more, '+',      12, 13],
      4 => [:literal,    :literal,     'cd',     13, 15],
    },

    # Osmanya: four bytes per character.
    '𐒀𐒁?𐒂ab+𐒃' => {
      0 => [:literal,    :literal,     '𐒀𐒁',  0,  8],
      1 => [:quantifier, :zero_or_one, '?',   8,  9],
      2 => [:literal,    :literal,     '𐒂ab', 9,  15],
      3 => [:quantifier, :one_or_more, '+',   15, 16],
      4 => [:literal,    :literal,     '𐒃',   16, 20],
    },

    # Musical symbols: also four bytes per character.
    'mu𝄞?si*𝄫c+' => {
      0 => [:literal,    :literal,      'mu𝄞', 0,  6],
      1 => [:quantifier, :zero_or_one,  '?',   6,  7],
      2 => [:literal,    :literal,      'si',  7,  9],
      3 => [:quantifier, :zero_or_more, '*',   9,  10],
      4 => [:literal,    :literal,      '𝄫c',  10, 15],
      5 => [:quantifier, :one_or_more,  '+',   15, 16],
    },
  }

  # Define one test method per pattern, numbered from 1 in table order.
  expectations.each_with_index do |(pattern, expected_tokens), index|
    define_method "test_scan_utf8_runs_#{index + 1}" do
      scanned = RS.scan(pattern)

      # Only the listed positions are compared; any further tokens in the
      # scan result are not inspected.
      expected_tokens.each do |position, expected|
        assert_equal(expected, scanned[position])
      end
    end
  end
end