# encoding: utf-8 # Copyright (C) 2016 Mikio Ikoma require "strscan" require 'regextest/common' require "pp" class Regextest::Front::Scanner include Regextest::Common # A table for lexical analysis of elements # (Not strict analisis here.) LexCodeLiteral = %r!\\x[0-9A-Fa-f]{1,2}|\\[0-7]{2,3}! LexTable = [ [:LEX_OCTET, %r!\\[0123][0-7]{2}|\\[123][0-7]! ], [:LEX_BACK_REFER, %r!\\[1-9]\d*! ], [:LEX_CODE_LITERAL, LexCodeLiteral ], [:LEX_NAMED_REFER, %r!\\k[<'](?:\-?\d+|\w+)(?:[\+\-]\d+)?[>']! ], [:LEX_NAMED_GENERATE, %r!\\g[<'][\+\-]?\w+[>']! ], [:LEX_CONTROL_LETTER, %r!\\c[a-zA-Z]|\\C-[a-zA-Z]|\\c\\\\|\\C-\\\\! ], [:LEX_META_LETTER, %r!\\M-(?:[a-zA-Z]|\\C-[a-zA-Z])! ], [:LEX_ESCAPED_LETTER, %r!\\[tvnrfae #\{\}\[\]\(\)]! ], # \b is an anchor out of bracket [:LEX_UNICODE, %r!\\u\h{4}|\\u\{\h{1,6}(?:\s+\h{1,6})*\}! ], # [:LEX_POSIX_CHAR_CLASS, # invalid out of bracket # %r!\[:\w+:\]! ], [:LEX_SIMPLIFIED_CLASS, %r!\\[wWsSdDhH]! ], [:LEX_UNICODE_CLASS, %r!\\[pP]\{\^?\w+\}! ], [:LEX_QUANTIFIER, %r![\?\*\+][\?\+]?|\{\d+(?:,\d*)?\}\??|\{,\d+\}\??! ], [:LEX_COMMENT, %r!\(\?\#(?:\\\)|[^\)])*\)! ], [:LEX_OPTION_PAREN_1, %r!\(\?(?=\w*x)[imxdau]+(?:\-[im]*)?\)! ], # (?x-im) [:LEX_OPTION_PAREN_2, %r!\(\?[imxdau]*(?:\-(?=\w*x)[imx]+)?\)! ], # (?im-x) [:LEX_PAREN_START_EX1, %r!\(\?(?=\w*x)[imxdau]+(?:\-[im]*)?(?::|(?=\)))! ], # (?x-im: ... ) [:LEX_PAREN_START_EX2, %r!\(\?[imxdau]*(?:\-(?=\w*x)[imx]+)?(?::|(?=\)))! ], # (?im-x: ... ) [:LEX_PAREN_START, %r!\(\?<\w+>| \(\?'\w+'| \(\?\(\d+\)| \(\?\(<\w+>\)| \(\?\('\w+'\)| \(\?[imdau]*(?:\-[im]*)?(?::|(?=\)))| # expressions independent of "x" \(\?\<[\=\!]| \(\?.| # for better error message \( !x ], [:LEX_PAREN_END, %r!\)! ], [:LEX_BRACKET, # considering nested bracket # %r!(?\[(?:\[:\^?\w+:\]|\\\]|\\.|[^\]\[]|\g)+\])! ], %r!(? \[\^?\]? # first part [, [^, [], [^] are possible (?:| # body part \[:\^?\w+:\]| \\\]| \\.| # \&\&\]| # &&] is valid... however not matched ']'. why? [^\]\[]| \g )+ \]) !x ], [:LEX_OR, %r!\|! ], [:LEX_ANC_LINE_BEGIN, %r!\^! ], [:LEX_ANC_LINE_END, %r!\$! ], [:LEX_ANC_WORD_BOUND, %r!\\b! ], [:LEX_ANC_WORD_UNBOUND, %r!\\B! ], [:LEX_ANC_STRING_BEGIN, %r!\\A! ], [:LEX_ANC_STRING_END, %r!\\z! ], [:LEX_ANC_STRING_END2, %r!\\Z! ], [:LEX_ANC_LOOK_BEHIND2, # included in Ruby but not in Onigmo %r!\\K! ], [:LEX_ANC_MATCH_START, %r!\\G! ], [:LEX_SPECIAL_LETTER, %r!\\[RX]! ], [:LEX_SHARP, %r!\#! ], [:LEX_ANY_LETTER, %r!\.! ], # [:LEX_EXTENDED_COMMENT, # commented out since there is dynamic syntax change using (?x: ...) # %r!\#.*?(?:\n|$)! ], [:LEX_NEW_LINE, %r!\n!m ], [:LEX_SPACE, %r!\s!m ], # [:LEX_ERROR, # commented out since /]/ or /foo # [/x is valid notation # %r![\[\]]! ], [:LEX_SIMPLE_ESCAPE, # redundant escape \@, \", etc. %r!\\.! ], # [:LEX_REGOPT_LETTER, # imxdau # %r![imxdau]! ], [:LEX_CHAR, %r!.! ], ] def initialize @lex_table = LexTable.dup whole_lex = @lex_table.map{|lex| "(?<#{lex[0]}>" + lex[1].source + ")"}.join('|') # puts whole_lex @reg = /^#{whole_lex}/mx end # def scan(test_string) results = [] match_string = test_string match_offset = 0 while (md = @reg.match(match_string)) scan_offset = @lex_table.index{|elem| md[elem[0]]} name = @lex_table[scan_offset][0] if(name != :LEX_ERROR) lex_word = md[name] match_length = md.end(0) results.push [name, [lex_word, match_offset, match_length]] match_string = md.post_match match_offset += match_length else offset = test_string.index(match_string) raise "Regexp syntax error, offset #{offset}, \n" + test_string + "\n" + (" "*offset) + "^" end end results.push [false, nil] TstLog("Scanned elements:\n#{results}") results end # method for testing def self.test(test_string, reg_options = nil) puts "String: #{test_string.inspect}" results = Regextest::Front::Scanner.new().scan(test_string) pp results end end # Test suite (execute when this file is specified in command line) if __FILE__ == $0 require 'kconv' puts "test #{__FILE__}" Regextest::Front::Scanner.test("(aa(bb))") Regextest::Front::Scanner.test('\x0a\070\n') Regextest::Front::Scanner.test('(\w+)\s\1') Regextest::Front::Scanner.test('\A(?|.|(?:(?.)\g\k))\z') Regextest::Front::Scanner.test("(?foo|(bar)*|(hoge|boke))") Regextest::Front::Scanner.test("^.+?ab{3}c{3,}d{4,5}e{,5}f??") Regextest::Front::Scanner.test("([abcd]aa(bb))") Regextest::Front::Scanner.test('([[ab][)ab][^\]ab])') Regextest::Front::Scanner.test('\uabcd\u3030\u{4040}\u{8 4040 ffffff}') Regextest::Front::Scanner.test('[\d&&[^47]]') # これはダメ。 Regextest::Front::Scanner.test('\p{ALPHA}\p{^NUMBER}\P{ALPHANUM}') Regextest::Front::Scanner.test('[[:alnum:][:^lower:]]') Regextest::Front::Scanner.test('(?# comment)') Regextest::Front::Scanner.test('ab # ', Regexp::EXTENDED) # BUG Regextest::Front::Scanner.test('aa(?)bb') # must be error end