lib/support/base_tokenizer.rb in rley-0.7.00 vs lib/support/base_tokenizer.rb in rley-0.7.01

- old
+ new

@@ -35,17 +35,18 @@
   protected

   # Patterns:
   # Unambiguous single character
-  # Conditional single character (e.g. '+' operator, '+' prefix for positive numbers)
+  # Conditional single character:
+  # (e.g. '+' operator, '+' prefix for positive numbers)
   def _next_token
     skip_whitespaces
     curr_ch = scanner.peek(1)
     return nil if curr_ch.nil? || curr_ch.empty?

-    token = recognize_token()
+    token = recognize_token
     if token.nil? # Unknown token
       curr_ch = scanner.peek(1)
       erroneous = curr_ch.nil? ? '' : scanner.scan(/./)
       sequel = scanner.scan(/.{1,20}/)
       erroneous += sequel unless sequel.nil?
@@ -53,44 +54,12 @@
     end

     return token
   end

-  def recognize_token()
-=begin
-    if "()'`".include? curr_ch # Single characters
-      # Delimiters, separators => single character token
-      token = build_token(@@lexeme2name[curr_ch], scanner.getch)
-    elsif (lexeme = scanner.scan(/(?:\.)(?=\s)/)) # Single char occurring alone
-      token = build_token('PERIOD', lexeme)
-    elsif (lexeme = scanner.scan(/,@?/))
-      token = build_token(@@lexeme2name[lexeme], lexeme)
-    elsif (lexeme = scanner.scan(/#(?:(?:true)|(?:false)|(?:u8)|[\\\(tfeiodx]|(?:\d+[=#]))/))
-      token = cardinal_token(lexeme)
-    elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?=\s|[|()";]|$)/))
-      token = build_token('INTEGER', lexeme) # Decimal radix
-    elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?:\.[0-9]+)?(?:(?:e|E)[+-]?[0-9]+)?/))
-      # Order dependency: must be tested after INTEGER case
-      token = build_token('REAL', lexeme)
-    elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
-      token = build_token('STRING_LIT', lexeme)
-    elsif (lexeme = scanner.scan(/[a-zA-Z!$%&*\/:<=>?@^_~][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
-      keyw = @@keywords[lexeme.upcase]
-      tok_type = keyw ? keyw : 'IDENTIFIER'
-      token = build_token(tok_type, lexeme)
-    elsif (lexeme = scanner.scan(/\|(?:[^|])*\|/)) # Vertical bar delimited
-      token = build_token('IDENTIFIER', lexeme)
-    elsif (lexeme = scanner.scan(/([\+\-])((?=\s|[|()";])|$)/))
-      # # R7RS peculiar identifiers case 1: isolated plus and minus as identifiers
-      token = build_token('IDENTIFIER', lexeme)
-    elsif (lexeme = scanner.scan(/[+-][a-zA-Z!$%&*\/:<=>?@^_~+-@][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
-      # R7RS peculiar identifiers case 2
-      token = build_token('IDENTIFIER', lexeme)
-    elsif (lexeme = scanner.scan(/\.[a-zA-Z!$%&*\/:<=>?@^_~+-@.][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
-      # R7RS peculiar identifiers case 4
-      token = build_token('IDENTIFIER', lexeme)
-=end
+  def recognize_token
+    raise NotImplementedError
   end

   def build_token(aSymbolName, aLexeme, aFormat = :default)
     begin
       value = convert_to(aLexeme, aSymbolName, aFormat)
@@ -103,11 +72,11 @@
     end

     return token
   end

-  def convert_to(aLexeme, aSymbolName, aFormat)
+  def convert_to(aLexeme, _symbol_name, _format)
     return aLexeme
   end

   def skip_whitespaces
     pre_pos = scanner.pos
@@ -122,15 +91,15 @@
         ws_found = true
         next_line
       end
       # next_ch = scanner.peek(1)
       # if next_ch == ';'
-      #  cmt_found = true
-      #  scanner.skip(/;[^\r\n]*(?:(?:\r\n)|\r|\n)?/)
-      #  next_line
+      #   cmt_found = true
+      #   scanner.skip(/;[^\r\n]*(?:(?:\r\n)|\r|\n)?/)
+      #   next_line
       # end

-      break unless ws_found or cmt_found
+      break unless ws_found || cmt_found
     end

     curr_pos = scanner.pos
     return if curr_pos == pre_pos
   end
@@ -138,60 +107,5 @@
   def next_line
     @lineno += 1
     @line_start = scanner.pos
   end
 end # class
-=begin
-require 'base_tokenizer'
-
-class PB_Tokenizer < BaseTokenizer
-  @@lexeme2name = {
-    '(' => 'LPAREN',
-    ')' => 'RPAREN',
-    '+' => 'PLUS',
-  }.freeze
-
-  protected
-
-  def recognize_token()
-    token = nil
-    curr_ch = scanner.peek(1)
-
-    if '()'.include? curr_ch # Single characters
-      # Delimiters, separators => single character token
-      token = build_token(@@lexeme2name[curr_ch], scanner.getch)
-    elsif (lexeme = scanner.scan(/(?:\+)(?=\s)/)) # Single char occurring alone
-      token = build_token(@@lexeme2name[lexeme], lexeme)
-    elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
-      token = build_token('INTEGER', lexeme)
-    end
-  end
-end # class
-
-  # Basic tokenizer
-  # @return [Array<Rley::Lexical::Token>]
-  def tokenize(aText)
-    tokenizer = PB_Tokenizer.new(aText)
-    tokenizer.token
-  end
-=end
-=begin
-  # Basic expression tokenizer
-  def tokenize(aText)
-    tokens = aText.scan(/\S+/).map do |lexeme|
-      case lexeme
-      when '+', '(', ')'
-        terminal = @grammar.name2symbol[lexeme]
-      when /^[-+]?\d+$/
-        terminal = @grammar.name2symbol['int']
-      else
-        msg = "Unknown input text '#{lexeme}'"
-        raise StandardError, msg
-      end
-      pos = Rley::Lexical::Position.new(1, 4) # Dummy position
-      Rley::Lexical::Token.new(lexeme, terminal, pos)
-    end
-
-    return tokens
-  end
-=end
\ No newline at end of file
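
Since 0.7.01 makes recognize_token abstract (it now raises NotImplementedError), every concrete tokenizer must override it. Below is a minimal sketch of such a subclass, adapted from the commented-out PB_Tokenizer sample that this release removes from the bottom of the file. It assumes only what the diff above shows: BaseTokenizer exposes the scanner (a StringScanner) and the protected build_token helper; the require_relative path is illustrative.

require_relative 'base_tokenizer' # illustrative path; adjust to your layout

class PB_Tokenizer < BaseTokenizer
  @@lexeme2name = {
    '(' => 'LPAREN',
    ')' => 'RPAREN',
    '+' => 'PLUS'
  }.freeze

  protected

  # Override of the method made abstract in 0.7.01.
  def recognize_token
    curr_ch = scanner.peek(1)

    if '()'.include? curr_ch
      # Delimiters, separators => single-character token
      build_token(@@lexeme2name[curr_ch], scanner.getch)
    elsif (lexeme = scanner.scan(/(?:\+)(?=\s)/)) # '+' occurring alone
      build_token(@@lexeme2name[lexeme], lexeme)
    elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
      build_token('INTEGER', lexeme)
    end
  end
end # class

Returning nil when no rule matches is deliberate: _next_token then falls through to its "Unknown token" branch, which is how unrecognized input gets reported.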