lib/support/base_tokenizer.rb in rley-0.7.00 vs lib/support/base_tokenizer.rb in rley-0.7.01
- old
+ new
@@ -35,17 +35,18 @@
protected
# Patterns:
# Unambiguous single character
- # Conditional single character (e.g. '+' operator, '+' prefix for positive numbers)
+ # Conditional single character:
+ # (e.g. '+' operator, '+' prefix for positive numbers)
def _next_token
skip_whitespaces
curr_ch = scanner.peek(1)
return nil if curr_ch.nil? || curr_ch.empty?
- token = recognize_token()
+ token = recognize_token
if token.nil? # Unknown token
curr_ch = scanner.peek(1)
erroneous = curr_ch.nil? ? '' : scanner.scan(/./)
sequel = scanner.scan(/.{1,20}/)
erroneous += sequel unless sequel.nil?
@@ -53,44 +54,12 @@
end
return token
end
- def recognize_token()
-=begin
- if "()'`".include? curr_ch # Single characters
- # Delimiters, separators => single character token
- token = build_token(@@lexeme2name[curr_ch], scanner.getch)
- elsif (lexeme = scanner.scan(/(?:\.)(?=\s)/)) # Single char occurring alone
- token = build_token('PERIOD', lexeme)
- elsif (lexeme = scanner.scan(/,@?/))
- token = build_token(@@lexeme2name[lexeme], lexeme)
- elsif (lexeme = scanner.scan(/#(?:(?:true)|(?:false)|(?:u8)|[\\\(tfeiodx]|(?:\d+[=#]))/))
- token = cardinal_token(lexeme)
- elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?=\s|[|()";]|$)/))
- token = build_token('INTEGER', lexeme) # Decimal radix
- elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?:\.[0-9]+)?(?:(?:e|E)[+-]?[0-9]+)?/))
- # Order dependency: must be tested after INTEGER case
- token = build_token('REAL', lexeme)
- elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
- token = build_token('STRING_LIT', lexeme)
- elsif (lexeme = scanner.scan(/[a-zA-Z!$%&*\/:<=>?@^_~][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
- keyw = @@keywords[lexeme.upcase]
- tok_type = keyw ? keyw : 'IDENTIFIER'
- token = build_token(tok_type, lexeme)
- elsif (lexeme = scanner.scan(/\|(?:[^|])*\|/)) # Vertical bar delimited
- token = build_token('IDENTIFIER', lexeme)
- elsif (lexeme = scanner.scan(/([\+\-])((?=\s|[|()";])|$)/))
- # # R7RS peculiar identifiers case 1: isolated plus and minus as identifiers
- token = build_token('IDENTIFIER', lexeme)
- elsif (lexeme = scanner.scan(/[+-][a-zA-Z!$%&*\/:<=>?@^_~+-@][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
- # R7RS peculiar identifiers case 2
- token = build_token('IDENTIFIER', lexeme)
- elsif (lexeme = scanner.scan(/\.[a-zA-Z!$%&*\/:<=>?@^_~+-@.][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
- # R7RS peculiar identifiers case 4
- token = build_token('IDENTIFIER', lexeme)
-=end
# Hook invoked by `_next_token` to recognize the token located at the
# scanner's current position. `_next_token` treats a nil result as an
# unknown token.
# This default implementation is intentionally abstract: it always raises,
# and the message names the receiver's class so that a missing override is
# easy to trace.
# @raise [NotImplementedError] always
def recognize_token
  raise NotImplementedError,
        "#{self.class}#recognize_token must be implemented"
end
def build_token(aSymbolName, aLexeme, aFormat = :default)
begin
value = convert_to(aLexeme, aSymbolName, aFormat)
@@ -103,11 +72,11 @@
end
return token
end
- def convert_to(aLexeme, aSymbolName, aFormat)
# Default lexeme-to-value conversion: the lexeme is handed back untouched.
# `build_token` calls this method to obtain the value of the token.
# @param aLexeme the scanned text (presumably a String -- confirm with callers)
# @param _symbol_name ignored by this implementation
# @param _format ignored by this implementation
# @return the very same object that was passed as aLexeme
def convert_to(aLexeme, _symbol_name, _format)
  aLexeme
end
def skip_whitespaces
pre_pos = scanner.pos
@@ -122,15 +91,15 @@
ws_found = true
next_line
end
# next_ch = scanner.peek(1)
# if next_ch == ';'
- # cmt_found = true
- # scanner.skip(/;[^\r\n]*(?:(?:\r\n)|\r|\n)?/)
- # next_line
+ # cmt_found = true
+ # scanner.skip(/;[^\r\n]*(?:(?:\r\n)|\r|\n)?/)
+ # next_line
# end
- break unless ws_found or cmt_found
+ break unless ws_found || cmt_found
end
curr_pos = scanner.pos
return if curr_pos == pre_pos
end
@@ -138,60 +107,5 @@
# Line bookkeeping: increment the current line number, then record the
# scanner's current position as the start offset of the new line.
# @return the scanner position stored in @line_start
def next_line
  @lineno = @lineno + 1
  @line_start = scanner.pos
end
end # class
-=begin
-require 'base_tokenizer'
-
-class PB_Tokenizer < BaseTokenizer
- @@lexeme2name = {
- '(' => 'LPAREN',
- ')' => 'RPAREN',
- '+' => 'PLUS',
- }.freeze
-
- protected
-
- def recognize_token()
- token = nil
- curr_ch = scanner.peek(1)
-
- if '()'.include? curr_ch # Single characters
- # Delimiters, separators => single character token
- token = build_token(@@lexeme2name[curr_ch], scanner.getch)
- elsif (lexeme = scanner.scan(/(?:\+)(?=\s)/)) # Single char occurring alone
- token = build_token(@@lexeme2name[lexeme], lexeme)
- elsif (lexeme = scanner.scan(/[+-]?[0-9]+/))
- token = build_token('INTEGER', lexeme)
- end
- end
-end # class
-
- # Basic tokenizer
- # @return [Array<Rley::Lexical::Token>]
- def tokenize(aText)
- tokenizer = PB_Tokenizer.new(aText)
- tokenizer.token
- end
-
-=end
-=begin
- # Basic expression tokenizer
- def tokenize(aText)
- tokens = aText.scan(/\S+/).map do |lexeme|
- case lexeme
- when '+', '(', ')'
- terminal = @grammar.name2symbol[lexeme]
- when /^[-+]?\d+$/
- terminal = @grammar.name2symbol['int']
- else
- msg = "Unknown input text '#{lexeme}'"
- raise StandardError, msg
- end
- pos = Rley::Lexical::Position.new(1, 4) # Dummy position
- Rley::Lexical::Token.new(lexeme, terminal, pos)
- end
-
- return tokens
- end
-=end
\ No newline at end of file