require "rly/lex_token" module Rly # Exception, which is returned on unhandled lexing errors. class LexError < Exception; end # Base class for your lexer. # # Generally, you define a new lexer by subclassing Rly::Lex. Your code should # use methods {.token}, {.ignore}, {.literals}, {.on_error} to make the lexer # configuration (check the methods documentation for details). # # Once you got your lexer configured, you can create its instances passing a # String to be tokenized. You can then use {#next} method to get tokens. If you # have more string to tokenize, you can append it to input buffer at any time with # {#input}. class Lex # Tracks the current line number for generated tokens # # *lineno*'s value should be increased manually. Check the example for a demo # rule. # # @api semipublic # @return [Fixnum] current line number # # @example # token /\n+/ do |t| t.lexer.lineno = t.value.count("\n"); t end attr_accessor :lineno # Tracks the current position in the input string # # Genreally, it should only be used to skip a few characters in the error hander. # # @api semipublic # @return [Fixnum] index of a starting character for current token # # @example # on_error do |t| # t.lexer.pos += 1 # nil # skip the bad character # end attr_accessor :pos # Creates a new lexer instance for given input # # @api public # @param input [String] a string to be tokenized # @example # class MyLexer < Rly::Lex # ignore " " # token :LOWERS, /[a-z]+/ # token :UPPERS, /[A-Z]+/ # end # # lex = MyLexer.new("hello WORLD") # t = lex.next # puts "#{tok.type} -> #{tok.value}" #=> "LOWERS -> hello" # t = lex.next # puts "#{tok.type} -> #{tok.value}" #=> "UPPERS -> WORLD" # t = lex.next # => nil def initialize(input="") @input = input @pos = 0 @lineno = 0 end def inspect "#<#{self.class} pos=#{@pos} len=#{@input.length} lineno=#{@lineno}>" end # Appends string to input buffer # # The given string is appended to input buffer, further {#next} calls will # tokenize it as usual. # # @api public # # @example # lex = MyLexer.new("hello") # # t = lex.next # puts "#{tok.type} -> #{tok.value}" #=> "LOWERS -> hello" # t = lex.next # => nil # lex.input("WORLD") # t = lex.next # puts "#{tok.type} -> #{tok.value}" #=> "UPPERS -> WORLD" # t = lex.next # => nil def input(input) @input << input nil end # Processes the next token in input # # This is the main interface to lexer. It returns next available token or **nil** # if there are no more tokens available in the input string. # # {#each} Raises {LexError} if the input cannot be processed. This happens if # there were no matches by 'token' rules and no matches by 'literals' rule. # If the {.on_error} handler is not set, the exception will be raised immediately, # however, if the handler is set, the eception will be raised only if the {#pos} # after returning from error handler is still unchanged. 
    #
    # @api public
    # @raise [LexError] if the input cannot be processed
    # @return [LexToken] if the next chunk of input was processed successfully
    # @return [nil] if there are no more tokens available in input
    #
    # @example
    #   lex = MyLexer.new("hello WORLD")
    #
    #   t = lex.next
    #   puts "#{t.type} -> #{t.value}" #=> "LOWERS -> hello"
    #   t = lex.next
    #   puts "#{t.type} -> #{t.value}" #=> "UPPERS -> WORLD"
    #   t = lex.next # => nil
    def next
      while @pos < @input.length
        # Skip over ignored characters
        if self.class.ignores_list[@input[@pos]]
          @pos += 1
          next
        end

        # Try each token rule in the order of definition; the first rule
        # matching at the current position wins
        matched = false
        self.class.tokens.each do |type, rule, block|
          m = rule.match(@input, @pos)
          next unless m
          next unless m.begin(0) == @pos

          tok = LexToken.new(type, m[0], self)
          matched = true

          tok = block.call(tok) if block
          @pos = m.end(0)

          return tok if tok && tok.type
        end

        unless matched
          # No token rule matched; try the one-char literals
          if self.class.literals_list[@input[@pos]]
            tok = LexToken.new(@input[@pos], @input[@pos], self)
            matched = true
            @pos += 1
            return tok
          end
        end

        unless matched
          # Still no match: give the error handler a chance to recover, or raise
          if self.class.error_hander
            pos = @pos
            tok = LexToken.new(:error, @input[@pos], self)
            tok = self.class.error_hander.call(tok)
            if pos == @pos
              raise LexError.new("Illegal character '#{@input[@pos]}' at index #{@pos}")
            else
              return tok if tok && tok.type
            end
          else
            raise LexError.new("Illegal character '#{@input[@pos]}' at index #{@pos}")
          end
        end
      end
      return nil
    end

    class << self
      # Returns all terminals known to the lexer: named token types plus literals
      def terminals
        self.tokens.map { |t, r, b| t }.compact + self.literals_list.chars.to_a
      end

      # Returns the list of registered tokens
      #
      # @api private
      # @visibility protected
      # @return [Array] array of [type, regex, block] triples
      def tokens
        @tokens ||= []
      end

      # Returns the list of registered literals
      #
      # @api private
      # @visibility protected
      # @return [String] registered literals
      def literals_list
        @literals ||= ""
      end

      # Returns the list of registered ignorables
      #
      # @api private
      # @visibility protected
      # @return [String] registered ignorables
      def ignores_list
        @ignores ||= ""
      end

      # Returns the registered error handler, if any
      #
      # @api private
      # @visibility protected
      # @return [Proc] registered error handler
      def error_hander
        @error_block
      end

      private

      # @!group DSL Class Methods

      # Adds a token definition to the class
      #
      # This method adds a token definition to be later used to tokenize input.
      # It can be used to register normal tokens, and also functional tokens
      # (the latter are processed as usual but are not returned).
      #
      # @!visibility public
      # @api public
      # @param type [Symbol] token type. It should be an all-caps symbol by convention
      # @param regex [Regexp] a regular expression to match the token
      #
      # @yieldparam tok [LexToken] a new token instance for processed input
      # @yieldreturn [LexToken] the same or a modified token instance. Return
      #   nil to ignore the input.
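      #
      # Note that rules are tried in the order they were defined, and the
      # first rule that matches at the current position wins. Define more
      # specific tokens (keywords, multi-character operators) before generic
      # ones (identifiers); a short sketch of that ordering concern (class and
      # token names are illustrative):
      #
      # @example Rule order matters for overlapping patterns
      #   class KeywordLexer < Rly::Lex
      #     token :IF, /if/         # must be registered before :IDENT,
      #     token :IDENT, /[a-z]+/  # or "if" would come back as an IDENT
      #   end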
      # @see .literals
      # @see .ignores
      # @example
      #   class MyLexer < Rly::Lex
      #     token :LOWERS, /[a-z]+/   # this would match LOWERS on 1+ lowercase letters
      #
      #     token :INT, /\d+/ do |t|  # this would match on integers
      #       t.value = t.value.to_i  # additionally the value is converted to Fixnum
      #       t                       # the updated token is returned
      #     end
      #
      #     token /\n/ do |t|         # this would match on newlines
      #       t.lexer.lineno += 1     # the block will be executed on match, but
      #       t                       # no token will be returned (as no name is specified)
      #     end
      #   end
      def token(*args, &block)
        if args.length == 2
          self.tokens << [args[0], args[1], block]
        elsif args.length == 1
          self.tokens << [nil, args[0], block]
        else
          raise ArgumentError
        end
        nil
      end

      # Specifies a list of one-char literals
      #
      # Literals may be used when you have several one-character tokens and you
      # don't want to define them one by one using the {.token} method.
      #
      # @!visibility public
      # @api public
      # @param lit [String] the list of literals
      # @see .token
      # @example
      #   class MyLexer < Rly::Lex
      #     literals "+-/*"
      #   end
      #
      #   lex = MyLexer.new("+-")
      #   lex.each do |tok|
      #     puts "#{tok.type} -> #{tok.value}" #=> "+ -> +"
      #                                        #=> "- -> -"
      #   end
      def literals(lit)
        @literals = lit
        nil
      end

      # Specifies a list of one-char symbols to be ignored in input
      #
      # This method lets the lexer skip over formatting symbols (like tabs and
      # spaces) quickly.
      #
      # @!visibility public
      # @api public
      # @param ign [String] the list of ignored symbols
      # @see .token
      # @example
      #   class MyLexer < Rly::Lex
      #     literals "+-"
      #     token :INT, /\d+/
      #     ignore " \t"
      #   end
      #
      #   lex = MyLexer.new("2 + 2")
      #   lex.each do |tok|
      #     puts "#{tok.type} -> #{tok.value}" #=> "INT -> 2"
      #                                        #=> "+ -> +"
      #                                        #=> "INT -> 2"
      #   end
      def ignore(ign)
        @ignores = ign
        nil
      end

      # Specifies a block that should be called on error
      #
      # In case of a lexing error, the lexer first gives the developer a chance
      # to look at the failing character. If this block is not provided, a
      # lexing error always results in a {LexError}.
      #
      # You must increment the lexer's {#pos} as part of the action. You may
      # also return a new {LexToken}, or nil to skip the input.
      #
      # @!visibility public
      # @api public
      # @see .token
      # @example
      #   class MyLexer < Rly::Lex
      #     token :INT, /\d+/
      #     on_error do |tok|
      #       tok.lexer.pos += 1 # just skip the offending character
      #       nil
      #     end
      #   end
      #
      #   lex = MyLexer.new("123qwe")
      #   lex.each do |tok|
      #     puts "#{tok.type} -> #{tok.value}" #=> "INT -> 123"
      #   end
      def on_error(&block)
        @error_block = block
        nil
      end
    end
  end
end
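
# A compact end-to-end sketch of the DSL described above (the class name,
# token names and input are illustrative):
#
#   class CalcLexer < Rly::Lex
#     literals "+-*/()"
#     ignore " \t"
#
#     token :NUMBER, /\d+/ do |t|
#       t.value = t.value.to_i
#       t
#     end
#
#     on_error do |t|
#       t.lexer.pos += 1 # skip anything unexpected
#       nil
#     end
#   end
#
#   lex = CalcLexer.new("2 + 2")
#   lex.next.type #=> :NUMBER
#   lex.next.type #=> "+"
#   lex.next.type #=> :NUMBER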