# -*- coding: utf-8 -*- # module Rouge # @abstract # A stateful lexer that uses sets of regular expressions to # tokenize a string. Most lexers are instances of RegexLexer. class RegexLexer < Lexer # A rule is a tuple of a regular expression to test, and a callback # to perform if the test succeeds. # # @see StateDSL#rule class Rule attr_reader :callback attr_reader :re attr_reader :beginning_of_line def initialize(re, callback) @re = re @callback = callback @beginning_of_line = re.source[0] == ?^ end def inspect "#" end end # a State is a named set of rules that can be tested for or # mixed in. # # @see RegexLexer.state class State attr_reader :name, :rules def initialize(name, rules) @name = name @rules = rules end def inspect "#<#{self.class.name} #{@name.inspect}>" end end class StateDSL attr_reader :rules def initialize(name, &defn) @name = name @defn = defn @rules = [] end def to_state(lexer_class) load! rules = @rules.map do |rule| rule.is_a?(String) ? lexer_class.get_state(rule) : rule end State.new(@name, rules) end def prepended(&defn) parent_defn = @defn StateDSL.new(@name) do instance_eval(&defn) instance_eval(&parent_defn) end end def appended(&defn) parent_defn = @defn StateDSL.new(@name) do instance_eval(&parent_defn) instance_eval(&defn) end end protected # Define a new rule for this state. # # @overload rule(re, token, next_state=nil) # @overload rule(re, &callback) # # @param [Regexp] re # a regular expression for this rule to test. # @param [String] tok # the token type to yield if `re` matches. # @param [#to_s] next_state # (optional) a state to push onto the stack if `re` matches. # If `next_state` is `:pop!`, the state stack will be popped # instead. # @param [Proc] callback # a block that will be evaluated in the context of the lexer # if `re` matches. This block has access to a number of lexer # methods, including {RegexLexer#push}, {RegexLexer#pop!}, # {RegexLexer#token}, and {RegexLexer#delegate}. The first # argument can be used to access the match groups. def rule(re, tok=nil, next_state=nil, &callback) if tok.nil? && callback.nil? raise "please pass `rule` a token to yield or a callback" end callback ||= case next_state when :pop! proc do |stream| puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug @output_stream.call(tok, stream[0]) puts " popping stack: #{1}" if @debug @stack.pop or raise 'empty stack!' end when :push proc do |stream| puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug @output_stream.call(tok, stream[0]) puts " pushing #{@stack.last.name}" if @debug @stack.push(@stack.last) end when Symbol proc do |stream| puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug @output_stream.call(tok, stream[0]) state = @states[next_state] || self.class.get_state(next_state) puts " pushing #{state.name}" if @debug @stack.push(state) end when nil proc do |stream| puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug @output_stream.call(tok, stream[0]) end else raise "invalid next state: #{next_state.inspect}" end rules << Rule.new(re, callback) end # Mix in the rules from another state into this state. The rules # from the mixed-in state will be tried in order before moving on # to the rest of the rules in this state. def mixin(state) rules << state.to_s end private def load! return if @loaded @loaded = true instance_eval(&@defn) end end # The states hash for this lexer. # @see state def self.states @states ||= {} end def self.state_definitions @state_definitions ||= InheritableHash.new(superclass.state_definitions) end @state_definitions = {} def self.replace_state(name, new_defn) states[name] = nil state_definitions[name] = new_defn end # The routines to run at the beginning of a fresh lex. # @see start def self.start_procs @start_procs ||= InheritableList.new(superclass.start_procs) end @start_procs = [] # Specify an action to be run every fresh lex. # # @example # start { puts "I'm lexing a new string!" } def self.start(&b) start_procs << b end # Define a new state for this lexer with the given name. # The block will be evaluated in the context of a {StateDSL}. def self.state(name, &b) name = name.to_s state_definitions[name] = StateDSL.new(name, &b) end def self.prepend(name, &b) name = name.to_s dsl = state_definitions[name] or raise "no such state #{name.inspect}" replace_state(name, dsl.prepended(&b)) end def self.append(state, &b) name = name.to_s dsl = state_definitions[name] or raise "no such state #{name.inspect}" replace_state(name, dsl.appended(&b)) end # @private def self.get_state(name) return name if name.is_a? State states[name.to_sym] ||= begin defn = state_definitions[name.to_s] or raise "unknown state: #{name.inspect}" defn.to_state(self) end end # @private def get_state(state_name) self.class.get_state(state_name) end # The state stack. This is initially the single state `[:root]`. # It is an error for this stack to be empty. # @see #state def stack @stack ||= [get_state(:root)] end # The current state - i.e. one on top of the state stack. # # NB: if the state stack is empty, this will throw an error rather # than returning nil. def state stack.last or raise 'empty stack!' end # reset this lexer to its initial state. This runs all of the # start_procs. def reset! @stack = nil @current_stream = nil self.class.start_procs.each do |pr| instance_eval(&pr) end end # This implements the lexer protocol, by yielding [token, value] pairs. # # The process for lexing works as follows, until the stream is empty: # # 1. We look at the state on top of the stack (which by default is # `[:root]`). # 2. Each rule in that state is tried until one is successful. If one # is found, that rule's callback is evaluated - which may yield # tokens and manipulate the state stack. Otherwise, one character # is consumed with an `'Error'` token, and we continue at (1.) # # @see #step #step (where (2.) is implemented) def stream_tokens(str, &b) stream = StringScanner.new(str) @current_stream = stream @output_stream = b @states = self.class.states @null_steps = 0 until stream.eos? if @debug puts "lexer: #{self.class.tag}" puts "stack: #{stack.map(&:name).inspect}" puts "stream: #{stream.peek(20).inspect}" end success = step(state, stream) if !success puts " no match, yielding Error" if @debug b.call(Token::Tokens::Error, stream.getch) end end end # The number of successive scans permitted without consuming # the input stream. If this is exceeded, the match fails. MAX_NULL_SCANS = 5 # Runs one step of the lex. Rules in the current state are tried # until one matches, at which point its callback is called. # # @return true if a rule was tried successfully # @return false otherwise. def step(state, stream) state.rules.each do |rule| if rule.is_a?(State) puts " entering mixin #{rule.name}" if @debug return true if step(rule, stream) puts " exiting mixin #{rule.name}" if @debug else puts " trying #{rule.inspect}" if @debug # XXX HACK XXX # StringScanner's implementation of ^ is b0rken. # see http://bugs.ruby-lang.org/issues/7092 # TODO: this doesn't cover cases like /(a|^b)/, but it's # the most common, for now... next if rule.beginning_of_line && !stream.beginning_of_line? if size = stream.skip(rule.re) puts " got #{stream[0].inspect}" if @debug instance_exec(stream, &rule.callback) if size.zero? @null_steps += 1 if @null_steps > MAX_NULL_SCANS puts " too many scans without consuming the string!" if @debug return false end else @null_steps = 0 end return true end end end false end # Yield a token. # # @param tok # the token type # @param val # (optional) the string value to yield. If absent, this defaults # to the entire last match. def token(tok, val=@current_stream[0]) yield_token(tok, val) end # @deprecated # # Yield a token with the next matched group. Subsequent calls # to this method will yield subsequent groups. def group(tok) raise "RegexLexer#group is deprecated: use #groups instead" end # Yield tokens corresponding to the matched groups of the current # match. def groups(*tokens) tokens.each_with_index do |tok, i| yield_token(tok, @current_stream[i+1]) end end # Delegate the lex to another lexer. The #lex method will be called # with `:continue` set to true, so that #reset! will not be called. # In this way, a single lexer can be repeatedly delegated to while # maintaining its own internal state stack. # # @param [#lex] lexer # The lexer or lexer class to delegate to # @param [String] text # The text to delegate. This defaults to the last matched string. def delegate(lexer, text=nil) puts " delegating to #{lexer.inspect}" if @debug text ||= @current_stream[0] lexer.lex(text, :continue => true) do |tok, val| puts " delegated token: #{tok.inspect}, #{val.inspect}" if @debug yield_token(tok, val) end end def recurse(text=nil) delegate(self.class, text) end # Push a state onto the stack. If no state name is given and you've # passed a block, a state will be dynamically created using the # {StateDSL}. def push(state_name=nil, &b) push_state = if state_name get_state(state_name) elsif block_given? StateDSL.new(b.inspect, &b).to_state(self.class) else # use the top of the stack by default self.state end puts " pushing #{push_state.name}" if @debug stack.push(push_state) end # Pop the state stack. If a number is passed in, it will be popped # that number of times. def pop!(times=1) raise 'empty stack!' if stack.empty? puts " popping stack: #{times}" if @debug stack.pop(times) nil end # replace the head of the stack with the given state def goto(state_name) raise 'empty stack!' if stack.empty? puts " going to state #{state_name} " if @debug stack[-1] = get_state(state_name) end # reset the stack back to `[:root]`. def reset_stack puts ' resetting stack' if @debug stack.clear stack.push get_state(:root) end # Check if `state_name` is in the state stack. def in_state?(state_name) state_name = state_name.to_s stack.any? do |state| state.name == state_name.to_s end end # Check if `state_name` is the state on top of the state stack. def state?(state_name) state_name.to_s == state.name end private def yield_token(tok, val) return if val.nil? || val.empty? puts " yielding #{tok.qualname}, #{val.inspect}" if @debug @output_stream.yield(tok, val) end end end