# -*- coding: utf-8 -*- #
# frozen_string_literal: true

module Rouge
  # @abstract
  # A stateful lexer that uses sets of regular expressions to
  # tokenize a string. Most lexers are instances of RegexLexer.
  class RegexLexer < Lexer
    class InvalidRegex < StandardError
      def initialize(re)
        @re = re
      end

      def to_s
        "regex #{@re.inspect} matches empty string, but has no predicate!"
      end
    end

    class ClosedState < StandardError
      attr_reader :state

      def initialize(state)
        @state = state
      end

      def rule
        @state.rules.last
      end

      def to_s
        rule = @state.rules.last
        msg = "State :#{state.name} cannot continue after #{rule.inspect}, which will always match."
        if rule.re.source.include?('*')
          msg += " Consider replacing * with +."
        end

        msg
      end
    end

    # A rule is a tuple of a regular expression to test, and a callback
    # to perform if the test succeeds.
    #
    # @see StateDSL#rule
    class Rule
      attr_reader :callback
      attr_reader :re
      attr_reader :beginning_of_line

      def initialize(re, callback)
        @re = re
        @callback = callback
        @beginning_of_line = re.source[0] == ?^
      end

      def inspect
        "#<Rule #{@re.inspect}>"
      end
    end

    # a State is a named set of rules that can be tested for or
    # mixed in.
    #
    # @see RegexLexer.state
    class State
      attr_reader :name, :rules

      def initialize(name, rules)
        @name = name
        @rules = rules
      end

      def inspect
        "#<#{self.class.name} #{@name.inspect}>"
      end
    end

    class StateDSL
      attr_reader :rules, :name

      def initialize(name, &defn)
        @name = name
        @defn = defn
        @rules = []
        @loaded = false
        @closed = false
      end

      def to_state(lexer_class)
        load!
        rules = @rules.map do |rule|
          rule.is_a?(String) ? lexer_class.get_state(rule) : rule
        end
        State.new(@name, rules)
      end

      def prepended(&defn)
        parent_defn = @defn
        StateDSL.new(@name) do
          instance_eval(&defn)
          instance_eval(&parent_defn)
        end
      end

      def appended(&defn)
        parent_defn = @defn
        StateDSL.new(@name) do
          instance_eval(&parent_defn)
          instance_eval(&defn)
        end
      end

      protected

      # Define a new rule for this state.
      #
      # @overload rule(re, token, next_state=nil)
      # @overload rule(re, &callback)
      #
      # @param [Regexp] re
      #   a regular expression for this rule to test.
      # @param [String] tok
      #   the token type to yield if `re` matches.
      # @param [#to_s] next_state
      #   (optional) a state to push onto the stack if `re` matches.
      #   If `next_state` is `:pop!`, the state stack will be popped
      #   instead.
      # @param [Proc] callback
      #   a block that will be evaluated in the context of the lexer
      #   if `re` matches. This block has access to a number of lexer
      #   methods, including {RegexLexer#push}, {RegexLexer#pop!},
      #   {RegexLexer#token}, and {RegexLexer#delegate}. The first
      #   argument can be used to access the match groups.
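      #
      # The following sketch shows how these forms are typically combined
      # in a lexer's `state` block; the token types and the `:string`
      # state here are illustrative assumptions, not definitions from
      # this file.
      #
      # @example
      #   state :root do
      #     rule %r/\s+/m, Text::Whitespace
      #     rule %r/"/, Str::Double, :string   # push the :string state
      #     rule %r/\d+/ do |m|
      #       token Num::Integer, m[0]         # callback form
      #     end
      #   end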
      def rule(re, tok=nil, next_state=nil, &callback)
        raise ClosedState.new(self) if @closed

        if tok.nil? && callback.nil?
          raise "please pass `rule` a token to yield or a callback"
        end

        matches_empty = re =~ ''

        callback ||= case next_state
        when :pop!
          proc do |stream|
            puts " yielding: #{tok.qualname}, #{stream[0].inspect}" if @debug
            @output_stream.call(tok, stream[0])

            puts " popping stack: 1" if @debug
            @stack.pop or raise 'empty stack!'
          end
        when :push
          proc do |stream|
            puts " yielding: #{tok.qualname}, #{stream[0].inspect}" if @debug
            @output_stream.call(tok, stream[0])

            puts " pushing :#{@stack.last.name}" if @debug
            @stack.push(@stack.last)
          end
        when Symbol
          proc do |stream|
            puts " yielding: #{tok.qualname}, #{stream[0].inspect}" if @debug
            @output_stream.call(tok, stream[0])

            state = @states[next_state] || self.class.get_state(next_state)
            puts " pushing :#{state.name}" if @debug
            @stack.push(state)
          end
        when nil
          # cannot use an empty-matching regexp with no predicate
          raise InvalidRegex.new(re) if matches_empty

          proc do |stream|
            puts " yielding: #{tok.qualname}, #{stream[0].inspect}" if @debug
            @output_stream.call(tok, stream[0])
          end
        else
          raise "invalid next state: #{next_state.inspect}"
        end

        rules << Rule.new(re, callback)
        close! if matches_empty && !context_sensitive?(re)
      end

      # A regex that can match the empty string may still leave the state
      # open if it is context-sensitive - i.e. it uses lookahead or
      # lookbehind and so will not match at every position.
      def context_sensitive?(re)
        source = re.source
        return true if source =~ /[(][?]<?[=!]/

        false
      end

      # Mix in the rules from another state into this state. The rules
      # from the mixed-in state will be tried in order before moving on
      # to the rest of the rules in this state.
      def mixin(state)
        rules << state.to_s
      end

      private

      def load!
        return if @loaded

        @loaded = true
        instance_eval(&@defn)
      end

      def close!
        @closed = true
      end
    end

    # The number of successive empty matches permitted before the
    # current step gives up.
    MAX_NULL_SCANS = 5

    # Runs one step of the lex. Rules in the current state are tried
    # until one matches, at which point its callback is called.
    #
    # @return true if a rule was tried successfully
    # @return false otherwise.
    def step(state, stream)
      state.rules.each do |rule|
        if rule.is_a?(State)
          puts " entering mixin :#{rule.name}" if @debug
          return true if step(rule, stream)
          puts " exiting mixin :#{rule.name}" if @debug
        else
          puts " trying #{rule.inspect}" if @debug

          # StringScanner anchors ^ at the scan pointer, so
          # beginning-of-line rules are checked explicitly here.
          next if rule.beginning_of_line && !stream.beginning_of_line?

          if size = stream.skip(rule.re)
            puts " got #{stream[0].inspect}" if @debug

            instance_exec(stream, &rule.callback)

            if size.zero?
              @null_steps += 1

              if @null_steps > MAX_NULL_SCANS
                puts " warning: too many scans without consuming the string!" if @debug
                return false
              end
            else
              @null_steps = 0
            end

            return true
          end
        end
      end

      false
    end

    # Yield a token.
    #
    # @param tok
    #   the token type
    # @param val
    #   (optional) the string value to yield. If absent, this defaults
    #   to the entire last match.
    def token(tok, val=@current_stream[0])
      yield_token(tok, val)
    end

    # @deprecated
    #
    # Yield a token with the next matched group. Subsequent calls
    # to this method will yield subsequent groups.
    def group(tok)
      raise "RegexLexer#group is deprecated: use #groups instead"
    end

    # Yield tokens corresponding to the matched groups of the current
    # match.
    def groups(*tokens)
      tokens.each_with_index do |tok, i|
        yield_token(tok, @current_stream[i+1])
      end
    end

    # Delegate the lex to another lexer. We use the `continue_lex` method
    # so that #reset! will not be called. In this way, a single lexer
    # can be repeatedly delegated to while maintaining its own internal
    # state stack.
    #
    # @param [#lex] lexer
    #   The lexer or lexer class to delegate to
    # @param [String] text
    #   The text to delegate. This defaults to the last matched string.
    def delegate(lexer, text=nil)
      puts " delegating to: #{lexer.inspect}" if @debug
      text ||= @current_stream[0]

      lexer.continue_lex(text) do |tok, val|
        puts " delegated token: #{tok.inspect}, #{val.inspect}" if @debug
        yield_token(tok, val)
      end
    end

    def recurse(text=nil)
      delegate(self.class, text)
    end

    # Push a state onto the stack. If no state name is given and you've
    # passed a block, a state will be dynamically created using the
    # {StateDSL}.
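    #
    # A short illustrative sketch of how a rule callback might use push
    # and pop! together; the :string state and the token types below are
    # assumptions for the example, not definitions from this file.
    #
    # @example
    #   state :root do
    #     rule %r/"/ do
    #       token Str::Double
    #       push :string
    #     end
    #   end
    #
    #   state :string do
    #     rule %r/[^"]+/, Str::Double
    #     rule %r/"/, Str::Double, :pop!
    #   end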
    def push(state_name=nil, &b)
      push_state = if state_name
        get_state(state_name)
      elsif block_given?
        StateDSL.new(b.inspect, &b).to_state(self.class)
      else
        # use the top of the stack by default
        self.state
      end

      puts " pushing: :#{push_state.name}" if @debug
      stack.push(push_state)
    end

    # Pop the state stack. If a number is passed in, it will be popped
    # that number of times.
    def pop!(times=1)
      raise 'empty stack!' if stack.empty?

      puts " popping stack: #{times}" if @debug
      stack.pop(times)

      nil
    end

    # replace the head of the stack with the given state
    def goto(state_name)
      raise 'empty stack!' if stack.empty?

      puts " going to: state :#{state_name}" if @debug
      stack[-1] = get_state(state_name)
    end

    # reset the stack back to `[:root]`.
    def reset_stack
      puts ' resetting stack' if @debug
      stack.clear
      stack.push get_state(:root)
    end

    # Check if `state_name` is in the state stack.
    def in_state?(state_name)
      state_name = state_name.to_sym
      stack.any? do |state|
        state.name == state_name.to_sym
      end
    end

    # Check if `state_name` is the state on top of the state stack.
    def state?(state_name)
      state_name.to_sym == state.name
    end

    private

    def yield_token(tok, val)
      return if val.nil? || val.empty?

      puts " yielding: #{tok.qualname}, #{val.inspect}" if @debug
      @output_stream.yield(tok, val)
    end
  end
end
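# A minimal usage sketch (an illustration, not part of this file): a
# subclass defines states with the class-level DSL and then lexes a
# string. The lexer name and the rules below are assumptions chosen for
# demonstration only.
#
#   require 'rouge'
#
#   class WordLexer < Rouge::RegexLexer
#     state :root do
#       rule %r/\s+/, Text::Whitespace
#       rule %r/\d+/, Num::Integer
#       rule %r/\w+/, Name
#       rule %r/./, Error
#     end
#   end
#
#   WordLexer.new.lex("foo 42").each { |tok, val| p [tok.qualname, val] }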