lib/rouge/regex_lexer.rb in rouge-1.1.0 vs lib/rouge/regex_lexer.rb in rouge-1.2.0
- old
+ new
@@ -8,22 +8,14 @@
#
# @see StateDSL#rule
class Rule
attr_reader :callback
attr_reader :re
+ attr_reader :beginning_of_line
# Stores the rule's regex and callback. On the new side of this diff the
# "regex starts with ^" flag is computed once here and exposed through the
# added `attr_reader :beginning_of_line`; the lazily-cached
# `beginning_of_line?` predicate on the old side is removed.
def initialize(re, callback)
@re = re
@callback = callback
- end
-
- # Does the regex start with a ^?
- #
- # Since Regexps are immuntable, this is cached to avoid
- # calling Regexp#source more than once.
- def beginning_of_line?
- return @beginning_of_line if instance_variable_defined?(:@beginning_of_line)
-
# Compares the first character of the regex source against "^".
@beginning_of_line = re.source[0] == ?^
end
def inspect
"#<Rule #{@re.inspect}>"
@@ -99,15 +91,29 @@
# {RegexLexer#token}, and {RegexLexer#delegate}. The first
# argument can be used to access the match groups.
# Appends a Rule for `re` to `rules`. When no block is passed, a default
# callback is selected from `next_state` (one branch per case below).
#
# NOTE(review): the added procs call @output_stream directly instead of
# going through #token / #yield_token, so the nil/empty-value guard in
# #yield_token is not applied on these paths -- confirm that is intended.
def rule(re, tok=nil, next_state=nil, &callback)
callback ||= case next_state
when :pop!
# Emit `tok` for the whole match, then pop one state off the stack.
- proc { token tok; pop! }
+ proc do |stream|
+ puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug
+ @output_stream.call(tok, stream[0])
# NOTE(review): `#{1}` interpolates the literal 1 (a single pop follows).
+ puts " popping stack: #{1}" if @debug
+ @stack.pop or raise 'empty stack!'
+ end
when Symbol
# Emit `tok` for the whole match, then push the state named `next_state`.
- proc { token tok; push next_state }
+ proc do |stream|
+ puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug
+ @output_stream.call(tok, stream[0])
# Try the instance's @states table first; fall back to the class lookup.
+ state = @states[next_state] || self.class.get_state(next_state)
+ puts " pushing #{state.name}" if @debug
+ @stack.push(state)
+ end
else
# No state transition requested: only emit `tok` for the whole match.
- proc { token tok }
+ proc do |stream|
+ puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug
+ @output_stream.call(tok, stream[0])
+ end
end
rules << Rule.new(re, callback)
end
@@ -178,14 +184,12 @@
# @private
# Resolves `name` to a State, building it from its definition on first use
# and caching the result in `states`. Raises "unknown state: ..." when no
# definition is registered under the name.
def self.get_state(name)
# A State instance is returned unchanged; no lookup is needed.
return name if name.is_a? State
- name = name.to_s
-
- states[name] ||= begin
- defn = state_definitions[name] or raise "unknown state: #{name.inspect}"
# New side: the cache is keyed by Symbol, while `state_definitions` is
# still read with the String form of the name.
+ states[name.to_sym] ||= begin
+ defn = state_definitions[name.to_s] or raise "unknown state: #{name.inspect}"
defn.to_state(self)
end
end
# @private
@@ -233,107 +237,99 @@
# @see #step #step (where (2.) is implemented)
# Scans `str` from start to end, calling #step repeatedly until the scanner
# reaches end-of-string. Tokens are delivered to the block `b`.
def stream_tokens(str, &b)
stream = StringScanner.new(str)
@current_stream = stream
# New side: per-run state is set up front. The block itself becomes the
# output stream, the class-level states table is cached on the instance,
# and the zero-width-match counter used by #step starts at zero.
+ @output_stream = b
+ @states = self.class.states
+ @null_steps = 0
until stream.eos?
- debug { "lexer: #{self.class.tag}" }
- debug { "stack: #{stack.map(&:name).inspect}" }
- debug { "stream: #{stream.peek(20).inspect}" }
- success = step(get_state(state), stream, &b)
+ if @debug
+ puts "lexer: #{self.class.tag}"
+ puts "stack: #{stack.map(&:name).inspect}"
+ puts "stream: #{stream.peek(20).inspect}"
+ end
+ success = step(state, stream)
+
# No rule matched: consume exactly one character and emit it as an Error
# token so the loop always makes progress.
if !success
- debug { " no match, yielding Error" }
+ puts " no match, yielding Error" if @debug
b.call(Token::Tokens::Error, stream.getch)
end
end
end
+ # The number of successive scans permitted without consuming
+ # the input stream. If this is exceeded, the match fails.
+ MAX_NULL_SCANS = 5
+
# Runs one step of the lex. Rules in the current state are tried
# until one matches, at which point its callback is called.
#
# @return true if a rule was tried successfully
# @return false otherwise.
- def step(state, stream, &b)
+ def step(state, stream)
# Walk the state's entries in order; the first one that matches wins.
state.rules.each do |rule|
- case rule
- when State
- debug { " entering mixin #{rule.name}" }
- return true if step(rule, stream, &b)
- debug { " exiting mixin #{rule.name}" }
- when Rule
- debug { " trying #{rule.inspect}" }
# An entry that is itself a State is a mixin: recurse into its rules and
# stop as soon as one of them succeeds.
+ if rule.is_a?(State)
+ puts " entering mixin #{rule.name}" if @debug
+ return true if step(rule, stream)
+ puts " exiting mixin #{rule.name}" if @debug
+ else
+ puts " trying #{rule.inspect}" if @debug
- if run_rule(rule, stream)
- debug { " got #{stream[0].inspect}" }
+ # XXX HACK XXX
+ # StringScanner's implementation of ^ is b0rken.
+ # see http://bugs.ruby-lang.org/issues/7092
+ # TODO: this doesn't cover cases like /(a|^b)/, but it's
+ # the most common, for now...
+ next if rule.beginning_of_line && !stream.beginning_of_line?
- run_callback(stream, rule.callback, &b)
# `skip` returns the match length, or nil when the regex does not match at
# the current position; the callback runs with the lexer instance as self.
+ if size = stream.skip(rule.re)
+ puts " got #{stream[0].inspect}" if @debug
+ instance_exec(stream, &rule.callback)
+
# Count consecutive zero-width matches; any consuming match resets the count.
# NOTE(review): the callback above has already run by the time the limit is
# checked, and @null_steps is not reset on the `return false` path -- confirm
# both are intended.
+ if size.zero?
+ @null_steps += 1
+ if @null_steps > MAX_NULL_SCANS
+ puts " too many scans without consuming the string!" if @debug
+ return false
+ end
+ else
+ @null_steps = 0
+ end
+
return true
end
end
end
# Nothing in this state (or its mixins) matched.
false
end
- # @private
- def run_callback(stream, callback, &output_stream)
- with_output_stream(output_stream) do
- @group_count = 0
- instance_exec(stream, &callback)
- end
- end
-
- # The number of successive scans permitted without consuming
- # the input stream. If this is exceeded, the match fails.
- MAX_NULL_SCANS = 5
-
- # @private
- def run_rule(rule, scanner)
- # XXX HACK XXX
- # StringScanner's implementation of ^ is b0rken.
- # see http://bugs.ruby-lang.org/issues/7092
- # TODO: this doesn't cover cases like /(a|^b)/, but it's
- # the most common, for now...
- return false if rule.beginning_of_line? && !scanner.beginning_of_line?
-
- if (@null_steps ||= 0) >= MAX_NULL_SCANS
- debug { " too many scans without consuming the string!" }
- return false
- end
-
- scanner.scan(rule.re) or return false
-
- if scanner.matched_size.zero?
- @null_steps += 1
- else
- @null_steps = 0
- end
-
- true
- end
-
# Yield a token.
#
# @param tok
# the token type
# @param val
# (optional) the string value to yield. If absent, this defaults
# to the entire last match.
- def token(tok, val=:__absent__)
- val = @current_stream[0] if val == :__absent__
# New side: the default expression reads the whole last match from
# @current_stream when the caller omits `val`, replacing the old
# :__absent__ sentinel.
+ def token(tok, val=@current_stream[0])
# Delegates to yield_token for the actual emit.
yield_token(tok, val)
end
+ # @deprecated
+ #
# Yield a token with the next matched group. Subsequent calls
# to this method will yield subsequent groups.
def group(tok)
# Old side: yielded the next match group, advancing @group_count.
- yield_token(tok, @current_stream[@group_count += 1])
# New side: unconditionally raises, directing callers to #groups.
+ raise "RegexLexer#group is deprecated: use #groups instead"
end
+ # Yield tokens corresponding to the matched groups of the current
+ # match.
def groups(*tokens)
  # Pair each token type with its capture group: the first token goes with
  # group 1, the second with group 2, and so on. Group 0 (the whole match)
  # is never used here.
  tokens.each.with_index(1) do |token_type, group_index|
    yield_token(token_type, @current_stream[group_index])
  end
end
@@ -346,15 +342,15 @@
# @param [#lex] lexer
# The lexer or lexer class to delegate to
# @param [String] text
# The text to delegate. This defaults to the last matched string.
def delegate(lexer, text=nil)
- debug { " delegating to #{lexer.inspect}" }
+ puts " delegating to #{lexer.inspect}" if @debug
# With no explicit text, hand the whole last match to the other lexer.
text ||= @current_stream[0]
# Every token the delegate produces is re-emitted through yield_token.
lexer.lex(text, :continue => true) do |tok, val|
- debug { " delegated token: #{tok.inspect}, #{val.inspect}" }
+ puts " delegated token: #{tok.inspect}, #{val.inspect}" if @debug
yield_token(tok, val)
end
end
def recurse(text=nil)
@@ -372,35 +368,37 @@
else
# use the top of the stack by default
self.state
end
- debug { " pushing #{push_state.name}" }
+ puts " pushing #{push_state.name}" if @debug
stack.push(push_state)
end
# Pop the state stack. If a number is passed in, it will be popped
# that number of times.
def pop!(times=1)
# Refuse to pop when there is nothing on the stack.
raise 'empty stack!' if stack.empty?
- debug { " popping stack: #{times}" }
+ puts " popping stack: #{times}" if @debug
stack.pop(times)
# Always return nil rather than the popped states.
nil
end
# replace the head of the stack with the given state
def goto(state_name)
# There must be a current state to replace.
raise 'empty stack!' if stack.empty?
+
+ puts " going to state #{state_name} " if @debug
# Overwrite the top entry in place; the stack depth does not change.
stack[-1] = get_state(state_name)
end
# reset the stack back to `[:root]`.
def reset_stack
- debug { ' resetting stack' }
+ puts ' resetting stack' if @debug
# Empty the stack, then seed it with the :root state.
stack.clear
stack.push get_state(:root)
end
# Check if `state_name` is in the state stack.
@@ -415,22 +413,9 @@
def state?(state_name)
  # Compare by name: the argument is stringified so that Symbols and
  # Strings are both accepted.
  wanted = state_name.to_s
  current_name = state.name
  wanted == current_name
end
private
- def with_output_stream(output_stream, &b)
- old_output_stream = @output_stream
- @output_stream = Enumerator::Yielder.new do |tok, val|
- debug { " yielding #{tok.qualname}, #{val.inspect}" }
- output_stream.call(tok, val)
- end
-
- yield
-
- ensure
- @output_stream = old_output_stream
- end
-
def yield_token(tok, val)
  # Values that are nil or empty are dropped silently; everything else is
  # forwarded to the current output stream.
  has_text = !val.nil? && !val.empty?
  @output_stream.yield(tok, val) if has_text
end
end