lib/rouge/lexer.rb in rouge-0.0.2 vs lib/rouge/lexer.rb in rouge-0.0.3
- old
+ new
@@ -1,57 +1,97 @@
+# stdlib
+require 'strscan'
+
module Rouge
class Lexer
class << self
# Factory entry point. 0.0.3 renames `create` to `make` and changes what it returns.
# Removed `create`: built an instance from opts and the block, then invoked
# force_load! on that instance through send.
- def create(opts={}, &b)
- new(opts, &b).send(:force_load!)
# Added `make`: returns an anonymous subclass of the receiver (Class.new(self)).
#   opts - merged over the receiver's default_options to form the subclass's @default_options
#   b    - stored on the subclass as @lazy_load_proc
+ def make(opts={}, &b)
# Keep the receiver in a local so the block below can still reach it.
+ _sup = self
+
+ Class.new(self) do
+ @lazy_load_proc = b
+ @default_options = _sup.default_options.merge(opts)
+ @parent = _sup
+ end
end
# Added in 0.0.3: class-level shortcut. Instantiates the lexer with opts and
# forwards stream and the block to the instance-level lex.
+ def lex(stream, opts={}, &b)
+ new(opts).lex(stream, &b)
+ end
+
+ protected
# Added in 0.0.3 (class level): runs @lazy_load_proc, when one is set, via
# instance_eval on the receiver. The @force_load flag makes this happen at most
# once per receiver. Always returns self.
+ def force_load!
+ return self if @force_load
+ @force_load = true
+ @lazy_load_proc && instance_eval(&@lazy_load_proc)
+
+ self
+ end
+ public
+
# Added in 0.0.3: ensures the class has been force-loaded before any instance
# is built, then defers to the default constructor with the same arguments.
+ def new(*a, &b)
+ force_load!
+ super(*a, &b)
+ end
+
# Added in 0.0.3: class-level default options, memoized in @default_options
# and initialised to an empty hash on first access.
+ def default_options
+ @default_options ||= {}
+ end
+
# Returns whatever is stored in the registry under the string form of name
# (nil when nothing is stored there). Unchanged between the two versions.
def find(name)
registry[name.to_s]
end
# Stores lexer in the registry under the string form of name, replacing any
# previous entry for that key. Unchanged between the two versions.
def register(name, lexer)
registry[name.to_s] = lexer
end
# Added in 0.0.3: class-level reader/writer for the lexer's tag.
# Called without an argument it returns the current @tag; called with one it
# stores the string form and registers the class under that tag via aliases.
+ def tag(t=nil)
+ return @tag if t.nil?
+
+ @tag = t.to_s
+ aliases @tag
+ end
+
# Added in 0.0.3 (class level): registers the receiving class under each given
# name. Registration always goes through Lexer, regardless of the receiver.
+ def aliases(*args)
+ args.each { |arg| Lexer.register(arg, self) }
+ end
+
# Added in 0.0.3: maps each given extension to the receiving class in
# Lexer.extension_registry. Keys are stored exactly as passed.
+ def extensions(*exts)
+ exts.each do |ext|
+ Lexer.extension_registry[ext] = self
+ end
+ end
+
# Added in 0.0.3: memoized hash holding the extension => lexer mappings.
+ def extension_registry
+ @extension_registry ||= {}
+ end
+
private
# Memoized hash backing find/register. Unchanged between the two versions.
def registry
@registry ||= {}
end
end
# Removed in 0.0.3: the instance-level name/aliases pair. name read or set
# @name and registered the instance under it; aliases registered the instance
# under each given name. The same diff adds class-level tag/aliases inside
# `class << self`. The only added line in this span is the section comment.
- def name(n=nil)
- return @name if n.nil?
+ # -*- instance methods -*- #
- @name = n.to_s
- aliases @name
- end
-
- def aliases(*args)
- args.each { |arg| Lexer.register(arg, self) }
- end
-
# Builds a lexer instance: merges opts into its options and remembers the block.
# The only change in 0.0.3 is the added parentheses on the options call.
# NOTE(review): within the visible lines nothing reads the instance-level
# @lazy_load_proc once 0.0.3 removes the instance-level force_load! — confirm
# whether storing the block here is still needed.
def initialize(opts={}, &b)
- options opts
+ options(opts)
@lazy_load_proc = b
end
# Removed in 0.0.3: instance-level default_options, which returned a fresh empty
# hash on each call. The same diff adds a memoized class-level default_options.
- def default_options
- {}
- end
-
# Merges o into this instance's @options in place and returns the effective options.
# 0.0.2: @options was seeded from default_options and returned directly.
# 0.0.3: @options starts empty and holds only instance-level settings; the return
# value is a new hash of the class-level defaults overlaid with @options.
def options(o={})
- (@options ||= default_options).merge!(o)
+ (@options ||= {}).merge!(o)
- @options
+ self.class.default_options.merge(@options)
end
# Reads or writes a single option. :absent is the sentinel for "no value given":
# with one argument the option is looked up, with two it is set through options.
# 0.0.3 stops converting the key to a string, so keys are used exactly as passed.
def option(k, v=:absent)
if v == :absent
- options[k.to_s]
+ options[k]
else
- options({ k.to_s => v })
+ options({ k => v })
end
end
def debug(&b)
puts(b.call) if option :debug
@@ -59,177 +99,274 @@
# Lexes stream and collects everything lex produces into an array.
def get_tokens(stream)
lex(stream).to_a
end
# Lexes the input and calls the block once per emitted token/value pair.
# Without a block, returns an enumerator over the same call.
# 0.0.2: handed the stream straight to stream_tokens.
# 0.0.3: wraps the string in a StringScanner, drops pairs whose value is empty,
# and merges consecutive pairs that carry the same token before emitting them.
- def lex(stream, &b)
- return enum_for(:lex, stream) unless block_given?
+ def lex(string, &b)
+ return enum_for(:lex, string) unless block_given?
- stream_tokens(stream, &b)
# The pending pair; it is only emitted once a different token shows up.
+ last_token = nil
+ last_val = nil
+ stream_tokens(StringScanner.new(string)) do |tok, val|
+ next if val.empty?
+
# Same token as the pending pair: append to the pending value in place.
+ if tok == last_token
+ last_val << val
+ next
+ end
+
# Token changed: emit the pending pair (if any), then start a new one.
+ b.call(last_token, last_val) if last_token
+ last_token = tok
+ last_val = val
+ end
+
# Flush the last pending pair after the input has been consumed.
+ b.call(last_token, last_val) if last_token
end
# Abstract hook: always raises here. RegexLexer supplies its own stream_tokens.
def stream_tokens(stream, &b)
raise 'abstract'
end
-
- protected
-
# Removed in 0.0.3: instance-level force_load!, which evaluated @lazy_load_proc
# on the instance once and returned self. It passed the proc to instance_eval
# without a nil guard. The same diff adds a guarded class-level force_load!.
- def force_load!
- return self if @force_load
- @force_load = true
- instance_eval &@lazy_load_proc
-
- self
- end
end
class RegexLexer < Lexer
# A single lexing rule: a regexp, the callback to run when it matches, and a
# follow-up target. 0.0.3 renames next_lexer to next_state and stores the regexp
# exactly as given instead of rebuilding it with a leading \A anchor.
class Rule
attr_reader :callback
- attr_reader :next_lexer
+ attr_reader :next_state
attr_reader :re
- def initialize(re, callback, next_lexer)
- @orig_re = re
- @re = Regexp.new %/\\A(?:#{re.source})/
+ def initialize(re, callback, next_state)
+ @re = re
@callback = callback
- @next_lexer = next_lexer
+ @next_state = next_state
end
# Debug representation showing the rule's regexp as originally supplied.
def inspect
- "#<Rule #{@orig_re.inspect}>"
+ "#<Rule #{@re.inspect}>"
end
# Tries the rule against stream. Yields and returns true on a match, returns
# false otherwise.
# 0.0.2: matched against the stream, sliced the matched text off its front,
# and yielded the match result.
# 0.0.3: calls scan/matched? on the stream and yields the stream itself
# (presumably a StringScanner, given the added `require 'strscan'`).
def consume(stream, &b)
- # TODO: I'm sure there is a much faster way of doing this.
- # also, encapsulate the stream in its own class.
- match = stream.match(@re)
+ stream.scan(@re)
- if match
- stream.slice!(0...$&.size)
- yield match
+ if stream.matched?
+ yield stream
return true
end
false
end
end
# This span replaces RegexLexer#initialize with the new State class. An identical
# initialize definition is added again further down in the same diff.
# State (added in 0.0.3): a named state of a lexer class whose rules are built
# lazily from the definition block.
- def initialize(parent=nil, opts={}, &defn)
- if parent.is_a? Hash
- opts = parent
- parent = nil
+ class State
+ attr_reader :name
#   lexer_class - the class used to resolve other states by name
#   name        - this state's name
#   defn        - block evaluated later by load! to populate the rules
+ def initialize(lexer_class, name, &defn)
+ @lexer_class = lexer_class
+ @name = name
+ @defn = defn
end
- @parent = parent
- super(opts, &defn)
# Resolves another state by name through the owning lexer class.
+ def relative_state(state_name)
+ @lexer_class.get_state(state_name)
+ end
+
# Memoized rule list; starts empty and is filled by load!.
+ def rules
+ @rules ||= []
+ end
+
# Evaluates the definition block against a StateDSL wrapping the rule list.
# Runs at most once (guarded by @loaded) and returns self.
+ def load!
+ return self if @loaded
+ @loaded = true
+ StateDSL.new(rules).instance_eval(&@defn)
+ self
+ end
end
# This span replaces the `lexer` method with the new ScanState class.
# Removed `lexer`: with a block it created and stored a nested RegexLexer under
# the given name; without one it looked the name up in @scope, then in @parent.
# ScanState (added in 0.0.3): the mutable state of one lexing run — the lexer,
# the scanner over the input, and the stack of active states. Rule callbacks
# are executed against an instance of this class (see run_callback).
- def lexer(name, opts={}, &defn)
- @scope ||= {}
- name = name.to_s
+ class ScanState
# Class-level macro: defines instance method m that forwards its arguments and
# block to the object returned by calling `target` on the instance.
# An instance method that is also named delegate is defined further down.
+ def self.delegate(m, target)
+ define_method(m) do |*a, &b|
+ send(target).send(m, *a, &b)
+ end
+ end
- if block_given?
- l = @scope[name] = RegexLexer.new(self, options.merge(opts), &defn)
- l.instance_variable_set :@name, name
- l
- else
- @scope[name] || @parent && @parent.lexer(name)
+ attr_accessor :scanner
+ attr_accessor :stack
+ attr_accessor :lexer
# When no stack is supplied, it starts with the lexer's :root state.
+ def initialize(lexer, scanner, stack=nil)
+ @lexer = lexer
+ @scanner = scanner
+ @stack = stack || [lexer.get_state(:root)]
end
+
# Removes and returns the top state; raises if the stack is already empty.
+ def pop!
+ raise 'empty stack!' if stack.empty?
+
+ debug { " popping stack" }
+ stack.pop
+ end
+
# Resolves state_name through the current state and pushes the result.
+ def push(state_name)
+ debug { " pushing #{state_name}" }
+ stack.push(state.relative_state(state_name))
+ end
+
# debug is forwarded to the lexer; match accessors are forwarded to the scanner.
+ delegate :debug, :lexer
+
+ delegate :[], :scanner
+ delegate :captures, :scanner
+ delegate :peek, :scanner
+ delegate :eos?, :scanner
+
# Returns an Enumerator; iterating it runs callback with this object as both
# self and block argument, so token/group calls made by the callback feed the
# enumerator's output. The group counter restarts at 0 for each run.
+ def run_callback(&callback)
+ Enumerator.new do |y|
+ @output_stream = y
+ @group_count = 0
+ instance_exec(self, &callback)
+ @output_stream = nil
+ end
+ end
+
# Emits [Token[tok], val]; val defaults to the whole current match.
# Raises when called while no run_callback enumerator is being iterated.
+ def token(tok, val=nil)
+ raise 'no output stream' unless @output_stream
+
+ @output_stream << [Token[tok], val || scanner[0]]
+ end
+
# Emits the next capture group under tok: the counter is incremented before
# it is used, so the first call in a callback emits group 1.
+ def group(tok)
+ token(tok, scanner[@group_count += 1])
+ end
+
# Lexes text (default: the whole current match) with another lexer and
# re-emits each pair it produces through token.
+ def delegate(lexer, text=nil)
+ debug { " delegating to #{lexer.name}" }
+ text ||= scanner[0]
+
+ lexer.lex(text) do |tok, val|
+ debug { " delegated token: #{tok.inspect}, #{val.inspect}" }
+ token(tok, val)
+ end
+ end
+
# The state on top of the stack; raises if the stack is empty.
+ def state
+ raise 'empty stack!' if stack.empty?
+ stack.last
+ end
+
# Scans re on the scanner. On a match yields this object and returns true;
# otherwise returns false.
+ def scan(re, &b)
+ scanner.scan(re)
+
+ if scanner.matched?
+ yield self
+ return true
+ end
+
+ return false
+ end
end
# This span replaces the lexer-level `mixin` with the new StateDSL class.
# Removed `mixin`: resolved the lexer, force-loaded it, and appended the lexer
# object itself to the rules.
# StateDSL (added in 0.0.3): receiver for a state's definition block; it
# collects rules and mixin references into the array it is given.
- def mixin(lexer)
- lexer = get_lexer(lexer)
- lexer.force_load!
+ class StateDSL
+ attr_reader :rules
+ def initialize(rules)
+ @rules = rules
+ end
- rules << lexer
# Appends a Rule for re. Two calling forms:
#   rule(re, next_state) { ... } - the block is the callback; the second
#                                  positional argument is taken as next_state
#   rule(re, tok, next_state)    - a callback is generated that emits the whole
#                                  match as tok, then pops the stack for :pop!
#                                  or pushes the state named by any other symbol
# The generated callback calls token/pop!/push without a receiver; ScanState
# defines those and runs callbacks through instance_exec.
+ def rule(re, tok=nil, next_state=nil, &callback)
+ if block_given?
+ next_state = tok
+ else
+ tok = Token[tok]
+
+ callback = proc do |ss|
+ token tok, ss[0]
+ case next_state
+ when :pop!
+ pop!
+ when Symbol
+ push next_state
+ end # else pass
+ end
+ end
+
+ rules << Rule.new(re, callback, next_state)
+ end
+
# Appends the state name as a string; run_rule treats String entries as mixins.
+ def mixin(lexer_name)
+ rules << lexer_name.to_s
+ end
end
# Removed: instance-level `rules`, which force-loaded the lexer and returned a
# memoized array.
# Added: class-level `states`, a memoized hash (filled by self.state, keyed by
# state name).
- def rules
- force_load!
- @rules ||= []
+ def self.states
+ @states ||= {}
end
# Removed: lexer-level `rule`. It wrapped String tokens with Token[], built a
# callback that passed token and match to the block it was called with, and
# resolved next_lexer through get_lexer before appending the Rule.
- def rule(re, token=nil, next_lexer=nil, &callback)
- if block_given?
- next_lexer = token
- else
- if token.is_a? String
- token = Token[token]
- end
# Added: defines a state on the lexer class. The name is stringified and the
# block is kept by the State for later evaluation.
+ def self.state(name, &b)
+ name = name.to_s
+ states[name] = State.new(self, name, &b)
+ end
- callback = proc { |match, &b| b.call token, match }
# Added (same body as the initialize removed earlier in this diff): accepts
# either (parent, opts) or just an options hash as the first argument.
+ def initialize(parent=nil, opts={}, &defn)
+ if parent.is_a? Hash
+ opts = parent
+ parent = nil
end
- rules << Rule.new(re, callback, get_lexer(next_lexer))
+ @parent = parent
+ super(opts, &defn)
end
# Removed: the 0.0.2 stream_tokens, which duplicated the stream and started
# stream_with_stack with the lexer itself as the only stack entry.
# Added: class-level get_state. A State object is returned untouched; anything
# else is looked up by its string form in states, raising for unknown names,
# and the found state is loaded before being returned.
- def stream_tokens(stream, &b)
- stream = stream.dup
- stack = [self]
+ def self.get_state(name)
+ return name if name.is_a? State
- stream_with_stack(stream.dup, [self], &b)
+ state = states[name.to_s]
+ raise "unknown state: #{name}" unless state
+ state.load!
end
# This span replaces stream_with_stack (0.0.2) with four 0.0.3 definitions:
# self.[], the instance-level get_state, stream_tokens and stream_with_state.
- def stream_with_stack(stream, stack, &b)
- return true if stream.empty?
# Class-level shorthand for get_state.
+ def self.[](name)
+ get_state(name)
+ end
- until stream.empty?
- debug { "stack: #{stack.map(&:name).inspect}" }
- debug { "parsing #{stream.slice(0..20).inspect}" }
- success = stack.last.step(stream, stack, &b)
# Instance-level convenience that resolves states through the class.
+ def get_state(name)
+ self.class.get_state(name)
+ end
# Entry point used by lex: wraps the stream in a fresh ScanState and runs the
# main loop over it.
+ def stream_tokens(stream, &b)
+ scan_state = ScanState.new(self, stream)
+
+ stream_with_state(scan_state, &b)
+ end
+
# Main loop: until the scanner is exhausted, try the rules of the state on top
# of the stack. When nothing matches, one character is consumed and passed to
# the block as an 'Error' token.
+ def stream_with_state(scan_state, &b)
+ until scan_state.eos?
+ debug { "stack: #{scan_state.stack.map(&:name).inspect}" }
+ debug { "stream: #{scan_state.scanner.peek(20).inspect}" }
+ success = step(get_state(scan_state.state), scan_state, &b)
+
if !success
debug { " no match, yielding Error" }
- b.call(Token['Error'], stream.slice!(0..0))
+ b.call(Token['Error'], scan_state.scanner.getch)
end
end
end
# Tries each rule in order and returns true at the first one that run_rule
# reports as successful, false when none does.
# 0.0.2 iterated the lexer's own rules; 0.0.3 iterates the rules of the state
# passed in and threads the ScanState through instead of stream and stack.
- def step(stream, stack, &b)
- rules.each do |rule|
- return true if run_rule(rule, stream, stack, &b)
+ def step(state, scan_state, &b)
+ state.rules.each do |rule|
+ return true if run_rule(rule, scan_state, &b)
end
false
end
private
# Removed in 0.0.3: get_lexer returned RegexLexer instances and :pop! unchanged
# and resolved anything else through `lexer`. The new code resolves states
# through get_state instead.
- def get_lexer(o)
- case o
- when RegexLexer, :pop!
- o
- else
- lexer o
- end
- end
-
# Applies one entry of a state's rule list.
#   String entry - a mixin: the named state's rules are tried in place via step
#   Rule entry   - the regexp is scanned; on a match the rule's callback runs
#                  and every pair it emits is passed to the block
# 0.0.2 handled stack pushes and pops here after the callback, based on
# rule.next_lexer. In 0.0.3 that is no longer done in this method; the callbacks
# generated by StateDSL#rule call push/pop! themselves.
- def run_rule(rule, stream, stack, &b)
+ def run_rule(rule, scan_state, &b)
case rule
- when String, RegexLexer
- lexer = get_lexer(rule)
- debug { " entering mixin #{lexer.name}" }
- get_lexer(rule).step(stream, stack, &b)
+ when String
+ debug { " entering mixin #{rule}" }
+ step(get_state(rule), scan_state, &b)
when Rule
debug { " trying #{rule.inspect}" }
- rule.consume(stream) do |match|
# ScanState#scan yields the scan state itself, so match[0] below is forwarded
# to the scanner.
+ scan_state.scan(rule.re) do |match|
debug { " got #{match[0].inspect}" }
- rule.callback.call(*match) do |tok, res|
- if tok.is_a? String
- tok = Token[tok]
- end
-
- debug { " yielding #{tok.name.inspect}, #{res.inspect}" }
- b.call(tok, res)
- end
-
- if rule.next_lexer == :pop!
- debug { " popping stack" }
- stack.pop
- elsif rule.next_lexer
- lexer = get_lexer(rule.next_lexer)
- debug { " entering #{lexer.name}" }
- stack.push lexer
+ scan_state.run_callback(&rule.callback).each do |tok, res|
+ debug { " yielding #{tok.to_s.inspect}, #{res.inspect}" }
+ b.call(Token[tok], res)
end
end
end
end