# stdlib
require 'strscan'

module Rouge
  # Abstract base class for all lexers.
  #
  # A concrete lexer subclasses this, registers itself with {.tag} /
  # {.aliases}, declares {.filenames} / {.mimetypes} for guessing, and
  # implements {#stream_tokens} (and optionally {.analyze_text}).
  class Lexer
    class << self
      # Lexes `stream` with the given options. The lex is delegated to a
      # new instance.
      #
      # @see #lex
      def lex(stream, opts={}, &b)
        new(opts).lex(stream, &b)
      end

      # Merge `o` into the class-level default options and return the
      # accumulated defaults hash.
      def default_options(o={})
        @default_options ||= {}
        @default_options.merge!(o)
        @default_options
      end

      # Given a string, return the correct lexer class.
      def find(name)
        registry[name.to_s]
      end

      # Guess which lexer to use based on a hash of info.
      #
      # @option info :mimetype
      #   A mimetype to guess by
      # @option info :filename
      #   A filename to guess by
      # @option info :source
      #   The source itself, which, if guessing by mimetype or filename
      #   fails, will be searched for shebangs, doctype declarations, and
      #   other hints.
      #
      # @see Lexer.analyze_text
      def guess(info={})
        by_mimetype = guess_by_mimetype(info[:mimetype]) if info[:mimetype]
        return by_mimetype if by_mimetype

        by_filename = guess_by_filename(info[:filename]) if info[:filename]
        return by_filename if by_filename

        by_source = guess_by_source(info[:source]) if info[:source]
        return by_source if by_source

        # guessing failed, just parse it as text
        Lexers::Text
      end

      # Find the first registered lexer that declares the given mimetype.
      def guess_by_mimetype(mt)
        registry.values.detect do |lexer|
          lexer.mimetypes.include? mt
        end
      end

      # Find the first registered lexer whose filename globs match the
      # basename of `fname`.
      def guess_by_filename(fname)
        fname = File.basename(fname)
        registry.values.detect do |lexer|
          lexer.filenames.any? do |pattern|
            File.fnmatch?(pattern, fname)
          end
        end
      end

      # Ask every registered lexer to score the source text and return
      # the highest-scoring one. A perfect score of 1 short-circuits the
      # search; lexers that return nil are treated as scoring 0.
      def guess_by_source(source)
        source = TextAnalyzer.new(source)

        best_result = 0
        best_match = nil
        registry.values.each do |lexer|
          result = lexer.analyze_text(source) || 0
          return lexer if result == 1

          if result > best_result
            best_match = lexer
            best_result = result
          end
        end

        best_match
      end

      # Register a lexer class under the given name so {.find} can
      # locate it.
      def register(name, lexer)
        registry[name.to_s] = lexer
      end

      # Used to specify or get the canonical name of this lexer class.
      #
      # @example
      #   class MyLexer < Lexer
      #     tag 'foo'
      #   end
      #
      #   MyLexer.tag # => 'foo'
      #
      #   Lexer.find('foo') # => MyLexer
      def tag(t=nil)
        return @tag if t.nil?

        @tag = t.to_s
        aliases @tag
      end

      # Used to specify alternate names this lexer class may be found by.
      #
      # @example
      #   class Erb < Lexer
      #     tag 'erb'
      #     aliases 'eruby', 'rhtml'
      #   end
      #
      #   Lexer.find('eruby') # => Erb
      def aliases(*args)
        args.each { |arg| Lexer.register(arg, self) }
      end

      # Specify a list of filename globs associated with this lexer.
      #
      # @example
      #   class Ruby < Lexer
      #     filenames '*.rb', '*.ruby', 'Gemfile', 'Rakefile'
      #   end
      def filenames(*fnames)
        (@filenames ||= []).concat(fnames)
      end

      # Specify a list of mimetypes associated with this lexer.
      #
      # @example
      #   class Html < Lexer
      #     mimetypes 'text/html', 'application/xhtml+xml'
      #   end
      def mimetypes(*mts)
        (@mimetypes ||= []).concat(mts)
      end

    private
      # The shared name => lexer-class table. Lives on Lexer itself
      # because subclasses register through Lexer.register.
      def registry
        @registry ||= {}
      end
    end

    # -*- instance methods -*- #

    def initialize(opts={})
      options(opts)
    end

    # Merge `o` into this instance's options, then return the effective
    # options: class defaults overridden by instance options.
    def options(o={})
      (@options ||= {}).merge!(o)

      self.class.default_options.merge(@options)
    end

    # One argument: read option `k`. Two arguments: set option `k` to
    # `v` (the sentinel :absent distinguishes "not given" from nil).
    def option(k, v=:absent)
      if v == :absent
        options[k]
      else
        options({ k => v })
      end
    end

    # Leave a debug message if the `:debug` option is set. The message
    # is given as a block because some debug messages contain calculated
    # information that is unnecessary for lexing in the real world.
    #
    # @example
    #   debug { "hello, world!" }
    def debug(&b)
      puts(b.call) if option :debug
    end

    # @abstract
    #
    # Called after each lex is finished. The default implementation
    # is a noop.
    def reset!
    end

    # Given a string, yield [token, chunk] pairs. Consecutive chunks
    # with the same token are coalesced into one yield. If no block is
    # given, an enumerator is returned.
    #
    # @option opts :continue
    #   Continue the lex from the previous state (i.e. don't call #reset!)
    def lex(string, opts={}, &b)
      # Forward opts so the enumerator form still honors e.g. :continue.
      return enum_for(:lex, string, opts) unless block_given?

      reset! unless opts[:continue]

      last_token = nil
      last_val = nil
      stream_tokens(StringScanner.new(string)) do |tok, val|
        next if val.empty?

        if tok == last_token
          last_val << val
          next
        end

        b.call(last_token, last_val) if last_token
        last_token = tok
        # dup so coalescing (<<) never mutates a string the subclass's
        # stream_tokens may still hold a reference to
        last_val = val.dup
      end

      b.call(last_token, last_val) if last_token
    end

    # @abstract
    #
    # Yield [token, chunk] pairs, given a prepared input stream. This
    # must be implemented.
    #
    # @param [StringScanner] stream
    #   the stream
    def stream_tokens(stream, &b)
      raise 'abstract'
    end

    # @abstract
    #
    # return a number between 0 and 1 indicating the likelihood that
    # the text given should be lexed with this lexer. The default
    # implementation returns 0.
    #
    # @param [TextAnalyzer] text
    #   the text to be analyzed, with a couple of handy methods on it,
    #   like {TextAnalyzer#shebang?} and {TextAnalyzer#doctype?}
    def self.analyze_text(text)
      0
    end
  end
end