# stdlib
require 'strscan'
require 'cgi'
require 'pathname'

module Rouge
  # @abstract
  # A lexer transforms text into a stream of `[token, chunk]` pairs.
  class Lexer
    include Token::Tokens

    class << self
      # Lexes `stream` with the given options.  The lex is delegated to a
      # new instance.
      #
      # @see #lex
      def lex(stream, opts={}, &b)
        new(opts).lex(stream, &b)
      end

      # Get the class-level default options, optionally merging `o` into
      # them first.  The merge mutates the stored hash, so it persists
      # for later calls on the same class.
      def default_options(o={})
        @default_options ||= {}
        @default_options.merge!(o)
        @default_options
      end

      # Given a string, return the correct lexer class, or nil if no
      # lexer is registered under that name.
      def find(name)
        registry[name.to_s]
      end

      # Find a lexer, with fancy shiny features.
      #
      # * The string you pass can include CGI-style options
      #
      #     Lexer.find_fancy('erb?parent=tex')
      #
      # * You can pass the special name 'guess' so we guess for you,
      #   and you can pass a second argument of the code to guess by
      #
      #     Lexer.find_fancy('guess', "#!/bin/bash\necho Hello, world")
      #
      # This is used in the Redcarpet plugin as well as Rouge's own
      # markdown lexer for highlighting internal code blocks.
      #
      # @return a lexer *instance* built with the parsed options, or nil
      #   if no lexer class could be found for the name.
      def find_fancy(str, code=nil)
        name, opts = str ? str.split('?', 2) : [nil, '']

        # parse the options hash from a cgi-style string; an option that
        # parses to an empty value list is treated as a boolean flag
        opts = CGI.parse(opts || '').map do |k, vals|
          [ k.to_sym, vals.empty? ? true : vals[0] ]
        end

        opts = Hash[opts]

        lexer_class = case name
        when 'guess', nil
          self.guess(:source => code, :mimetype => opts[:mimetype])
        when String
          self.find(name)
        end

        lexer_class && lexer_class.new(opts)
      end

      # Specify or get this lexer's description.
      def desc(arg=:absent)
        if arg == :absent
          @desc
        else
          @desc = arg
        end
      end

      # Specify or get the path name containing a small demo for
      # this lexer (can be overridden by {demo}).
      #
      # When read without ever having been specified, this defaults to
      # `demos/<tag>` next to this file.
      def demo_file(arg=:absent)
        return @demo_file = Pathname.new(arg) unless arg == :absent

        # `||=` so that reading does not clobber an explicitly set path
        @demo_file ||= Pathname.new(__FILE__).dirname.join('demos', tag)
      end

      # Specify or get a small demo string for this lexer.
      #
      # When read without ever having been specified, the contents of
      # {demo_file} are read and remembered.
      def demo(arg=:absent)
        return @demo = arg unless arg == :absent

        # `||=` so that reading does not clobber an explicitly set demo
        @demo ||= File.read(demo_file)
      end

      # @return a list of all lexers.
      def all
        registry.values.uniq
      end

      # Guess which lexer to use based on a hash of info.
      #
      # This accepts the same arguments as Lexer.guess, but never raises
      # AmbiguousGuess.  It will return a (possibly empty) list of
      # potential lexers to use.
      def guesses(info={})
        mimetype, filename, source = info.values_at(:mimetype, :filename, :source)
        lexers = registry.values.uniq
        total_size = lexers.size

        lexers = filter_by_mimetype(lexers, mimetype) if mimetype
        return lexers if lexers.size == 1

        lexers = filter_by_filename(lexers, filename) if filename
        return lexers if lexers.size == 1

        if source
          # If we're filtering against *all* lexers, we only use confident return
          # values from analyze_text.  But if we've filtered down already, we can trust
          # the analysis more.
          source_threshold = lexers.size < total_size ? 0 : 0.5
          return [best_by_source(lexers, source, source_threshold)].compact
        end

        []
      end

      # Raised by Lexer.guess when more than one lexer remains after
      # all the available hints have been applied.
      class AmbiguousGuess < StandardError
        # the candidate lexer classes that could not be told apart
        attr_reader :alternatives

        def initialize(alternatives); @alternatives = alternatives; end

        def message
          "Ambiguous guess: can't decide between #{alternatives.map(&:tag).inspect}"
        end
      end

      # Guess which lexer to use based on a hash of info.
      #
      # @option info :mimetype
      #   A mimetype to guess by
      # @option info :filename
      #   A filename to guess by
      # @option info :source
      #   The source itself, which, if guessing by mimetype or filename
      #   fails, will be searched for shebangs, <!DOCTYPE ...> tags, and
      #   other hints.
      #
      # @return the single matching lexer class, or Lexers::PlainText
      #   when nothing matches.
      # @raise [AmbiguousGuess] when several lexers match equally well.
      #
      # @see Lexer.analyze_text
      # @see Lexer.guesses
      def guess(info={})
        lexers = guesses(info)

        return Lexers::PlainText if lexers.empty?
        return lexers[0] if lexers.size == 1

        raise AmbiguousGuess.new(lexers)
      end

      # Shortcut for `guess :mimetype => mt`.
      def guess_by_mimetype(mt)
        guess :mimetype => mt
      end

      # Shortcut for `guess :filename => fname`.
      def guess_by_filename(fname)
        guess :filename => fname
      end

      # Shortcut for `guess :source => source`.
      def guess_by_source(source)
        guess :source => source
      end

    private
      # Narrow `lexers` to those declaring the mimetype `mt`.  If none
      # match, the list is returned unfiltered so that later hints can
      # still be applied.
      def filter_by_mimetype(lexers, mt)
        filtered = lexers.select { |lexer| lexer.mimetypes.include? mt }
        filtered.any? ? filtered : lexers
      end

      # returns a list of lexers that match the given filename with
      # equal specificity (i.e. number of wildcards in the pattern).
      # This helps disambiguate between, e.g. the Nginx lexer, which
      # matches `nginx.conf`, and the Conf lexer, which matches `*.conf`.
      # In this case, nginx will win because the pattern has no wildcards,
      # while `*.conf` has one.
      #
      # If no lexer matches at all, the list is returned unfiltered.
      def filter_by_filename(lexers, fname)
        fname = File.basename(fname)

        out = []
        best_seen = nil
        lexers.each do |lexer|
          score = lexer.filenames.map do |pattern|
            if File.fnmatch?(pattern, fname, File::FNM_DOTMATCH)
              # specificity is better the fewer wildcards there are
              pattern.scan(/[*?\[]/).size
            end
          end.compact.min

          # nil score: none of this lexer's patterns matched
          next unless score

          if best_seen.nil? || score < best_seen
            best_seen = score
            out = [lexer]
          elsif score == best_seen
            out << lexer
          end
        end

        out.any? ? out : lexers
      end

      # Pick the lexer whose analyze_text score for `source` is highest
      # and strictly greater than `threshold`.  A score of exactly 1
      # wins immediately.
      #
      # @param source a String, or any object responding to `read`
      # @return the best lexer class, or nil if none beat the threshold
      # @raise [RuntimeError] if `source` is neither a String nor readable
      # @raise [EncodingError] if the text is not US-ASCII or UTF-8
      def best_by_source(lexers, source, threshold=0)
        source = case source
        when String
          source
        when ->(s){ s.respond_to? :read }
          source.read
        else
          raise 'invalid source'
        end

        assert_utf8!(source)

        source = TextAnalyzer.new(source)

        best_result = threshold
        best_match = nil
        lexers.each do |lexer|
          # lexers may return nil/false from analyze_text; treat as 0
          result = lexer.analyze_text(source) || 0
          return lexer if result == 1

          if result > best_result
            best_match = lexer
            best_result = result
          end
        end

        best_match
      end

    protected
      # @private
      # Record `lexer` in the registry under `name`.
      def register(name, lexer)
        registry[name.to_s] = lexer
      end

    public
      # Used to specify or get the canonical name of this lexer class.
      #
      # @example
      #   class MyLexer < Lexer
      #     tag 'foo'
      #   end
      #
      #   MyLexer.tag # => 'foo'
      #
      #   Lexer.find('foo') # => MyLexer
      def tag(t=nil)
        return @tag if t.nil?

        @tag = t.to_s
        Lexer.register(@tag, self)
      end

      # Used to specify alternate names this lexer class may be found by.
      #
      # @example
      #   class Erb < Lexer
      #     tag 'erb'
      #     aliases 'eruby', 'rhtml'
      #   end
      #
      #   Lexer.find('eruby') # => Erb
      def aliases(*args)
        args.map!(&:to_s)
        args.each { |arg| Lexer.register(arg, self) }
        (@aliases ||= []).concat(args)
      end

      # Specify a list of filename globs associated with this lexer.
      # Called with no arguments, returns the globs declared so far.
      #
      # @example
      #   class Ruby < Lexer
      #     filenames '*.rb', '*.ruby', 'Gemfile', 'Rakefile'
      #   end
      def filenames(*fnames)
        (@filenames ||= []).concat(fnames)
      end

      # Specify a list of mimetypes associated with this lexer.
      # Called with no arguments, returns the mimetypes declared so far.
      #
      # @example
      #   class Html < Lexer
      #     mimetypes 'text/html', 'application/xhtml+xml'
      #   end
      def mimetypes(*mts)
        (@mimetypes ||= []).concat(mts)
      end

      # @private
      # Raise unless `str` is tagged as US-ASCII or UTF-8.
      #
      # @raise [EncodingError]
      def assert_utf8!(str)
        return if %w(US-ASCII UTF-8).include? str.encoding.name
        raise EncodingError.new(
          "Bad encoding: #{str.encoding.names.join(',')}. " +
          "Please convert your string to UTF-8."
        )
      end

    private
      # The name => lexer class table backing {find}, {all} and {guesses}.
      def registry
        @registry ||= {}
      end
    end

    # -*- instance methods -*- #

    # Create a new lexer with the given options.  Individual lexers may
    # specify extra options.  The only current globally accepted option
    # is `:debug`.
    #
    # @option opts :debug
    #   Prints debug information to stdout.  The particular info depends
    #   on the lexer in question.  In regex lexers, this will log the
    #   state stack at the beginning of each step, along with each regex
    #   tried and each stream consumed.  Try it, it's pretty useful.
    def initialize(opts={})
      options(opts)

      @debug = option(:debug)
    end

    # get and/or specify the options for this lexer.
    #
    # @return a new hash of the class defaults overlaid with this
    #   instance's options (instance options win).
    def options(o={})
      (@options ||= {}).merge!(o)

      self.class.default_options.merge(@options)
    end

    # get or specify one option for this lexer
    def option(k, v=:absent)
      if v == :absent
        options[k]
      else
        options({ k => v })
      end
    end

    # @deprecated
    # Instead of `debug { "foo" }`, simply `puts "foo" if @debug`.
    #
    # Leave a debug message if the `:debug` option is set.  The message
    # is given as a block because some debug messages contain calculated
    # information that is unnecessary for lexing in the real world.
    #
    # Calls to this method should be guarded with "if @debug" for best
    # performance when debugging is turned off.
    #
    # @example
    #   debug { "hello, world!" } if @debug
    def debug
      warn "Lexer#debug is deprecated.  Simply puts if @debug instead."
      puts yield if @debug
    end

    # @abstract
    #
    # Called at the start of each lex, unless the lex was asked to
    # `:continue`.  The default implementation is a noop.
    def reset!
    end

    # Given a string, yield [token, chunk] pairs.  If no block is given,
    # an enumerator is returned.
    #
    # Consecutive chunks with the same token are merged into a single
    # pair, and empty chunks are dropped.
    #
    # @option opts :continue
    #   Continue the lex from the previous state (i.e. don't call #reset!)
    #
    # @raise [EncodingError] if `string` is not US-ASCII or UTF-8
    def lex(string, opts={}, &b)
      # forward opts so that :continue is honoured by the enumerator too
      return enum_for(:lex, string, opts) unless block_given?

      Lexer.assert_utf8!(string)

      reset! unless opts[:continue]

      # consolidate consecutive tokens of the same type
      last_token = nil
      last_val = nil
      stream_tokens(string) do |tok, val|
        next if val.empty?

        if tok == last_token
          # NOTE: appends in place to the previously yielded chunk
          last_val << val
          next
        end

        b.call(last_token, last_val) if last_token
        last_token = tok
        last_val = val
      end

      # flush the final pending pair
      b.call(last_token, last_val) if last_token
    end

    # delegated to {Lexer.tag}
    def tag
      self.class.tag
    end

    # @abstract
    #
    # Yield `[token, chunk]` pairs, given a prepared input stream.  This
    # must be implemented.
    #
    # @param [StringScanner] stream
    #   the stream
    def stream_tokens(stream, &b)
      raise 'abstract'
    end

    # @abstract
    #
    # Return a number between 0 and 1 indicating the likelihood that
    # the text given should be lexed with this lexer.  The default
    # implementation returns 0.  Values under 0.5 will only be used
    # to disambiguate filename or mimetype matches.
    #
    # @param [TextAnalyzer] text
    #   the text to be analyzed, with a couple of handy methods on it,
    #   like {TextAnalyzer#shebang?} and {TextAnalyzer#doctype?}
    def self.analyze_text(text)
      0
    end
  end

  module Lexers
    # Load the lexer file at `relpath` (relative to the `lexers`
    # directory next to this file) unless `const_name` is already
    # defined in this module.
    def self.load_const(const_name, relpath)
      return if const_defined?(const_name)

      root = Pathname.new(__FILE__).dirname.join('lexers')
      load root.join(relpath)
    end
  end
end