require 'strscan' module JsDuck # Tokenizes JavaScript code into lexical tokens. # # Each token has a type and value. # Types and possible values for them are as follows: # # - :number -- 25 # - :string -- "Hello world" # - :ident -- "foo" # - :regex -- "/abc/i" # - :operator -- "+" # - :doc_comment -- "/** My comment */" # # Plus a separate types for all keywords: :if, :while, :function, ... # For keywords the type and value are the same. # # Notice that doc-comments are recognized as tokens while normal # comments are ignored just as whitespace. # class Lexer # Input can be either a String or StringScanner. # # In the latter case we ensure that only #next will advance the # scanpointer of StringScanner - this allows context-switching # while parsing some string. Specifically we need this feature to # parse some JavaScript inside doc-comments. def initialize(input) @input = input.is_a?(StringScanner) ? input : StringScanner.new(input) @buffer = [] end # Tests if given pattern matches the tokens that follow at current # position. # # Takes list of strings and symbols. Symbols are compared to # token type, while strings to token value. For example: # # look(:ident, "=", :regex) # def look(*tokens) buffer_tokens(tokens.length) i = 0 tokens.all? do |t| tok = @buffer[i] i += 1 if !tok false elsif t.instance_of?(Symbol) tok[:type] == t else tok[:value] == t end end end # Returns the value of next token, moving the current token cursor # also to next token. # # When full=true, returns full token as hash like so: # # {:type => :ident, :value => "foo"} # # For doc-comments the full token also contains the field :linenr, # pointing to the line where the doc-comment began. # def next(full=false) buffer_tokens(1) tok = @buffer.shift # advance the scanpointer to the position after this token @input.pos = tok[:pos] full ? tok : tok[:value] end # True when no more tokens. def empty? buffer_tokens(1) return !@buffer.first end # Ensures next n tokens are read in buffer # # At the end of buffering the initial position scanpointer is # restored. Only the #next method will advance the scanpointer in # a way that's visible outside this class. def buffer_tokens(n) prev_pos = @input.pos @input.pos = @buffer.last[:pos] if @buffer.last (n - @buffer.length).times do @previous_token = tok = next_token if tok # remember scanpointer position after each token tok[:pos] = @input.pos @buffer << tok end end @input.pos = prev_pos end # Parses out next token from input stream. # # For efficency we look for tokens in order of frequency in # JavaScript source code: # # - first check for most common operators. # - then for identifiers and keywords. # - then strings # - then comments # # The remaining token types are less frequent, so these are left # to the end. # def next_token while !@input.eos? do skip_white if @input.check(/[.(),;={}:]/) return { :type => :operator, :value => @input.scan(/./) } elsif @input.check(/[a-zA-Z_$]/) value = @input.scan(/[$\w]+/) kw = KEYWORDS[value] return { :type => kw || :ident, :value => kw || value } elsif @input.check(/'/) return { :type => :string, :value => @input.scan(/'([^'\\]|\\.)*('|\Z)/m).gsub(/\A'|'\Z/m, "") } elsif @input.check(/"/) return { :type => :string, :value => @input.scan(/"([^"\\]|\\.)*("|\Z)/m).gsub(/\A"|"\Z/m, "") } elsif @input.check(/\//) # Several things begin with dash: # - comments, regexes, division-operators if @input.check(/\/\*\*[^\/]/) return { :type => :doc_comment, # Calculate current line number, starting with 1 :linenr => @input.string[0...@input.pos].count("\n") + 1, :value => @input.scan_until(/\*\/|\Z/).sub(/\A\/\*\*/, "").sub(/\*\/\Z/, "") } elsif @input.check(/\/\*/) # skip multiline comment @input.scan_until(/\*\/|\Z/) elsif @input.check(/\/\//) # skip line comment @input.scan_until(/\n|\Z/) elsif regex? return { :type => :regex, :value => @input.scan(META_REGEX) } else return { :type => :operator, :value => @input.scan(/\//) } end elsif @input.check(/[0-9]+/) nr = @input.scan(/[0-9]+(\.[0-9]*)?/) return { :type => :number, :value => nr } elsif @input.check(/./) return { :type => :operator, :value => @input.scan(/./) } end end end # A slash "/" is a division operator if it follows: # - identifier # - the "this" keyword # - number # - closing bracket ) # - closing square-bracket ] # Otherwise it's a beginning of regex def regex? if @previous_token type = @previous_token[:type] value = @previous_token[:value] if type == :ident || type == :number return false elsif type == :this return false elsif type == :operator && (value == ")" || value == "]") return false end end return true end def skip_white @input.scan(/\s+/) end # A regex to match a regex META_REGEX = %r{ / (?# beginning ) ( [^/\[\\] (?# any character except \ / [ ) | \\. (?# an escaping \ followed by any character ) | \[ ([^\]\\]|\\.)* \] (?# [...] containing any characters including / ) (?# except \ ] which have to be escaped ) )* (/[gim]*|\Z) (?# ending + modifiers ) }x KEYWORDS = { "break" => :break, "case" => :case, "catch" => :catch, "continue" => :continue, "default" => :default, "delete" => :delete, "do" => :do, "else" => :else, "finally" => :finally, "for" => :for, "function" => :function, "if" => :if, "in" => :in, "instanceof" => :instanceof, "new" => :new, "return" => :return, "switch" => :switch, "this" => :this, "throw" => :throw, "try" => :try, "typeof" => :typeof, "var" => :var, "void" => :void, "while" => :while, "with" => :with, } end end