lib/turmali/lexer.rb in turmali-0.0.1 vs lib/turmali/lexer.rb in turmali-0.0.2

- old
+ new

@@ -1,129 +1,83 @@ -# Our lexer will be used like so: `Lexer.new.tokenize("code")`, -# and will return an array of tokens (a token being a tuple of `[TOKEN_TYPE, TOKEN_VALUE]`). class Lexer - # First we define the special keywords of our language in a constant. - # It will be used later on in the tokenizing process to disambiguate - # an identifier (method name, local variable, etc.) from a keyword. + KEYWORDS = ["def", "class", "if", "true", "false", "nil"] def tokenize(code) - code.chomp! # Remove extra line breaks - tokens = [] # This will hold the generated tokens - - # We need to know how deep we are in the indentation so - # we keep track of the current indentation level we are in, and previous ones in the stack - # so that when we dedent, we can check if we're on the correct level. - current_indent = 0 # number of spaces in the last indent + code.chomp! + tokens = [] + + current_indent = 0 indent_stack = [] - - # Here is how to implement a very simple scanner. - # Advance one character at the time until you find something to parse. - # We'll use regular expressions to scan from the current position (`i`) - # up to the end of the code. - i = 0 # Current character position + + i = 0 while i < code.size chunk = code[i..-1] - # Each of the following `if/elsif`s will test the current code chunk with - # a regular expression. The order is important as we want to match `if` - # as a keyword, and not a method name, we'll need to apply it first. - # - # First, we'll scan for names: method names and variable names, which we'll call identifiers. - # Also scanning for special reserved keywords such as `if`, `def` - # and `true`. if identifier = chunk[/\A([a-z]\w*)/, 1] - if KEYWORDS.include?(identifier) # keywords will generate [:IF, "if"] + if KEYWORDS.include?(identifier) tokens << [identifier.upcase.to_sym, identifier] else tokens << [:IDENTIFIER, identifier] end - i += identifier.size # skip what we just parsed + i += identifier.size - # Now scanning for constants, names starting with a capital letter. - # Which means, class names are constants in our language. elsif constant = chunk[/\A([A-Z]\w*)/, 1] tokens << [:CONSTANT, constant] i += constant.size - - # Next, matching numbers. Our language will only support integers. But to add support for floats, - # you'd simply need to add a similar rule and adapt the regular expression accordingly. + elsif number = chunk[/\A([0-9]+)/, 1] tokens << [:NUMBER, number.to_i] i += number.size - - # Of course, matching strings too. Anything between `"..."`. + elsif string = chunk[/\A"([^"]*)"/, 1] tokens << [:STRING, string] - i += string.size + 2 # skip two more to exclude the `"`. - - # And here's the indentation magic! We have to take care of 3 cases: - # - # if true: # 1) The block is created. - # line 1 - # line 2 # 2) New line inside a block, at the same level. - # continue # 3) Dedent. - # - # This `elsif` takes care of the first case. The number of spaces will determine - # the indent level. - elsif indent = chunk[/\A\:\n( +)/m, 1] # Matches ": <newline> <spaces>" - if indent.size <= current_indent # indent should go up when creating a block + i += string.size + 2 + + elsif indent = chunk[/\A\:\n( +)/m, 1] + if indent.size <= current_indent raise "Bad indent level, got #{indent.size} indents, " + "expected > #{current_indent}" end current_indent = indent.size indent_stack.push(current_indent) tokens << [:INDENT, indent.size] i += indent.size + 2 - - # The next `elsif` takes care of the two last cases: - # - # * Case 2: We stay in the same block if the indent level (number of spaces) is the - # same as `current_indent`. - # * Case 3: Close the current block, if indent level is lower than `current_indent`. - elsif indent = chunk[/\A\n( *)/m, 1] # Matches "<newline> <spaces>" - if indent.size == current_indent # Case 2 - tokens << [:NEWLINE, "\n"] # Nothing to do, we're still in the same block - elsif indent.size < current_indent # Case 3 + + elsif indent = chunk[/\A\n( *)/m, 1] + if indent.size == current_indent + tokens << [:NEWLINE, "\n"] + elsif indent.size < current_indent while indent.size < current_indent indent_stack.pop current_indent = indent_stack.last || 0 tokens << [:DEDENT, indent.size] end tokens << [:NEWLINE, "\n"] - else # indent.size > current_indent, error! - raise "Missing ':'" # Cannot increase indent level without using ":" + else + raise "Missing ':'" end i += indent.size + 1 - # Long operators such as `||`, `&&`, `==`, etc. - # will be matched by the following block. - # One character long operators are matched by the catch all `else` at the bottom. elsif operator = chunk[/\A(\|\||&&|==|!=|<=|>=)/, 1] tokens << [operator, operator] i += operator.size - - # We're ignoring spaces. Contrary to line breaks, spaces are meaningless in our language. - # That's why we don't create tokens for them. They are only used to separate other tokens. + elsif chunk.match(/\A /) i += 1 - - # Finally, catch all single characters, mainly operators. - # We treat all other single characters as a token. Eg.: `( ) , . ! + - <`. + else value = chunk[0,1] tokens << [value, value] i += 1 end end - - # Close all open blocks. If the code ends without dedenting, this will take care of - # balancing the `INDENT`...`DEDENT`s. + while indent = indent_stack.pop tokens << [:DEDENT, indent_stack.first || 0] end tokens end -end +end \ No newline at end of file