lexer.rb in turmali-0.0.2

- old
+ new
@@ -1,129 +1,83 @@
-# Our lexer will be used like so: `Lexer.new.tokenize("code")`,
-# and will return an array of tokens (a token being a tuple of `[TOKEN_TYPE, TOKEN_VALUE]`).
 # Turns source text into a flat array of tokens, each token being a
 # two-element array `[type, value]`.
 #
 # Usage: `Lexer.new.tokenize("code")`.
 class Lexer
   # Reserved words. An identifier found in this list is emitted as its own
   # token type (e.g. `[:IF, "if"]`) instead of `[:IDENTIFIER, "if"]`.
   KEYWORDS = ["def", "class", "if", "true", "false", "nil"].freeze

   # Scans +code+ from left to right and returns the tokens found.
   #
   # Token shapes produced:
   # * keywords             -> `[:DEF, "def"]`, `[:IF, "if"]`, ...
   # * lower-case names     -> `[:IDENTIFIER, name]`
   # * capitalized names    -> `[:CONSTANT, name]`
   # * integers             -> `[:NUMBER, Integer]`
   # * `"..."`              -> `[:STRING, contents_without_quotes]`
   # * `:` newline + spaces -> `[:INDENT, width]`
   # * newline, less indent -> one `[:DEDENT, width]` per closed block, then `[:NEWLINE, "\n"]`
   # * newline, same indent -> `[:NEWLINE, "\n"]`
   # * anything else        -> `[text, text]` (operators and punctuation)
   #
   # Single spaces are skipped and produce no token.
   #
   # @param code [String] source text; it is not modified, so frozen strings are accepted
   # @return [Array<Array>] the token list
   # @raise [RuntimeError] when a block opened with ":" is not indented deeper
   #   than the current block, or when indentation grows without a ":"
   def tokenize(code)
     # Non-mutating chomp: drop the trailing line break without touching the
     # caller's string (the bang variant would raise on a frozen string).
     code = code.chomp
     tokens = []

     current_indent = 0 # width (in spaces) of the block we are currently in
     indent_stack = []  # widths of every block that is still open

     i = 0 # current scanning position in code
     while i < code.size
       chunk = code[i..-1]

       # Order matters: names are tried first so that keywords are recognized
       # before anything else gets a chance to match.
       if identifier = chunk[/\A([a-z]\w*)/, 1]
         if KEYWORDS.include?(identifier)
           tokens << [identifier.upcase.to_sym, identifier]
         else
           tokens << [:IDENTIFIER, identifier]
         end
         i += identifier.size

       elsif constant = chunk[/\A([A-Z]\w*)/, 1]
         tokens << [:CONSTANT, constant]
         i += constant.size

       elsif number = chunk[/\A([0-9]+)/, 1]
         tokens << [:NUMBER, number.to_i]
         i += number.size

       elsif string = chunk[/\A"([^"]*)"/, 1]
         tokens << [:STRING, string]
         i += string.size + 2 # also skip the two surrounding quotes

       # ":" + newline + spaces opens a new block.
       elsif indent = chunk[/\A\:\n( +)/m, 1]
         if indent.size <= current_indent
           raise "Bad indent level, got #{indent.size} indents, expected > #{current_indent}"
         end
         current_indent = indent.size
         indent_stack.push(current_indent)
         tokens << [:INDENT, indent.size]
         i += indent.size + 2 # skip ":" and the newline as well

       # Newline + optional spaces either stays in the block or closes blocks.
       elsif indent = chunk[/\A\n( *)/m, 1]
         if indent.size == current_indent
           tokens << [:NEWLINE, "\n"]
         elsif indent.size < current_indent
           # Close blocks until we are back at (or below) the new width.
           # NOTE(review): a width that matches no open block is accepted
           # silently here; only a later deeper line triggers an error.
           while indent.size < current_indent
             indent_stack.pop
             current_indent = indent_stack.last || 0
             tokens << [:DEDENT, indent.size]
           end
           tokens << [:NEWLINE, "\n"]
         else
           raise "Missing ':'"
         end
         i += indent.size + 1 # skip the newline as well

       # Two-character operators must be tried before the single-character
       # catch-all below.
       elsif operator = chunk[/\A(\|\||&&|==|!=|<=|>=)/, 1]
         tokens << [operator, operator]
         i += operator.size

       elsif chunk.start_with?(" ")
         i += 1

       else
         value = chunk[0, 1]
         tokens << [value, value]
         i += 1
       end
     end

     # Close every block still open at end of input. Each DEDENT carries the
     # width of the block it returns to (the new top of the stack, or 0).
     until indent_stack.empty?
       indent_stack.pop
       tokens << [:DEDENT, indent_stack.last || 0]
     end

     tokens
   end
 end
\ No newline at end of file