lib/turmali/grammar.y in turmali-0.0.3 vs lib/turmali/grammar.y in turmali-0.0.4
- old
+ new
@@ -1,18 +1,24 @@
class Parser
+# We need to tell the parser what tokens to expect. So each type of token produced
+# by our lexer needs to be declared here.
token IF
token DEF
token CLASS
token NEWLINE
token NUMBER
token STRING
token TRUE FALSE NIL
token IDENTIFIER
token CONSTANT
token INDENT DEDENT
+token WHILE
+# Here is the Operator Precedence Table. As presented before, it tells the parser in
+# which order to parse expressions containing operators.
+# This table is based on the [C and C++ Operator Precedence Table](http://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B#Operator_precedence).
prechigh
left '.'
right '!'
left '*' '/'
left '+' '-'
@@ -22,24 +28,53 @@
left '||'
right '='
left ','
preclow
-
+# In the following `rule` section, we define the parsing rules.
+# All rules are declared using the following format:
+#
+# RuleName:
+# OtherRule TOKEN AnotherRule { result = Node.new }
+# | OtherRule { ... }
+# ;
+#
+# In the action section (inside the `{...}` on the right), you can do the following:
+#
+# * Assign to `result` the value returned by the rule, usually a node for the AST.
+# * Use `val[index of expression]` to get the `result` of a matched
+# expressions on the left.
rule
+ # First, parsers are dumb, we need to explicitly tell it how to handle empty
+ # programs. This is what the first rule does. Note that everything between `/* ... */` is
+ # a comment.
Program:
/* nothing */ { result = Nodes.new([]) }
| Expressions { result = val[0] }
;
+ # Next, we define what a list of expressions is. Simply put, it's series of expressions separated by a
+ # terminator (a new line or `;` as defined later). But once again, we need to explicitly
+ # define how to handle trailing and orphans line breaks (the last two lines).
+ #
+ # One very powerful trick we'll use to define variable rules like this one
+ # (rules which can match any number of tokens) is *left-recursion*. Which means we reference
+ # the rule itself, directly or indirectly, on the left side **only**. This is true for the current
+ # type of parser we're using (LR). For other types of parsers like ANTLR (LL), it's the opposite,
+ # you can only use right-recursion.
+ #
+ # As you'll see bellow, the `Expressions` rule references `Expressions` itself.
+ # In other words, a list of expressions can be another list of expressions followed by
+ # another expression.
Expressions:
Expression { result = Nodes.new(val) }
| Expressions Terminator Expression { result = val[0] << val[2] }
| Expressions Terminator { result = val[0] }
| Terminator { result = Nodes.new([]) }
;
+ # Every type of expression supported by our language is defined here.
Expression:
Literal
| Call
| Operator
| GetConstant
@@ -47,26 +82,44 @@
| GetLocal
| SetLocal
| Def
| Class
| If
+ | While
| '(' Expression ')' { result = val[1] }
;
+ # Notice how we implement support for parentheses using the previous rule.
+ # `'(' Expression ')'` will force the parsing of `Expression` in its
+ # entirety first. Parentheses will then be discarded leaving only the fully parsed expression.
+ #
+ # Terminators are tokens that can terminate an expression.
+ # When using tokens to define rules, we simply reference them by their type which we defined in
+ # the lexer.
Terminator:
NEWLINE
| ";"
;
-
+
+ # Literals are the hard-coded values inside the program. If you want to add support
+ # for other literal types, such as arrays or hashes, this it where you'd do it.
Literal:
NUMBER { result = NumberNode.new(val[0]) }
| STRING { result = StringNode.new(val[0]) }
| TRUE { result = TrueNode.new }
| FALSE { result = FalseNode.new }
| NIL { result = NilNode.new }
;
-
+
+ # Method calls can take three forms:
+ #
+ # * Without a receiver (`self` is assumed): `method(arguments)`.
+ # * With a receiver: `receiver.method(arguments)`.
+ # * And a hint of syntactic sugar so that we can drop
+ # the `()` if no arguments are given: `receiver.method`.
+ #
+ # Each one of those is handled by the following rule.
Call:
IDENTIFIER Arguments { result = CallNode.new(nil, val[0], val[1]) }
| Expression "." IDENTIFIER
Arguments { result = CallNode.new(val[0], val[2], val[3]) }
| Expression "." IDENTIFIER { result = CallNode.new(val[0], val[2], []) }
@@ -80,10 +133,17 @@
ArgList:
Expression { result = val }
| ArgList "," Expression { result = val[0] << val[2] }
;
+
+ # In our language, like in Ruby, operators are converted to method calls.
+ # So `1 + 2` will be converted to `1.+(2)`.
+ # `1` is the receiver of the `+` method call, passing `2`
+ # as an argument.
+ # Operators need to be defined individually for the Operator Precedence Table to take
+ # action.
Operator:
Expression '||' Expression { result = CallNode.new(val[0], val[1], [val[2]]) }
| Expression '&&' Expression { result = CallNode.new(val[0], val[1], [val[2]]) }
| Expression '==' Expression { result = CallNode.new(val[0], val[1], [val[2]]) }
| Expression '!=' Expression { result = CallNode.new(val[0], val[1], [val[2]]) }
@@ -95,10 +155,11 @@
| Expression '-' Expression { result = CallNode.new(val[0], val[1], [val[2]]) }
| Expression '*' Expression { result = CallNode.new(val[0], val[1], [val[2]]) }
| Expression '/' Expression { result = CallNode.new(val[0], val[1], [val[2]]) }
;
+ # Then we have rules for getting and setting values of constants and local variables.
GetConstant:
CONSTANT { result = GetConstantNode.new(val[0]) }
;
SetConstant:
@@ -111,14 +172,24 @@
SetLocal:
IDENTIFIER "=" Expression { result = SetLocalNode.new(val[0], val[2]) }
;
+ # Our language uses indentation to separate blocks of code. But the lexer took care of all
+ # that complexity for us and wrapped all blocks in `INDENT ... DEDENT`. A block
+ # is simply an increment in indentation followed by some code and closing with an equivalent
+ # decrement in indentation.
+ #
+ # If you'd like to use curly brackets or `end` to delimit blocks instead, you'd
+ # simply need to modify this one rule.
+ # You'll also need to remove the indentation logic from the lexer.
Block:
INDENT Expressions DEDENT { result = val[1] }
;
-
+
+ # The `def` keyword is used for defining methods. Once again, we're introducing
+ # a bit of syntactic sugar here to allow skipping the parentheses when there are no parameters.
Def:
DEF IDENTIFIER Block { result = DefNode.new(val[1], [], val[2]) }
| DEF IDENTIFIER
"(" ParamList ")" Block { result = DefNode.new(val[1], val[3], val[5]) }
;
@@ -126,19 +197,28 @@
ParamList:
/* nothing */ { result = [] }
| IDENTIFIER { result = val }
| ParamList "," IDENTIFIER { result = val[0] << val[2] }
;
-
+
+ # Class definition is similar to method definition.
+ # Class names are also constants because they start with a capital letter.
Class:
CLASS CONSTANT Block { result = ClassNode.new(val[1], val[2]) }
;
-
+
+ # Finally, `if` is similar to `class` but receives a *condition*.
If:
IF Expression Block { result = IfNode.new(val[1], val[2]) }
;
+
+ While:
+ WHILE Expression Block { result = WhileNode.new(val[1], val[2]) }
+ ;
end
+# The final code at the bottom of this Racc file will be put as-is in the generated `Parser` class.
+# You can put some code at the top (`header`) and some inside the class (`inner`).
---- header
require "turmali/lexer"
require "turmali/nodes"
---- inner
\ No newline at end of file