lib/turmali/grammar.y in turmali-0.0.3 vs lib/turmali/grammar.y in turmali-0.0.4

- old
+ new

@@ -1,18 +1,24 @@ class Parser +# We need to tell the parser what tokens to expect. So each type of token produced +# by our lexer needs to be declared here. token IF token DEF token CLASS token NEWLINE token NUMBER token STRING token TRUE FALSE NIL token IDENTIFIER token CONSTANT token INDENT DEDENT +token WHILE +# Here is the Operator Precedence Table. As presented before, it tells the parser in +# which order to parse expressions containing operators. +# This table is based on the [C and C++ Operator Precedence Table](http://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B#Operator_precedence). prechigh left '.' right '!' left '*' '/' left '+' '-' @@ -22,24 +28,53 @@ left '||' right '=' left ',' preclow - +# In the following `rule` section, we define the parsing rules. +# All rules are declared using the following format: +# +# RuleName: +# OtherRule TOKEN AnotherRule { result = Node.new } +# | OtherRule { ... } +# ; +# +# In the action section (inside the `{...}` on the right), you can do the following: +# +# * Assign to `result` the value returned by the rule, usually a node for the AST. +# * Use `val[index of expression]` to get the `result` of a matched +# expressions on the left. rule + # First, parsers are dumb, we need to explicitly tell it how to handle empty + # programs. This is what the first rule does. Note that everything between `/* ... */` is + # a comment. Program: /* nothing */ { result = Nodes.new([]) } | Expressions { result = val[0] } ; + # Next, we define what a list of expressions is. Simply put, it's series of expressions separated by a + # terminator (a new line or `;` as defined later). But once again, we need to explicitly + # define how to handle trailing and orphans line breaks (the last two lines). + # + # One very powerful trick we'll use to define variable rules like this one + # (rules which can match any number of tokens) is *left-recursion*. Which means we reference + # the rule itself, directly or indirectly, on the left side **only**. This is true for the current + # type of parser we're using (LR). For other types of parsers like ANTLR (LL), it's the opposite, + # you can only use right-recursion. + # + # As you'll see bellow, the `Expressions` rule references `Expressions` itself. + # In other words, a list of expressions can be another list of expressions followed by + # another expression. Expressions: Expression { result = Nodes.new(val) } | Expressions Terminator Expression { result = val[0] << val[2] } | Expressions Terminator { result = val[0] } | Terminator { result = Nodes.new([]) } ; + # Every type of expression supported by our language is defined here. Expression: Literal | Call | Operator | GetConstant @@ -47,26 +82,44 @@ | GetLocal | SetLocal | Def | Class | If + | While | '(' Expression ')' { result = val[1] } ; + # Notice how we implement support for parentheses using the previous rule. + # `'(' Expression ')'` will force the parsing of `Expression` in its + # entirety first. Parentheses will then be discarded leaving only the fully parsed expression. + # + # Terminators are tokens that can terminate an expression. + # When using tokens to define rules, we simply reference them by their type which we defined in + # the lexer. Terminator: NEWLINE | ";" ; - + + # Literals are the hard-coded values inside the program. If you want to add support + # for other literal types, such as arrays or hashes, this it where you'd do it. Literal: NUMBER { result = NumberNode.new(val[0]) } | STRING { result = StringNode.new(val[0]) } | TRUE { result = TrueNode.new } | FALSE { result = FalseNode.new } | NIL { result = NilNode.new } ; - + + # Method calls can take three forms: + # + # * Without a receiver (`self` is assumed): `method(arguments)`. + # * With a receiver: `receiver.method(arguments)`. + # * And a hint of syntactic sugar so that we can drop + # the `()` if no arguments are given: `receiver.method`. + # + # Each one of those is handled by the following rule. Call: IDENTIFIER Arguments { result = CallNode.new(nil, val[0], val[1]) } | Expression "." IDENTIFIER Arguments { result = CallNode.new(val[0], val[2], val[3]) } | Expression "." IDENTIFIER { result = CallNode.new(val[0], val[2], []) } @@ -80,10 +133,17 @@ ArgList: Expression { result = val } | ArgList "," Expression { result = val[0] << val[2] } ; + + # In our language, like in Ruby, operators are converted to method calls. + # So `1 + 2` will be converted to `1.+(2)`. + # `1` is the receiver of the `+` method call, passing `2` + # as an argument. + # Operators need to be defined individually for the Operator Precedence Table to take + # action. Operator: Expression '||' Expression { result = CallNode.new(val[0], val[1], [val[2]]) } | Expression '&&' Expression { result = CallNode.new(val[0], val[1], [val[2]]) } | Expression '==' Expression { result = CallNode.new(val[0], val[1], [val[2]]) } | Expression '!=' Expression { result = CallNode.new(val[0], val[1], [val[2]]) } @@ -95,10 +155,11 @@ | Expression '-' Expression { result = CallNode.new(val[0], val[1], [val[2]]) } | Expression '*' Expression { result = CallNode.new(val[0], val[1], [val[2]]) } | Expression '/' Expression { result = CallNode.new(val[0], val[1], [val[2]]) } ; + # Then we have rules for getting and setting values of constants and local variables. GetConstant: CONSTANT { result = GetConstantNode.new(val[0]) } ; SetConstant: @@ -111,14 +172,24 @@ SetLocal: IDENTIFIER "=" Expression { result = SetLocalNode.new(val[0], val[2]) } ; + # Our language uses indentation to separate blocks of code. But the lexer took care of all + # that complexity for us and wrapped all blocks in `INDENT ... DEDENT`. A block + # is simply an increment in indentation followed by some code and closing with an equivalent + # decrement in indentation. + # + # If you'd like to use curly brackets or `end` to delimit blocks instead, you'd + # simply need to modify this one rule. + # You'll also need to remove the indentation logic from the lexer. Block: INDENT Expressions DEDENT { result = val[1] } ; - + + # The `def` keyword is used for defining methods. Once again, we're introducing + # a bit of syntactic sugar here to allow skipping the parentheses when there are no parameters. Def: DEF IDENTIFIER Block { result = DefNode.new(val[1], [], val[2]) } | DEF IDENTIFIER "(" ParamList ")" Block { result = DefNode.new(val[1], val[3], val[5]) } ; @@ -126,19 +197,28 @@ ParamList: /* nothing */ { result = [] } | IDENTIFIER { result = val } | ParamList "," IDENTIFIER { result = val[0] << val[2] } ; - + + # Class definition is similar to method definition. + # Class names are also constants because they start with a capital letter. Class: CLASS CONSTANT Block { result = ClassNode.new(val[1], val[2]) } ; - + + # Finally, `if` is similar to `class` but receives a *condition*. If: IF Expression Block { result = IfNode.new(val[1], val[2]) } ; + + While: + WHILE Expression Block { result = WhileNode.new(val[1], val[2]) } + ; end +# The final code at the bottom of this Racc file will be put as-is in the generated `Parser` class. +# You can put some code at the top (`header`) and some inside the class (`inner`). ---- header require "turmali/lexer" require "turmali/nodes" ---- inner \ No newline at end of file