ast_builder.rb in rley-0.5.11

- old
+ new

@@ -1,5 +1,6 @@
+require 'stringio'
 require_relative 'ast_building'
 require_relative 'regex_repr'
 
 # The purpose of a ASTBuilder is to build piece by piece an AST
 # (Abstract Syntax Tree) from a sequence of input tokens and
@@ -33,57 +34,132 @@
   # @param aRange [Range] Range of tokens matched by the rule
   # @param theTokens [Array] The input tokens
   # @param theChildren [Array] Children nodes (one per rhs symbol)
   def new_parent_node(aProduction, aRange, theTokens, theChildren)
     node = case aProduction.name
-      when 'srl_0' # rule 'srl' => 'term'
+      when 'srl_0' # rule 'srl' => 'pattern'
         return_first_child(aRange, theTokens, theChildren)
+        
+      when 'pattern_0' # rule 'pattern' => %w[pattern COMMA quantifiable]
+        reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
 
+      when 'pattern_1' # rule 'pattern' => %w[pattern quantifiable]
+        reduce_pattern_1(aProduction, aRange, theTokens, theChildren)        
+        
+      when 'pattern_2' # rule 'pattern' => 'quantifiable' 
+        return_first_child(aRange, theTokens, theChildren)
+
+      when 'quantifiable_0' # rule 'quantifiable' => 'term'
+        return_first_child(aRange, theTokens, theChildren)
+
+      when 'quantifiable_1' # rule 'quantifiable' => %w[term quantifier]
+        reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
+
       when 'term_0' # rule 'term' => 'atom'
         return_first_child(aRange, theTokens, theChildren)
 
-      when 'term_1' # rule 'term' => %w[atom quantifier]
-        reduce_term_1(aProduction, aRange, theTokens, theChildren)
+      when 'term_1' # rule 'term' => 'alternation'
+        return_first_child(aRange, theTokens, theChildren)
+        
+      when 'term_2' # rule 'term' => 'grouping'
+        return_first_child(aRange, theTokens, theChildren)
 
       when 'atom_0' # rule 'atom' => 'letter_range'
         return_first_child(aRange, theTokens, theChildren)
-        
+
       when 'atom_1' # rule 'atom' => 'digit_range'
         return_first_child(aRange, theTokens, theChildren)
-      
+
+      when 'atom_2' # rule 'atom' => 'character_class'
+        return_first_child(aRange, theTokens, theChildren)
+
+      when 'atom_3' # rule 'atom' => 'special_char'
+        return_first_child(aRange, theTokens, theChildren)
+
+      when 'atom_4' # rule 'atom' => 'literal'
+        return_first_child(aRange, theTokens, theChildren)
+
       # rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
-      when 'letter_range_0' 
+      when 'letter_range_0'
         reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
 
-      #rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]  
-      when 'letter_range_1' 
+      #rule 'letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]
+      when 'letter_range_1'
         reduce_letter_range_1(aProduction, aRange, theTokens, theChildren)
 
       when 'letter_range_2' # rule 'letter_range' => 'LETTER'
         reduce_letter_range_2(aProduction, aRange, theTokens, theChildren)
 
       when 'letter_range_3' # rule 'letter_range' => %w[UPPERCASE LETTER]
         reduce_letter_range_3(aProduction, aRange, theTokens, theChildren)
 
       # rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
-      when 'digit_range_0' 
+      when 'digit_range_0'
         reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
 
-      when 'digit_range_1' #rule 'digit_range' => 'digit_or_number'
+      when 'digit_range_1' # rule 'digit_range' => 'digit_or_number'
         reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
 
+      when 'character_class_0' # rule 'character_class' => %w[ANY CHARACTER]
+        reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
+
+      when 'character_class_1' # rule 'character_class' => %w[NO CHARACTER]
+        reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
+
+      when 'character_class_2' # rule 'character_class' => 'WHITESPACE'
+        reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
+
+      when 'character_class_3' # rule 'character_class' => %w[NO WHITESPACE]
+        reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
+
+      when 'character_class_4' # rule 'character_class' => 'ANYTHING'
+        reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
+
+       when 'character_class_5' # rule 'character_class' => %w[ONE OF STRING_LIT]
+        reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
+
+      when 'special_char_0' # rule 'special_char' => 'TAB'
+        reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
+
+      when 'special_char_1' # rule 'special_char' => 'BACKSLASH'
+        reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
+
+      when 'special_char_2' # rule 'special_char' => %w[NEW LINE]
+        reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
+
+      when 'literal_0' # rule 'literal' => %w[LITERALLY STRING_LIT]
+        reduce_literal_0(aProduction, aRange, theTokens, theChildren)
+
+      # rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
+      when 'alternation_0'
+        reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
+
+      # rule 'alternatives' => %w[alternatives COMMA quantifiable]
+      when 'alternatives_0'
+        reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
+
+      # rule 'alternatives' => %w[alternatives quantifiable]
+      when 'alternatives_1'
+        reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
+
+      when 'alternatives_2' # rule 'alternatives' => 'quantifiable'
+        reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
+        
+      when 'grouping' # rule 'grouping' => %w[LPAREN pattern RPAREN]
+        reduce_grouping_0(aProduction, aRange, theTokens, theChildren)      
+
       when 'quantifier_0' # rule 'quantifier' => 'ONCE'
         multiplicity(1, 1)
 
       when 'quantifier_1' # rule 'quantifier' => 'TWICE'
         multiplicity(2, 2)
 
       when 'quantifier_2' # rule 'quantifier' => %w[EXACTLY count TIMES]
         reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
 
       # rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
-      when 'quantifier_3' 
+      when 'quantifier_3'
         reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
 
       when 'quantifier_4' # rule 'quantifier' => 'OPTIONAL'
         multiplicity(0, 1)
 
@@ -93,14 +169,14 @@
       when 'quantifier_6' # rule 'quantifier' => %w[NEVER OR MORE]
         multiplicity(0, :more)
 
       when 'quantifier_7' # rule 'quantifier' => %w[AT LEAST count TIMES]
         reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
-      
+
       # rule 'digit_or_number' => 'DIGIT'
       # rule 'digit_or_number' => 'NUMER'
-      when 'digit_or_number_0', 'digit_or_number_1' 
+      when 'digit_or_number_0', 'digit_or_number_1'
         return_first_child(aRange, theTokens, theChildren)
 
       when 'count_0', 'count_1'
         return_first_child(aRange, theTokens, theChildren)
 
@@ -115,10 +191,32 @@
 
   def multiplicity(lowerBound, upperBound)
     return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
   end
 
+  def string_literal(aString, to_escape = true)
+    if aString.size > 1
+      chars = []
+      aString.each_char do |ch|
+        if to_escape && Regex::Character::MetaChars.include?(ch)
+          chars << Regex::Character.new("\\")
+        end
+        chars << Regex::Character.new(ch)
+      end
+      result = Regex::Concatenation.new(*chars)
+    else
+        if to_escape && Regex::Character::MetaChars.include?(aString)
+          result = Regex::Concatenation.new(Regex::Character.new("\\"), 
+            Regex::Character.new(aString))
+        else
+          result = Regex::Character.new(aString)
+        end    
+    end
+
+    return result
+  end
+
   def char_range(lowerBound, upperBound)
     # TODO fix module nesting
     lower = Regex::Character.new(lowerBound)
     upper =  Regex::Character.new(upperBound)
     return Regex::CharRange.new(lower, upper)
@@ -126,19 +224,37 @@
 
   def char_class(toNegate, *theChildren)
     Regex::CharClass.new(toNegate, *theChildren)
   end
 
+  def char_shorthand(shortName)
+    Regex::CharShorthand.new(shortName)
+  end
+
+  def wildcard()
+    Regex::Wildcard.new
+  end
+
   def repetition(expressionToRepeat, aMultiplicity)
     return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
   end
+  
+  # rule 'pattern' => %w[pattern COMMA quantifiable]
+  def reduce_pattern_0(aProduction, aRange, theTokens, theChildren)
+    return Regex::Concatenation.new(theChildren[0], theChildren[2])
+  end
 
-  # rule 'term' => %w[atom quantifier]
-  def reduce_term_1(aProduction, aRange, theTokens, theChildren)
+  # rule 'pattern' => %w[pattern quantifiable]
+  def reduce_pattern_1(aProduction, aRange, theTokens, theChildren)
+    return Regex::Concatenation.new(theChildren[0], theChildren[1])
+  end
+
+  # rule 'quantifiable' => %w[term quantifier]
+  def reduce_quantifiable_1(aProduction, aRange, theTokens, theChildren)
     quantifier = theChildren.last
-    atom = theChildren.first
-    repetition(atom, quantifier)
+    term = theChildren.first
+    repetition(term, quantifier)
   end
 
   # rule 'letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]
   def reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
     lower = theChildren[2].token.lexeme
@@ -164,26 +280,106 @@
   #rule 'letter_range' => %w[UPPERCASE LETTER]
   def reduce_letter_range_3(aProduction, aRange, theTokens, theChildren)
     ch_range = char_range('A', 'Z')
     char_class(false, ch_range)
   end
-  
+
   # rule 'digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]
   def reduce_digit_range_0(aProduction, aRange, theTokens, theChildren)
     reduce_letter_range_0(aProduction, aRange, theTokens, theChildren)
   end
 
   # rule 'digit_range' => 'digit_or_number'
   def reduce_digit_range_1(aProduction, aRange, theTokens, theChildren)
-    ch_range = char_range('0', '9')
-    char_class(false, ch_range)  
+    char_shorthand('d')
   end
 
+  # rule 'character_class' => %w[ANY CHARACTER]
+  def reduce_character_class_0(aProduction, aRange, theTokens, theChildren)
+    char_shorthand('w')
+  end
+
+  # rule 'character_class' => %w[NO CHARACTER]
+  def reduce_character_class_1(aProduction, aRange, theTokens, theChildren)
+    char_shorthand('W')
+  end
+
+  # rule 'character_class' => 'WHITESPACE'
+  def reduce_character_class_2(aProduction, aRange, theTokens, theChildren)
+    char_shorthand('s')
+  end
+
+  # rule 'character_class' => %w[NO WHITESPACE]
+  def reduce_character_class_3(aProduction, aRange, theTokens, theChildren)
+    char_shorthand('S')
+  end
+
+  # rule 'character_class' => 'ANYTHING'
+  def reduce_character_class_4(aProduction, aRange, theTokens, theChildren)
+    wildcard
+  end
+
+  # rule 'character_class' => %w[ONE OF STRING_LIT]
+  def reduce_character_class_5(aProduction, aRange, theTokens, theChildren)
+    raw_literal = theChildren[-1].token.lexeme.dup
+    alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
+    return Regex::CharClass.new(false, *alternatives) # TODO check other implementations
+  end
+
+  # rule 'special_char' => 'TAB'
+  def reduce_special_char_0(aProduction, aRange, theTokens, theChildren)
+    Regex::Character.new('\t')
+  end
+
+  # rule 'special_char' => 'BACKSLASH'
+  def reduce_special_char_1(aProduction, aRange, theTokens, theChildren)
+    Regex::Character.new('\\')
+  end
+
+  # rule 'special_char' => %w[NEW LINE]
+  def reduce_special_char_2(aProduction, aRange, theTokens, theChildren)
+    # TODO: control portability
+    Regex::Character.new('\n')
+  end
+
+  # rule 'literal' => %[LITERALLY STRING_LIT]
+  def reduce_literal_0(aProduction, aRange, theTokens, theChildren)
+    # What if literal is empty?...
+
+    raw_literal = theChildren[-1].token.lexeme.dup
+    return string_literal(raw_literal)
+  end
+  
+  # rule 'alternation' => %w[ANY OF LPAREN alternatives RPAREN]
+  def reduce_alternation_0(aProduction, aRange, theTokens, theChildren)
+    return Regex::Alternation.new(*theChildren[3])
+  end
+
+  # rule 'alternatives' => %w[alternatives COMMA quantifiable]
+  def reduce_alternatives_0(aProduction, aRange, theTokens, theChildren)
+    return theChildren[0] << theChildren[-1]
+  end
+
+  # rule 'alternatives' => %w[alternatives quantifiable]
+  def reduce_alternatives_1(aProduction, aRange, theTokens, theChildren)
+    return theChildren[0] << theChildren[-1]
+  end
+  
+  # rule 'alternatives' => 'quantifiable'
+  def reduce_alternatives_2(aProduction, aRange, theTokens, theChildren)
+    return [theChildren.last]
+  end
+  
+  # rule 'grouping' => %w[LPAREN pattern RPAREN]
+  def reduce_grouping_0(aProduction, aRange, theTokens, theChildren)
+    return Regex::NonCapturingGroup.new(theChildren[1])  
+  end
+  
   # rule 'quantifier' => %w[EXACTLY count TIMES]
   def reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
     count = theChildren[1].token.lexeme.to_i
     multiplicity(count, count)
-  end
+  end  
 
   # rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
   def reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
     lower = theChildren[1].token.lexeme.to_i
     upper = theChildren[3].token.lexeme.to_i