lib/lrama/grammar.rb in lrama-0.5.8 vs lib/lrama/grammar.rb in lrama-0.5.9
- old
+ new
@@ -1,33 +1,34 @@
require "strscan"
require "lrama/grammar/auxiliary"
require "lrama/grammar/code"
require "lrama/grammar/error_token"
+require "lrama/grammar/percent_code"
require "lrama/grammar/precedence"
require "lrama/grammar/printer"
require "lrama/grammar/reference"
require "lrama/grammar/rule"
require "lrama/grammar/symbol"
require "lrama/grammar/union"
require "lrama/lexer"
require "lrama/type"
module Lrama
- Token = Lrama::Lexer::Token
-
# Grammar is the result of parsing an input grammar file
class Grammar
- attr_reader :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux
+ attr_reader :percent_codes, :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux
attr_accessor :union, :expect,
:printers, :error_tokens,
:lex_param, :parse_param, :initial_action,
:symbols, :types,
:rules, :_rules,
:sym_to_rules
def initialize
+ # Code defined by "%code"
+ @percent_codes = []
@printers = []
@error_tokens = []
@symbols = []
@types = []
@_rules = []
@@ -41,10 +42,14 @@
@aux = Auxiliary.new
append_special_symbols
end
+ def add_percent_code(id:, code:)
+ @percent_codes << PercentCode.new(id, code)
+ end
+
def add_printer(ident_or_tags:, code:, lineno:)
@printers << Printer.new(ident_or_tags: ident_or_tags, code: code, lineno: lineno)
end
def add_error_token(ident_or_tags:, code:, lineno:)
@@ -120,20 +125,11 @@
def add_rule(lhs:, rhs:, lineno:)
@_rules << [lhs, rhs, lineno]
end
- def build_references(token_code)
- token_code.references.map! do |type, value, tag, first_column, last_column|
- Reference.new(type: type, value: value, ex_tag: tag, first_column: first_column, last_column: last_column)
- end
-
- token_code
- end
-
def build_code(type, token_code)
- build_references(token_code)
Code.new(type: type, token_code: token_code)
end
def prologue_first_lineno=(prologue_first_lineno)
@aux.prologue_first_lineno = prologue_first_lineno
@@ -150,10 +146,11 @@
def epilogue=(epilogue)
@aux.epilogue = epilogue
end
def prepare
+ extract_references
normalize_rules
collect_symbols
replace_token_with_symbol
fill_symbol_number
fill_default_precedence
@@ -312,35 +309,37 @@
start = scanner.pos
case
# $ references
# It need to wrap an identifier with brackets to use ".-" for identifiers
when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?\$/) # $$, $<long>$
- tag = scanner[1] ? Lrama::Lexer::Token.new(type: Lrama::Lexer::Token::Tag, s_value: scanner[1]) : nil
- return [:dollar, "$", tag, start, scanner.pos - 1]
+ tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
+ return Reference.new(type: :dollar, value: "$", ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?(\d+)/) # $1, $2, $<long>1
- tag = scanner[1] ? Lrama::Lexer::Token.new(type: Lrama::Lexer::Token::Tag, s_value: scanner[1]) : nil
- return [:dollar, Integer(scanner[2]), tag, start, scanner.pos - 1]
+ tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
+ return Reference.new(type: :dollar, value: Integer(scanner[2]), ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?([a-zA-Z_][a-zA-Z0-9_]*)/) # $foo, $expr, $<long>program (named reference without brackets)
- tag = scanner[1] ? Lrama::Lexer::Token.new(type: Lrama::Lexer::Token::Tag, s_value: scanner[1]) : nil
- return [:dollar, scanner[2], tag, start, scanner.pos - 1]
+ tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
+ return Reference.new(type: :dollar, value: scanner[2], ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # $expr.right, $expr-right, $<long>program (named reference with brackets)
- tag = scanner[1] ? Lrama::Lexer::Token.new(type: Lrama::Lexer::Token::Tag, s_value: scanner[1]) : nil
- return [:dollar, scanner[2], tag, start, scanner.pos - 1]
+ tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
+ return Reference.new(type: :dollar, value: scanner[2], ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
# @ references
# It need to wrap an identifier with brackets to use ".-" for identifiers
when scanner.scan(/@\$/) # @$
- return [:at, "$", nil, start, scanner.pos - 1]
+ return Reference.new(type: :at, value: "$", first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/@(\d+)/) # @1
- return [:at, Integer(scanner[1]), nil, start, scanner.pos - 1]
+ return Reference.new(type: :at, value: Integer(scanner[1]), first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/@([a-zA-Z][a-zA-Z0-9_]*)/) # @foo, @expr (named reference without brackets)
- return [:at, scanner[1], nil, start, scanner.pos - 1]
+ return Reference.new(type: :at, value: scanner[1], first_column: start, last_column: scanner.pos - 1)
when scanner.scan(/@\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # @expr.right, @expr-right (named reference with brackets)
- return [:at, scanner[1], nil, start, scanner.pos - 1]
+ return Reference.new(type: :at, value: scanner[1], first_column: start, last_column: scanner.pos - 1)
end
end
+ private
+
def extract_references
unless initial_action.nil?
scanner = StringScanner.new(initial_action.s_value)
references = []
@@ -351,11 +350,10 @@
scanner.getch
end
end
initial_action.token_code.references = references
- build_references(initial_action.token_code)
end
@printers.each do |printer|
scanner = StringScanner.new(printer.code.s_value)
references = []
@@ -367,11 +365,10 @@
scanner.getch
end
end
printer.code.token_code.references = references
- build_references(printer.code.token_code)
end
@error_tokens.each do |error_token|
scanner = StringScanner.new(error_token.code.s_value)
references = []
@@ -383,16 +380,15 @@
scanner.getch
end
end
error_token.code.token_code.references = references
- build_references(error_token.code.token_code)
end
@_rules.each do |lhs, rhs, _|
rhs.each_with_index do |token, index|
- next if token.class == Lrama::Grammar::Symbol || token.type != Lrama::Lexer::Token::User_code
+ next unless token.class == Lrama::Lexer::Token::UserCode
scanner = StringScanner.new(token.s_value)
references = []
while !scanner.eos? do
@@ -405,18 +401,15 @@
scanner.getch
end
end
token.references = references
- token.numberize_references(lhs, rhs)
- build_references(token)
+ numberize_references(lhs, rhs, token.references)
end
end
end
- private
-
def find_nterm_by_id!(id)
nterms.find do |nterm|
nterm.id == id
end || (raise "Nterm not found: #{id}")
end
@@ -426,33 +419,58 @@
# term = add_term(id: Token.new(Token::Ident, "YYEMPTY"), token_id: -2)
# term.number = -2
# @empty_symbol = term
# YYEOF
- term = add_term(id: Token.new(type: Token::Ident, s_value: "YYEOF"), alias_name: "\"end of file\"", token_id: 0)
+ term = add_term(id: Lrama::Lexer::Token::Ident.new(s_value: "YYEOF"), alias_name: "\"end of file\"", token_id: 0)
term.number = 0
term.eof_symbol = true
@eof_symbol = term
# YYerror
- term = add_term(id: Token.new(type: Token::Ident, s_value: "YYerror"), alias_name: "error")
+ term = add_term(id: Lrama::Lexer::Token::Ident.new(s_value: "YYerror"), alias_name: "error")
term.number = 1
term.error_symbol = true
@error_symbol = term
# YYUNDEF
- term = add_term(id: Token.new(type: Token::Ident, s_value: "YYUNDEF"), alias_name: "\"invalid token\"")
+ term = add_term(id: Lrama::Lexer::Token::Ident.new(s_value: "YYUNDEF"), alias_name: "\"invalid token\"")
term.number = 2
term.undef_symbol = true
@undef_symbol = term
# $accept
- term = add_nterm(id: Token.new(type: Token::Ident, s_value: "$accept"))
+ term = add_nterm(id: Lrama::Lexer::Token::Ident.new(s_value: "$accept"))
term.accept_symbol = true
@accept_symbol = term
end
+ def numberize_references(lhs, rhs, references)
+ references.map! {|ref|
+ ref_name = ref.value
+ if ref_name.is_a?(::String) && ref_name != '$'
+ value =
+ if lhs.referred_by?(ref_name)
+ '$'
+ else
+ index = rhs.find_index {|token| token.referred_by?(ref_name) }
+
+ if index
+ index + 1
+ else
+ raise "'#{ref_name}' is invalid name."
+ end
+ end
+
+ ref.value = value
+ ref
+ else
+ ref
+ end
+ }
+ end
+
# 1. Add $accept rule to the top of rules
# 2. Extract precedence and last action
# 3. Extract action in the middle of RHS into new Empty rule
# 4. Append id and extract action then create Rule
#
@@ -491,21 +509,21 @@
# 2. Extract precedence and last action
rhs.reverse.each do |r|
case
when r.is_a?(Symbol) # precedence_sym
precedence_sym = r
- when (r.type == Token::User_code) && precedence_sym.nil? && code.nil? && rhs1.empty?
+ when r.is_a?(Lrama::Lexer::Token::UserCode) && precedence_sym.nil? && code.nil? && rhs1.empty?
code = r
else
rhs1 << r
end
end
rhs1.reverse!
# Bison n'th component is 1-origin
(rhs1 + [code]).compact.each.with_index(1) do |token, i|
- if token.type == Token::User_code
+ if token.is_a?(Lrama::Lexer::Token::UserCode)
token.references.each do |ref|
# Need to keep position_in_rhs for actions in the middle of RHS
ref.position_in_rhs = i - 1
next if ref.type == :at
# $$, $n, @$, @n can be used in any actions
@@ -530,13 +548,13 @@
end
end
end
rhs2 = rhs1.map do |token|
- if token.type == Token::User_code
+ if token.is_a?(Lrama::Lexer::Token::UserCode)
prefix = token.referred ? "@" : "$@"
- new_token = Token.new(type: Token::Ident, s_value: prefix + extracted_action_number.to_s)
+ new_token = Lrama::Lexer::Token::Ident.new(s_value: prefix + extracted_action_number.to_s)
extracted_action_number += 1
a << [new_token, token]
new_token
else
token
@@ -548,27 +566,54 @@
a.each do |new_token, code|
@rules << Rule.new(id: @rules.count, lhs: new_token, rhs: [], code: Code.new(type: :user_code, token_code: code), lineno: code.line)
end
c = code ? Code.new(type: :user_code, token_code: code) : nil
- @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: rhs2, code: c, precedence_sym: precedence_sym, lineno: lineno)
-
+ # Expand Parameterizing rules
+ if rhs2.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) }
+ expand_parameterizing_rules(lhs, rhs2, c, precedence_sym, lineno)
+ else
+ @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: rhs2, code: c, precedence_sym: precedence_sym, lineno: lineno)
+ end
add_nterm(id: lhs)
a.each do |new_token, _|
add_nterm(id: new_token)
end
end
end
+ def expand_parameterizing_rules(lhs, rhs, code, precedence_sym, lineno)
+ token = Lrama::Lexer::Token::Ident.new(s_value: rhs[0].s_value)
+ if rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.option? }
+ option_token = Lrama::Lexer::Token::Ident.new(s_value: "option_#{rhs[0].s_value}")
+ add_term(id: option_token)
+ @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [option_token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ @rules << Rule.new(id: @rules.count, lhs: option_token, rhs: [], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ @rules << Rule.new(id: @rules.count, lhs: option_token, rhs: [token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ elsif rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.nonempty_list? }
+ nonempty_list_token = Lrama::Lexer::Token::Ident.new(s_value: "nonempty_list_#{rhs[0].s_value}")
+ add_term(id: nonempty_list_token)
+ @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [nonempty_list_token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ @rules << Rule.new(id: @rules.count, lhs: nonempty_list_token, rhs: [token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ @rules << Rule.new(id: @rules.count, lhs: nonempty_list_token, rhs: [nonempty_list_token, token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ elsif rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.list? }
+ list_token = Lrama::Lexer::Token::Ident.new(s_value: "list_#{rhs[0].s_value}")
+ add_term(id: list_token)
+ @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [list_token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ @rules << Rule.new(id: @rules.count, lhs: list_token, rhs: [], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ @rules << Rule.new(id: @rules.count, lhs: list_token, rhs: [list_token, token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+ end
+ end
+
# Collect symbols from rules
def collect_symbols
@rules.flat_map(&:rhs).each do |s|
case s
- when Token
- if s.type == Token::Char
- add_term(id: s)
- end
+ when Lrama::Lexer::Token::Char
+ add_term(id: s)
+ when Lrama::Lexer::Token
+ # skip
when Symbol
# skip
else
raise "Unknown class: #{s}"
end
@@ -605,11 +650,11 @@
number += 1
end
# If id is Token::Char, it uses ASCII code
if sym.term? && sym.token_id.nil?
- if sym.id.type == Token::Char
+ if sym.id.is_a?(Lrama::Lexer::Token::Char)
# Ignore ' on the both sides
case sym.id.s_value[1..-2]
when "\\b"
sym.token_id = 8
when "\\f"
@@ -658,21 +703,21 @@
if rule.code
rule.code.references.each do |ref|
next if ref.type == :at
- if ref.referring_symbol.type != Token::User_code
+ if !ref.referring_symbol.is_a?(Lrama::Lexer::Token::UserCode)
ref.referring_symbol = token_to_symbol(ref.referring_symbol)
end
end
end
end
end
def token_to_symbol(token)
case token
- when Token
+ when Lrama::Lexer::Token
find_symbol_by_id!(token)
when Symbol
token
else
raise "Unknown class: #{token}"
@@ -714,14 +759,14 @@
def fill_symbol_printer
@symbols.each do |sym|
@printers.each do |printer|
printer.ident_or_tags.each do |ident_or_tag|
- case ident_or_tag.type
- when Token::Ident
+ case ident_or_tag
+ when Lrama::Lexer::Token::Ident
sym.printer = printer if sym.id == ident_or_tag
- when Token::Tag
+ when Lrama::Lexer::Token::Tag
sym.printer = printer if sym.tag == ident_or_tag
else
raise "Unknown token type. #{printer}"
end
end
@@ -731,13 +776,13 @@
def fill_symbol_error_token
@symbols.each do |sym|
@error_tokens.each do |error_token|
error_token.ident_or_tags.each do |ident_or_tag|
- case ident_or_tag.type
- when Token::Ident
+ case ident_or_tag
+ when Lrama::Lexer::Token::Ident
sym.error_token = error_token if sym.id == ident_or_tag
- when Token::Tag
+ when Lrama::Lexer::Token::Tag
sym.error_token = error_token if sym.tag == ident_or_tag
else
raise "Unknown token type. #{error_token}"
end
end