lib/lrama/grammar.rb in lrama-0.5.8 vs lib/lrama/grammar.rb in lrama-0.5.9

- old
+ new

@@ -1,33 +1,34 @@
 require "strscan"
 require "lrama/grammar/auxiliary"
 require "lrama/grammar/code"
 require "lrama/grammar/error_token"
+require "lrama/grammar/percent_code"
 require "lrama/grammar/precedence"
 require "lrama/grammar/printer"
 require "lrama/grammar/reference"
 require "lrama/grammar/rule"
 require "lrama/grammar/symbol"
 require "lrama/grammar/union"
 require "lrama/lexer"
 require "lrama/type"
 
 module Lrama
-  Token = Lrama::Lexer::Token
-
   # Grammar is the result of parsing an input grammar file
   class Grammar
-    attr_reader :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux
+    attr_reader :percent_codes, :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux
     attr_accessor :union, :expect,
                   :printers, :error_tokens,
                   :lex_param, :parse_param, :initial_action,
                   :symbols, :types,
                   :rules, :_rules,
                   :sym_to_rules
 
     def initialize
+      # Code defined by "%code"
+      @percent_codes = []
       @printers = []
       @error_tokens = []
       @symbols = []
       @types = []
       @_rules = []
@@ -41,10 +42,14 @@
       @aux = Auxiliary.new
 
       append_special_symbols
     end
 
+    def add_percent_code(id:, code:)
+      @percent_codes << PercentCode.new(id, code)
+    end
+
     def add_printer(ident_or_tags:, code:, lineno:)
       @printers << Printer.new(ident_or_tags: ident_or_tags, code: code, lineno: lineno)
     end
 
     def add_error_token(ident_or_tags:, code:, lineno:)
@@ -120,20 +125,11 @@
     def add_rule(lhs:, rhs:, lineno:)
       @_rules << [lhs, rhs, lineno]
     end
 
-    def build_references(token_code)
-      token_code.references.map! do |type, value, tag, first_column, last_column|
-        Reference.new(type: type, value: value, ex_tag: tag, first_column: first_column, last_column: last_column)
-      end
-
-      token_code
-    end
-
     def build_code(type, token_code)
-      build_references(token_code)
       Code.new(type: type, token_code: token_code)
     end
 
     def prologue_first_lineno=(prologue_first_lineno)
       @aux.prologue_first_lineno = prologue_first_lineno
@@ -150,10 +146,11 @@
     def epilogue=(epilogue)
       @aux.epilogue = epilogue
     end
 
     def prepare
+      extract_references
       normalize_rules
       collect_symbols
       replace_token_with_symbol
       fill_symbol_number
       fill_default_precedence
@@ -312,35 +309,37 @@
       start = scanner.pos
 
       case
      # $ references
      # It need to wrap an identifier with brackets to use ".-" for identifiers
      when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?\$/) # $$, $<long>$
-        tag = scanner[1] ? Lrama::Lexer::Token.new(type: Lrama::Lexer::Token::Tag, s_value: scanner[1]) : nil
-        return [:dollar, "$", tag, start, scanner.pos - 1]
+        tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
+        return Reference.new(type: :dollar, value: "$", ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
      when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?(\d+)/) # $1, $2, $<long>1
-        tag = scanner[1] ? Lrama::Lexer::Token.new(type: Lrama::Lexer::Token::Tag, s_value: scanner[1]) : nil
-        return [:dollar, Integer(scanner[2]), tag, start, scanner.pos - 1]
+        tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
+        return Reference.new(type: :dollar, value: Integer(scanner[2]), ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
      when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?([a-zA-Z_][a-zA-Z0-9_]*)/) # $foo, $expr, $<long>program (named reference without brackets)
-        tag = scanner[1] ? Lrama::Lexer::Token.new(type: Lrama::Lexer::Token::Tag, s_value: scanner[1]) : nil
-        return [:dollar, scanner[2], tag, start, scanner.pos - 1]
+        tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
+        return Reference.new(type: :dollar, value: scanner[2], ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
      when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # $expr.right, $expr-right, $<long>program (named reference with brackets)
-        tag = scanner[1] ? Lrama::Lexer::Token.new(type: Lrama::Lexer::Token::Tag, s_value: scanner[1]) : nil
-        return [:dollar, scanner[2], tag, start, scanner.pos - 1]
+        tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil
+        return Reference.new(type: :dollar, value: scanner[2], ex_tag: tag, first_column: start, last_column: scanner.pos - 1)
 
      # @ references
      # It need to wrap an identifier with brackets to use ".-" for identifiers
      when scanner.scan(/@\$/) # @$
-        return [:at, "$", nil, start, scanner.pos - 1]
+        return Reference.new(type: :at, value: "$", first_column: start, last_column: scanner.pos - 1)
      when scanner.scan(/@(\d+)/) # @1
-        return [:at, Integer(scanner[1]), nil, start, scanner.pos - 1]
+        return Reference.new(type: :at, value: Integer(scanner[1]), first_column: start, last_column: scanner.pos - 1)
      when scanner.scan(/@([a-zA-Z][a-zA-Z0-9_]*)/) # @foo, @expr (named reference without brackets)
-        return [:at, scanner[1], nil, start, scanner.pos - 1]
+        return Reference.new(type: :at, value: scanner[1], first_column: start, last_column: scanner.pos - 1)
      when scanner.scan(/@\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # @expr.right, @expr-right (named reference with brackets)
-        return [:at, scanner[1], nil, start, scanner.pos - 1]
+        return Reference.new(type: :at, value: scanner[1], first_column: start, last_column: scanner.pos - 1)
      end
    end
 
+    private
+
    def extract_references
      unless initial_action.nil?
        scanner = StringScanner.new(initial_action.s_value)
        references = []
@@ -351,11 +350,10 @@
            scanner.getch
          end
        end
 
        initial_action.token_code.references = references
-        build_references(initial_action.token_code)
      end
 
      @printers.each do |printer|
        scanner = StringScanner.new(printer.code.s_value)
        references = []
@@ -367,11 +365,10 @@
            scanner.getch
          end
        end
 
        printer.code.token_code.references = references
-        build_references(printer.code.token_code)
      end
 
      @error_tokens.each do |error_token|
        scanner = StringScanner.new(error_token.code.s_value)
        references = []
@@ -383,16 +380,15 @@
            scanner.getch
          end
        end
 
        error_token.code.token_code.references = references
-        build_references(error_token.code.token_code)
      end
 
      @_rules.each do |lhs, rhs, _|
        rhs.each_with_index do |token, index|
-          next if token.class == Lrama::Grammar::Symbol || token.type != Lrama::Lexer::Token::User_code
+          next unless token.class == Lrama::Lexer::Token::UserCode
 
          scanner = StringScanner.new(token.s_value)
          references = []
 
          while !scanner.eos? do
@@ -405,18 +401,15 @@
              scanner.getch
            end
          end
 
          token.references = references
-          token.numberize_references(lhs, rhs)
-          build_references(token)
+          numberize_references(lhs, rhs, token.references)
        end
      end
    end
 
-    private
-
    def find_nterm_by_id!(id)
      nterms.find do |nterm|
        nterm.id == id
      end || (raise "Nterm not found: #{id}")
    end
@@ -426,33 +419,58 @@
      # term = add_term(id: Token.new(Token::Ident, "YYEMPTY"), token_id: -2)
      # term.number = -2
      # @empty_symbol = term
 
      # YYEOF
-      term = add_term(id: Token.new(type: Token::Ident, s_value: "YYEOF"), alias_name: "\"end of file\"", token_id: 0)
+      term = add_term(id: Lrama::Lexer::Token::Ident.new(s_value: "YYEOF"), alias_name: "\"end of file\"", token_id: 0)
      term.number = 0
      term.eof_symbol = true
      @eof_symbol = term
 
      # YYerror
-      term = add_term(id: Token.new(type: Token::Ident, s_value: "YYerror"), alias_name: "error")
+      term = add_term(id: Lrama::Lexer::Token::Ident.new(s_value: "YYerror"), alias_name: "error")
      term.number = 1
      term.error_symbol = true
      @error_symbol = term
 
      # YYUNDEF
-      term = add_term(id: Token.new(type: Token::Ident, s_value: "YYUNDEF"), alias_name: "\"invalid token\"")
+      term = add_term(id: Lrama::Lexer::Token::Ident.new(s_value: "YYUNDEF"), alias_name: "\"invalid token\"")
      term.number = 2
      term.undef_symbol = true
      @undef_symbol = term
 
      # $accept
-      term = add_nterm(id: Token.new(type: Token::Ident, s_value: "$accept"))
+      term = add_nterm(id: Lrama::Lexer::Token::Ident.new(s_value: "$accept"))
      term.accept_symbol = true
      @accept_symbol = term
    end
 
+    def numberize_references(lhs, rhs, references)
+      references.map! {|ref|
+        ref_name = ref.value
+        if ref_name.is_a?(::String) && ref_name != '$'
+          value =
+            if lhs.referred_by?(ref_name)
+              '$'
+            else
+              index = rhs.find_index {|token| token.referred_by?(ref_name) }
+
+              if index
+                index + 1
+              else
+                raise "'#{ref_name}' is invalid name."
+              end
+            end
+
+          ref.value = value
+          ref
+        else
+          ref
+        end
+      }
+    end
+
    # 1. Add $accept rule to the top of rules
    # 2. Extract precedence and last action
    # 3. Extract action in the middle of RHS into new Empty rule
    # 4. Append id and extract action then create Rule
    #
@@ -491,21 +509,21 @@
      # 2. Extract precedence and last action
      rhs.reverse.each do |r|
        case
        when r.is_a?(Symbol) # precedence_sym
          precedence_sym = r
-        when (r.type == Token::User_code) && precedence_sym.nil? && code.nil? && rhs1.empty?
+        when r.is_a?(Lrama::Lexer::Token::UserCode) && precedence_sym.nil? && code.nil? && rhs1.empty?
          code = r
        else
          rhs1 << r
        end
      end
      rhs1.reverse!
 
      # Bison n'th component is 1-origin
      (rhs1 + [code]).compact.each.with_index(1) do |token, i|
-        if token.type == Token::User_code
+        if token.is_a?(Lrama::Lexer::Token::UserCode)
          token.references.each do |ref|
            # Need to keep position_in_rhs for actions in the middle of RHS
            ref.position_in_rhs = i - 1
            next if ref.type == :at
            # $$, $n, @$, @n can be used in any actions
@@ -530,13 +548,13 @@
          end
        end
      end
 
      rhs2 = rhs1.map do |token|
-        if token.type == Token::User_code
+        if token.is_a?(Lrama::Lexer::Token::UserCode)
          prefix = token.referred ? "@" : "$@"
-          new_token = Token.new(type: Token::Ident, s_value: prefix + extracted_action_number.to_s)
+          new_token = Lrama::Lexer::Token::Ident.new(s_value: prefix + extracted_action_number.to_s)
          extracted_action_number += 1
          a << [new_token, token]
          new_token
        else
          token
@@ -548,27 +566,54 @@
      a.each do |new_token, code|
        @rules << Rule.new(id: @rules.count, lhs: new_token, rhs: [], code: Code.new(type: :user_code, token_code: code), lineno: code.line)
      end
 
      c = code ? Code.new(type: :user_code, token_code: code) : nil
-      @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: rhs2, code: c, precedence_sym: precedence_sym, lineno: lineno)
-
+      # Expand Parameterizing rules
+      if rhs2.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) }
+        expand_parameterizing_rules(lhs, rhs2, c, precedence_sym, lineno)
+      else
+        @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: rhs2, code: c, precedence_sym: precedence_sym, lineno: lineno)
+      end
      add_nterm(id: lhs)
      a.each do |new_token, _|
        add_nterm(id: new_token)
      end
    end
  end
 
+    def expand_parameterizing_rules(lhs, rhs, code, precedence_sym, lineno)
+      token = Lrama::Lexer::Token::Ident.new(s_value: rhs[0].s_value)
+      if rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.option? }
+        option_token = Lrama::Lexer::Token::Ident.new(s_value: "option_#{rhs[0].s_value}")
+        add_term(id: option_token)
+        @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [option_token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+        @rules << Rule.new(id: @rules.count, lhs: option_token, rhs: [], code: code, precedence_sym: precedence_sym, lineno: lineno)
+        @rules << Rule.new(id: @rules.count, lhs: option_token, rhs: [token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+      elsif rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.nonempty_list? }
+        nonempty_list_token = Lrama::Lexer::Token::Ident.new(s_value: "nonempty_list_#{rhs[0].s_value}")
+        add_term(id: nonempty_list_token)
+        @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [nonempty_list_token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+        @rules << Rule.new(id: @rules.count, lhs: nonempty_list_token, rhs: [token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+        @rules << Rule.new(id: @rules.count, lhs: nonempty_list_token, rhs: [nonempty_list_token, token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+      elsif rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.list? }
+        list_token = Lrama::Lexer::Token::Ident.new(s_value: "list_#{rhs[0].s_value}")
+        add_term(id: list_token)
+        @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [list_token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+        @rules << Rule.new(id: @rules.count, lhs: list_token, rhs: [], code: code, precedence_sym: precedence_sym, lineno: lineno)
+        @rules << Rule.new(id: @rules.count, lhs: list_token, rhs: [list_token, token], code: code, precedence_sym: precedence_sym, lineno: lineno)
+      end
+    end
+
    # Collect symbols from rules
    def collect_symbols
      @rules.flat_map(&:rhs).each do |s|
        case s
-        when Token
-          if s.type == Token::Char
-            add_term(id: s)
-          end
+        when Lrama::Lexer::Token::Char
+          add_term(id: s)
+        when Lrama::Lexer::Token
+          # skip
        when Symbol
          # skip
        else
          raise "Unknown class: #{s}"
        end
@@ -605,11 +650,11 @@
          number += 1
        end
 
        # If id is Token::Char, it uses ASCII code
        if sym.term? && sym.token_id.nil?
-          if sym.id.type == Token::Char
+          if sym.id.is_a?(Lrama::Lexer::Token::Char)
            # Ignore ' on the both sides
            case sym.id.s_value[1..-2]
            when "\\b"
              sym.token_id = 8
            when "\\f"
@@ -658,21 +703,21 @@
        if rule.code
          rule.code.references.each do |ref|
            next if ref.type == :at
 
-            if ref.referring_symbol.type != Token::User_code
+            if !ref.referring_symbol.is_a?(Lrama::Lexer::Token::UserCode)
              ref.referring_symbol = token_to_symbol(ref.referring_symbol)
            end
          end
        end
      end
    end
 
    def token_to_symbol(token)
      case token
-      when Token
+      when Lrama::Lexer::Token
        find_symbol_by_id!(token)
      when Symbol
        token
      else
        raise "Unknown class: #{token}"
@@ -714,14 +759,14 @@
    def fill_symbol_printer
      @symbols.each do |sym|
        @printers.each do |printer|
          printer.ident_or_tags.each do |ident_or_tag|
-            case ident_or_tag.type
-            when Token::Ident
+            case ident_or_tag
+            when Lrama::Lexer::Token::Ident
              sym.printer = printer if sym.id == ident_or_tag
-            when Token::Tag
+            when Lrama::Lexer::Token::Tag
              sym.printer = printer if sym.tag == ident_or_tag
            else
              raise "Unknown token type. #{printer}"
            end
          end
@@ -731,13 +776,13 @@
    def fill_symbol_error_token
      @symbols.each do |sym|
        @error_tokens.each do |error_token|
          error_token.ident_or_tags.each do |ident_or_tag|
-            case ident_or_tag.type
-            when Token::Ident
+            case ident_or_tag
+            when Lrama::Lexer::Token::Ident
              sym.error_token = error_token if sym.id == ident_or_tag
-            when Token::Tag
+            when Lrama::Lexer::Token::Tag
              sym.error_token = error_token if sym.tag == ident_or_tag
            else
              raise "Unknown token type. #{error_token}"
            end
          end
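Note on the reference changes above: in 0.5.9 scan_reference returns Lrama::Grammar::Reference objects directly, so the old build_references pass that converted positional arrays into Reference instances is deleted. A minimal standalone sketch of the same idea, assuming nothing from lrama itself (Ref is a hypothetical stand-in for Reference, and only the numbered $n / @n forms are handled):

  require "strscan"

  # Ref is a local stand-in for Lrama::Grammar::Reference (hypothetical;
  # only the fields this sketch needs).
  Ref = Struct.new(:type, :value, :first_column, :last_column, keyword_init: true)

  # Collect $n / @n references from a user-code fragment as objects,
  # mirroring how scan_reference now builds Reference instances directly.
  # The real method also handles $$, @$, tagged and named references,
  # and skips /* ... */ comments.
  def scan_refs(code)
    scanner = StringScanner.new(code)
    refs = []
    until scanner.eos?
      start = scanner.pos
      if scanner.scan(/\$(\d+)/)        # $1, $2, ...
        refs << Ref.new(type: :dollar, value: Integer(scanner[1]),
                        first_column: start, last_column: scanner.pos - 1)
      elsif scanner.scan(/@(\d+)/)      # @1, @2, ...
        refs << Ref.new(type: :at, value: Integer(scanner[1]),
                        first_column: start, last_column: scanner.pos - 1)
      else
        scanner.getch                   # not a reference; skip one character
      end
    end
    refs
  end

  p scan_refs("$$ = $1 + $3; @1;")
  # => three Ref structs: $1 at columns 5..6, $3 at 10..11, @1 at 14..15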
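Note on the new expand_parameterizing_rules method: when a right-hand side contains a parameterizing token such as option(X), nonempty_list(X) or list(X), the rule is rewritten into plain rules over a fresh nonterminal. A rough sketch of the option(X) case only, with a local Rule struct standing in for Lrama::Grammar::Rule (hypothetical names, not lrama's API):

  # Rule is a local stand-in for Lrama::Grammar::Rule (hypothetical; real
  # rules also carry id, code, precedence_sym and lineno).
  Rule = Struct.new(:lhs, :rhs)

  # Expand `lhs: option(sym)` into three plain rules over a fresh
  # "option_<sym>" nonterminal, the same shape the option? branch above emits.
  def expand_option(lhs, sym)
    option_nterm = "option_#{sym}"
    [
      Rule.new(lhs, [option_nterm]),  # lhs: option_<sym>
      Rule.new(option_nterm, []),     # option_<sym>: (empty)
      Rule.new(option_nterm, [sym]),  # option_<sym>: sym
    ]
  end

  expand_option("args", "arg").each do |rule|
    rhs = rule.rhs.empty? ? "%empty" : rule.rhs.join(" ")
    puts "#{rule.lhs}: #{rhs}"
  end
  # args: option_arg
  # option_arg: %empty
  # option_arg: arg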