lib/lrama/grammar.rb in lrama-0.5.9 vs lib/lrama/grammar.rb in lrama-0.5.10

- old
+ new

@@ -1,15 +1,15 @@ -require "strscan" - require "lrama/grammar/auxiliary" require "lrama/grammar/code" +require "lrama/grammar/counter" require "lrama/grammar/error_token" require "lrama/grammar/percent_code" require "lrama/grammar/precedence" require "lrama/grammar/printer" require "lrama/grammar/reference" require "lrama/grammar/rule" +require "lrama/grammar/rule_builder" require "lrama/grammar/symbol" require "lrama/grammar/union" require "lrama/lexer" require "lrama/type" @@ -19,21 +19,23 @@ attr_reader :percent_codes, :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux attr_accessor :union, :expect, :printers, :error_tokens, :lex_param, :parse_param, :initial_action, :symbols, :types, - :rules, :_rules, + :rules, :rule_builders, :sym_to_rules - def initialize + def initialize(rule_counter) + @rule_counter = rule_counter + # Code defined by "%code" @percent_codes = [] @printers = [] @error_tokens = [] @symbols = [] @types = [] - @_rules = [] + @rule_builders = [] @rules = [] @sym_to_rules = {} @empty_symbol = nil @eof_symbol = nil @error_symbol = nil @@ -46,16 +48,16 @@ def add_percent_code(id:, code:) @percent_codes << PercentCode.new(id, code) end - def add_printer(ident_or_tags:, code:, lineno:) - @printers << Printer.new(ident_or_tags: ident_or_tags, code: code, lineno: lineno) + def add_printer(ident_or_tags:, token_code:, lineno:) + @printers << Printer.new(ident_or_tags: ident_or_tags, token_code: token_code, lineno: lineno) end - def add_error_token(ident_or_tags:, code:, lineno:) - @error_tokens << ErrorToken.new(ident_or_tags: ident_or_tags, code: code, lineno: lineno) + def add_error_token(ident_or_tags:, token_code:, lineno:) + @error_tokens << ErrorToken.new(ident_or_tags: ident_or_tags, token_code: token_code, lineno: lineno) end def add_term(id:, alias_name: nil, tag: nil, token_id: nil, replace: false) if token_id && (sym = @symbols.find {|s| s.token_id == token_id }) if replace @@ -121,18 +123,14 @@ def set_union(code, lineno) @union = Union.new(code: code, lineno: lineno) end - def add_rule(lhs:, rhs:, lineno:) - @_rules << [lhs, rhs, lineno] + def add_rule_builder(builder) + @rule_builders << builder end - def build_code(type, token_code) - Code.new(type: type, token_code: token_code) - end - def prologue_first_lineno=(prologue_first_lineno) @aux.prologue_first_lineno = prologue_first_lineno end def prologue=(prologue) @@ -146,29 +144,88 @@ def epilogue=(epilogue) @aux.epilogue = epilogue end def prepare - extract_references normalize_rules collect_symbols replace_token_with_symbol fill_symbol_number fill_default_precedence fill_sym_to_rules fill_nterm_type fill_symbol_printer fill_symbol_error_token @symbols.sort_by!(&:number) + compute_nullable + compute_first_set end # TODO: More validation methods + # + # * Validaiton for no_declared_type_reference def validate! validate_symbol_number_uniqueness! - validate_no_declared_type_reference! + validate_symbol_alias_name_uniqueness! + validate_rule_lhs_is_nterm! end + def find_symbol_by_s_value(s_value) + @symbols.find do |sym| + sym.id.s_value == s_value + end + end + + def find_symbol_by_s_value!(s_value) + find_symbol_by_s_value(s_value) || (raise "Symbol not found: #{s_value}") + end + + def find_symbol_by_id(id) + @symbols.find do |sym| + sym.id == id || sym.alias_name == id.s_value + end + end + + def find_symbol_by_id!(id) + find_symbol_by_id(id) || (raise "Symbol not found: #{id}") + end + + def find_symbol_by_number!(number) + sym = @symbols[number] + + raise "Symbol not found: #{number}" unless sym + raise "[BUG] Symbol number mismatch. #{number}, #{sym}" if sym.number != number + + sym + end + + def find_rules_by_symbol!(sym) + find_rules_by_symbol(sym) || (raise "Rules for #{sym} not found") + end + + def find_rules_by_symbol(sym) + @sym_to_rules[sym.number] + end + + def terms_count + terms.count + end + + def terms + @terms ||= @symbols.select(&:term?) + end + + def nterms_count + nterms.count + end + + def nterms + @nterms ||= @symbols.select(&:nterm?) + end + + private + def compute_nullable @rules.each do |rule| case when rule.rhs.empty? rule.nullable = true @@ -249,167 +306,16 @@ find_symbol_by_number!(number) end.to_set end end - def find_symbol_by_s_value(s_value) - @symbols.find do |sym| - sym.id.s_value == s_value + def setup_rules + @rule_builders.each do |builder| + builder.setup_rules end end - def find_symbol_by_s_value!(s_value) - find_symbol_by_s_value(s_value) || (raise "Symbol not found: #{s_value}") - end - - def find_symbol_by_id(id) - @symbols.find do |sym| - # TODO: validate uniqueness of Token#s_value and Symbol#alias_name - sym.id == id || sym.alias_name == id.s_value - end - end - - def find_symbol_by_id!(id) - find_symbol_by_id(id) || (raise "Symbol not found: #{id}") - end - - def find_symbol_by_number!(number) - sym = @symbols[number] - - raise "Symbol not found: #{number}" unless sym - raise "[BUG] Symbol number mismatch. #{number}, #{sym}" if sym.number != number - - sym - end - - def find_rules_by_symbol!(sym) - find_rules_by_symbol(sym) || (raise "Rules for #{sym} not found") - end - - def find_rules_by_symbol(sym) - @sym_to_rules[sym.number] - end - - def terms_count - terms.count - end - - def terms - @terms ||= @symbols.select(&:term?) - end - - def nterms_count - nterms.count - end - - def nterms - @nterms ||= @symbols.select(&:nterm?) - end - - def scan_reference(scanner) - start = scanner.pos - case - # $ references - # It need to wrap an identifier with brackets to use ".-" for identifiers - when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?\$/) # $$, $<long>$ - tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil - return Reference.new(type: :dollar, value: "$", ex_tag: tag, first_column: start, last_column: scanner.pos - 1) - when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?(\d+)/) # $1, $2, $<long>1 - tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil - return Reference.new(type: :dollar, value: Integer(scanner[2]), ex_tag: tag, first_column: start, last_column: scanner.pos - 1) - when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?([a-zA-Z_][a-zA-Z0-9_]*)/) # $foo, $expr, $<long>program (named reference without brackets) - tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil - return Reference.new(type: :dollar, value: scanner[2], ex_tag: tag, first_column: start, last_column: scanner.pos - 1) - when scanner.scan(/\$(<[a-zA-Z0-9_]+>)?\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # $expr.right, $expr-right, $<long>program (named reference with brackets) - tag = scanner[1] ? Lrama::Lexer::Token::Tag.new(s_value: scanner[1]) : nil - return Reference.new(type: :dollar, value: scanner[2], ex_tag: tag, first_column: start, last_column: scanner.pos - 1) - - # @ references - # It need to wrap an identifier with brackets to use ".-" for identifiers - when scanner.scan(/@\$/) # @$ - return Reference.new(type: :at, value: "$", first_column: start, last_column: scanner.pos - 1) - when scanner.scan(/@(\d+)/) # @1 - return Reference.new(type: :at, value: Integer(scanner[1]), first_column: start, last_column: scanner.pos - 1) - when scanner.scan(/@([a-zA-Z][a-zA-Z0-9_]*)/) # @foo, @expr (named reference without brackets) - return Reference.new(type: :at, value: scanner[1], first_column: start, last_column: scanner.pos - 1) - when scanner.scan(/@\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # @expr.right, @expr-right (named reference with brackets) - return Reference.new(type: :at, value: scanner[1], first_column: start, last_column: scanner.pos - 1) - end - end - - private - - def extract_references - unless initial_action.nil? - scanner = StringScanner.new(initial_action.s_value) - references = [] - - while !scanner.eos? do - if reference = scan_reference(scanner) - references << reference - else - scanner.getch - end - end - - initial_action.token_code.references = references - end - - @printers.each do |printer| - scanner = StringScanner.new(printer.code.s_value) - references = [] - - while !scanner.eos? do - if reference = scan_reference(scanner) - references << reference - else - scanner.getch - end - end - - printer.code.token_code.references = references - end - - @error_tokens.each do |error_token| - scanner = StringScanner.new(error_token.code.s_value) - references = [] - - while !scanner.eos? do - if reference = scan_reference(scanner) - references << reference - else - scanner.getch - end - end - - error_token.code.token_code.references = references - end - - @_rules.each do |lhs, rhs, _| - rhs.each_with_index do |token, index| - next unless token.class == Lrama::Lexer::Token::UserCode - - scanner = StringScanner.new(token.s_value) - references = [] - - while !scanner.eos? do - case - when reference = scan_reference(scanner) - references << reference - when scanner.scan(/\/\*/) - scanner.scan_until(/\*\//) - else - scanner.getch - end - end - - token.references = references - numberize_references(lhs, rhs, token.references) - end - end - end - def find_nterm_by_id!(id) nterms.find do |nterm| nterm.id == id end || (raise "Nterm not found: #{id}") end @@ -442,39 +348,13 @@ term = add_nterm(id: Lrama::Lexer::Token::Ident.new(s_value: "$accept")) term.accept_symbol = true @accept_symbol = term end - def numberize_references(lhs, rhs, references) - references.map! {|ref| - ref_name = ref.value - if ref_name.is_a?(::String) && ref_name != '$' - value = - if lhs.referred_by?(ref_name) - '$' - else - index = rhs.find_index {|token| token.referred_by?(ref_name) } - - if index - index + 1 - else - raise "'#{ref_name}' is invalid name." - end - end - - ref.value = value - ref - else - ref - end - } - end - # 1. Add $accept rule to the top of rules - # 2. Extract precedence and last action - # 3. Extract action in the middle of RHS into new Empty rule - # 4. Append id and extract action then create Rule + # 2. Extract action in the middle of RHS into new Empty rule + # 3. Append id and extract action then create Rule # # Bison 3.8.2 uses different orders for symbol number and rule number # when a rule has actions in the middle of a rule. # # For example, @@ -491,131 +371,42 @@ # 164 program # 165 $@1 # def normalize_rules # 1. Add $accept rule to the top of rules - accept = find_symbol_by_s_value!("$accept") - eof = find_symbol_by_number!(0) - lineno = @_rules.first ? @_rules.first[2] : 0 - @rules << Rule.new(id: @rules.count, lhs: accept, rhs: [@_rules.first[0], eof], code: nil, lineno: lineno) + accept = @accept_symbol + eof = @eof_symbol + lineno = @rule_builders.first ? @rule_builders.first.line : 0 + @rules << Rule.new(id: @rule_counter.increment, _lhs: accept.id, _rhs: [@rule_builders.first.lhs, eof.id], token_code: nil, lineno: lineno) - extracted_action_number = 1 # @n as nterm + setup_rules - @_rules.each do |lhs, rhs, lineno| - a = [] - rhs1 = [] - code = nil - precedence_sym = nil - - # 2. Extract precedence and last action - rhs.reverse.each do |r| - case - when r.is_a?(Symbol) # precedence_sym - precedence_sym = r - when r.is_a?(Lrama::Lexer::Token::UserCode) && precedence_sym.nil? && code.nil? && rhs1.empty? - code = r - else - rhs1 << r - end + @rule_builders.each do |builder| + # Extract actions in the middle of RHS into new rules. + builder.midrule_action_rules.each do |rule| + @rules << rule end - rhs1.reverse! - # Bison n'th component is 1-origin - (rhs1 + [code]).compact.each.with_index(1) do |token, i| - if token.is_a?(Lrama::Lexer::Token::UserCode) - token.references.each do |ref| - # Need to keep position_in_rhs for actions in the middle of RHS - ref.position_in_rhs = i - 1 - next if ref.type == :at - # $$, $n, @$, @n can be used in any actions - - if ref.value == "$" - # TODO: Should be postponed after middle actions are extracted? - ref.referring_symbol = lhs - elsif ref.value.is_a?(Integer) - raise "Can not refer following component. #{ref.value} >= #{i}. #{token}" if ref.value >= i - rhs1[ref.value - 1].referred = true - ref.referring_symbol = rhs1[ref.value - 1] - elsif ref.value.is_a?(String) - target_tokens = ([lhs] + rhs1 + [code]).compact.first(i) - referring_symbol_candidate = target_tokens.filter {|token| token.referred_by?(ref.value) } - raise "Referring symbol `#{ref.value}` is duplicated. #{token}" if referring_symbol_candidate.size >= 2 - raise "Referring symbol `#{ref.value}` is not found. #{token}" if referring_symbol_candidate.count == 0 - - referring_symbol = referring_symbol_candidate.first - referring_symbol.referred = true - ref.referring_symbol = referring_symbol - end - end - end + builder.rules.each do |rule| + add_nterm(id: rule._lhs) + @rules << rule end - rhs2 = rhs1.map do |token| - if token.is_a?(Lrama::Lexer::Token::UserCode) - prefix = token.referred ? "@" : "$@" - new_token = Lrama::Lexer::Token::Ident.new(s_value: prefix + extracted_action_number.to_s) - extracted_action_number += 1 - a << [new_token, token] - new_token - else - token - end + builder.midrule_action_rules.each do |rule| + add_nterm(id: rule._lhs) end - - # Extract actions in the middle of RHS - # into new rules. - a.each do |new_token, code| - @rules << Rule.new(id: @rules.count, lhs: new_token, rhs: [], code: Code.new(type: :user_code, token_code: code), lineno: code.line) - end - - c = code ? Code.new(type: :user_code, token_code: code) : nil - # Expand Parameterizing rules - if rhs2.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) } - expand_parameterizing_rules(lhs, rhs2, c, precedence_sym, lineno) - else - @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: rhs2, code: c, precedence_sym: precedence_sym, lineno: lineno) - end - add_nterm(id: lhs) - a.each do |new_token, _| - add_nterm(id: new_token) - end end end - def expand_parameterizing_rules(lhs, rhs, code, precedence_sym, lineno) - token = Lrama::Lexer::Token::Ident.new(s_value: rhs[0].s_value) - if rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.option? } - option_token = Lrama::Lexer::Token::Ident.new(s_value: "option_#{rhs[0].s_value}") - add_term(id: option_token) - @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [option_token], code: code, precedence_sym: precedence_sym, lineno: lineno) - @rules << Rule.new(id: @rules.count, lhs: option_token, rhs: [], code: code, precedence_sym: precedence_sym, lineno: lineno) - @rules << Rule.new(id: @rules.count, lhs: option_token, rhs: [token], code: code, precedence_sym: precedence_sym, lineno: lineno) - elsif rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.nonempty_list? } - nonempty_list_token = Lrama::Lexer::Token::Ident.new(s_value: "nonempty_list_#{rhs[0].s_value}") - add_term(id: nonempty_list_token) - @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [nonempty_list_token], code: code, precedence_sym: precedence_sym, lineno: lineno) - @rules << Rule.new(id: @rules.count, lhs: nonempty_list_token, rhs: [token], code: code, precedence_sym: precedence_sym, lineno: lineno) - @rules << Rule.new(id: @rules.count, lhs: nonempty_list_token, rhs: [nonempty_list_token, token], code: code, precedence_sym: precedence_sym, lineno: lineno) - elsif rhs.any? {|r| r.is_a?(Lrama::Lexer::Token::Parameterizing) && r.list? } - list_token = Lrama::Lexer::Token::Ident.new(s_value: "list_#{rhs[0].s_value}") - add_term(id: list_token) - @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: [list_token], code: code, precedence_sym: precedence_sym, lineno: lineno) - @rules << Rule.new(id: @rules.count, lhs: list_token, rhs: [], code: code, precedence_sym: precedence_sym, lineno: lineno) - @rules << Rule.new(id: @rules.count, lhs: list_token, rhs: [list_token, token], code: code, precedence_sym: precedence_sym, lineno: lineno) - end - end - # Collect symbols from rules def collect_symbols - @rules.flat_map(&:rhs).each do |s| + @rules.flat_map(&:_rhs).each do |s| case s when Lrama::Lexer::Token::Char add_term(id: s) when Lrama::Lexer::Token # skip - when Symbol - # skip else raise "Unknown class: #{s}" end end end @@ -693,34 +484,22 @@ end end def replace_token_with_symbol @rules.each do |rule| - rule.lhs = token_to_symbol(rule.lhs) + rule.lhs = token_to_symbol(rule._lhs) if rule._lhs - rule.rhs.map! do |t| + rule.rhs = rule._rhs.map do |t| token_to_symbol(t) end - - if rule.code - rule.code.references.each do |ref| - next if ref.type == :at - - if !ref.referring_symbol.is_a?(Lrama::Lexer::Token::UserCode) - ref.referring_symbol = token_to_symbol(ref.referring_symbol) - end - end - end end end def token_to_symbol(token) case token when Lrama::Lexer::Token find_symbol_by_id!(token) - when Symbol - token else raise "Unknown class: #{token}" end end @@ -799,20 +578,26 @@ return if invalid.empty? raise "Symbol number is duplicated. #{invalid}" end - def validate_no_declared_type_reference! + def validate_symbol_alias_name_uniqueness! + invalid = @symbols.select(&:alias_name).group_by(&:alias_name).select do |alias_name, syms| + syms.count > 1 + end + + return if invalid.empty? + + raise "Symbol alias name is duplicated. #{invalid}" + end + + def validate_rule_lhs_is_nterm! errors = [] rules.each do |rule| - next unless rule.code + next if rule.lhs.nterm? - rule.code.references.select do |ref| - ref.type == :dollar && !ref.tag - end.each do |ref| - errors << "$#{ref.value} of '#{rule.lhs.id.s_value}' has no declared type" - end + errors << "[BUG] LHS of #{rule} (line: #{rule.lineno}) is term. It should be nterm." end return if errors.empty? raise errors.join("\n")