lib/ebnf/parser.rb in ebnf-2.0.0 vs lib/ebnf/parser.rb in ebnf-2.1.0

- old
+ new

@@ -1,322 +1,305 @@ +require_relative 'ebnf/meta' +require 'logger' + module EBNF - module Parser - ## - # Iterate over rule strings. - # a line that starts with '\[' or '@' starts a new rule + class Parser + include EBNF::PEG::Parser + include EBNF::Terminals + + # Abstract syntax tree from parse # - # @param [StringScanner] scanner - # @yield rule_string - # @yieldparam [String] rule_string - def eachRule(scanner) - cur_lineno = 1 - r = '' - until scanner.eos? - case - when s = scanner.scan(%r(\s+)m) - # Eat whitespace - cur_lineno += s.count("\n") - #debug("eachRule(ws)") { "[#{cur_lineno}] #{s.inspect}" } - when s = scanner.scan(%r(/\*([^\*]|\*[^\/])*\*/)m) - # Eat comments /* .. */ - cur_lineno += s.count("\n") - debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" } - when s = scanner.scan(%r(\(\*([^\*]|\*[^\)])*\*\))m) - # Eat comments (* .. *) - cur_lineno += s.count("\n") - debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" } - when s = scanner.scan(%r((#(?!x)|//).*$)) - # Eat comments // & # - cur_lineno += s.count("\n") - debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" } - when s = scanner.scan(/\A["']/) - # Found a quote, scan until end of matching quote - s += scanner.scan_until(/#{scanner.matched}|$/) - r += s - when s = scanner.scan(%r(^@terminals)) - #debug("eachRule(@terminals)") { "[#{cur_lineno}] #{s.inspect}" } - yield(r) unless r.empty? - @lineno = cur_lineno - yield(s) - r = '' - when s = scanner.scan(/@pass/) - # Found rule start, if we've already collected a rule, yield it - #debug("eachRule(@pass)") { "[#{cur_lineno}] #{s.inspect}" } - yield r unless r.empty? - @lineno = cur_lineno - r = s - when s = scanner.scan(/(?:\[[\w\.]+\])\s*[\w\.]+\s*::=/) - # Found rule start, if we've already collected a rule, yield it - yield r unless r.empty? - #debug("eachRule(rule)") { "[#{cur_lineno}] #{s.inspect}" } - @lineno = cur_lineno - r = s - else - # Collect until end of line, or start of comment or quote - s = scanner.scan_until(%r{(?:[/\(]\*)|#(?!x)|//|["']|$}) - if scanner.matched.length > 0 - # Back up scan head before ending match - scanner.pos = scanner.pos - scanner.matched.length + # @return [Array<EBNF::Rule>] + attr_reader :ast - # Remove matched from end of string - s = s[0..-(scanner.matched.length+1)] - end - cur_lineno += s.count("\n") - #debug("eachRule(rest)") { "[#{cur_lineno}] #{s.inspect}" } - r += s - end - end - yield r unless r.empty? + # ## Terminals + # Define rules for Terminals, placing results on the input stack, making them available to upstream non-Terminal rules. + # + # Terminals are defined with a symbol matching the associated rule name, and an optional (although strongly encouraged) regular expression used to match the head of the input stream. + # + # The result of the terminal block is the semantic value of that terminal, which if often a string, but may be any instance which reflects the semantic interpretation of that terminal. + # + # The `value` parameter is the value matched by the regexp, if defined, or by the sub-terminal rules otherwise. + # + # The `prod` parameter is the name of the parent rule for which this terminal is matched, which may have a bearing in some circumstances, although not used in this example. + # + # If no block is provided, then the value which would have been passed to the block is used as the result directly. + + # Match the Left hand side of a rule or terminal + # + # [11] LHS ::= ('[' SYMBOL+ ']' ' '+)? SYMBOL ' '* '::=' + terminal(:LHS, LHS) do |value, prod| + value.to_s.scan(/(?:\[([^\]]+)\])?\s*(\w+)\s*::=/).first end - - ## - # Parse a rule into an optional rule number, a symbol and an expression + + # Match `SYMBOL` terminal # - # @param [String] rule - # @return [Rule] - def ruleParts(rule) - num_sym, expr = rule.split('::=', 2).map(&:strip) - num, sym = num_sym.split(']', 2).map(&:strip) - num, sym = "", num if sym.nil? - num = num[1..-1] - r = Rule.new(sym && sym.to_sym, num, expression(expr).first, ebnf: self) - debug("ruleParts") { r.inspect } - r + # [12] SYMBOL ::= ([a-z] | [A-Z] | [0-9] | '_' | '.')+ + terminal(:SYMBOL, SYMBOL) do |value| + value.to_sym end - ## - # Parse a string into an expression tree and a remaining string + # Match `HEX` terminal # - # @example - # >>> expression("a b c") - # ((seq a b c) '') - # - # >>> expression("a? b+ c*") - # ((seq (opt a) (plus b) (star c)) '') - # - # >>> expression(" | x xlist") - # ((alt (seq) (seq x xlist)) '') - # - # >>> expression("a | (b - c)") - # ((alt a (diff b c)) '') - # - # >>> expression("a b | c d") - # ((alt (seq a b) (seq c d)) '') - # - # >>> expression("a | b | c") - # ((alt a b c) '') - # - # >>> expression("a) b c") - # (a ' b c') - # - # >>> expression("BaseDecl? PrefixDecl*") - # ((seq (opt BaseDecl) (star PrefixDecl)) '') - # - # >>> expression("NCCHAR1 | diff | [0-9] | #x00B7 | [#x0300-#x036F] | \[#x203F-#x2040\]") - # ((alt NCCHAR1 diff - # (range '0-9') - # (hex '#x00B7') - # (range '#x0300-#x036F') - # (range, '#x203F-#x2040')) '') - # - # @param [String] s - # @return [Array] - def expression(s) - debug("expression") {"(#{s.inspect})"} - e, s = depth {alt(s)} - debug {"=> alt returned #{[e, s].inspect}"} - unless s.to_s.empty? - t, ss = depth {terminal(s)} - debug {"=> terminal returned #{[t, ss].inspect}"} - return [e, ss] if t.is_a?(Array) && t.first == :")" - end - [e, s] + # [13] HEX ::= #x' ([a-f] | [A-F] | [0-9])+ + terminal(:HEX, HEX) do |value| + [:hex, value] end - - ## - # Parse alt - # >>> alt("a | b | c") - # ((alt a b c) '') - # @param [String] s - # @return [Array] - def alt(s) - debug("alt") {"(#{s.inspect})"} - args = [] - while !s.to_s.empty? - e, s = depth {seq(s)} - debug {"=> seq returned #{[e, s].inspect}"} - if e.to_s.empty? - break unless args.empty? - e = [:seq, []] # empty sequence - end - args << e - unless s.to_s.empty? - t, ss = depth {terminal(s)} - break unless t[0] == :alt - s = ss - end - end - args.length > 1 ? [args.unshift(:alt), s] : [e, s] + + # Terminal for `RANGE` is matched as part of a `primary` rule. + # + # [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' - LHS + terminal(:RANGE, RANGE) do |value| + [:range, value[1..-2]] end - - ## - # parse seq + + # Terminal for `O_RANGE` is matched as part of a `primary` rule. # - # >>> seq("a b c") - # ((seq a b c) '') - # - # >>> seq("a b? c") - # ((seq a (opt b) c) '') - def seq(s) - debug("seq") {"(#{s.inspect})"} - args = [] - while !s.to_s.empty? - e, ss = depth {diff(s)} - debug {"=> diff returned #{[e, ss].inspect}"} - unless e.to_s.empty? - args << e - s = ss - else - break; - end - end - if args.length > 1 - [args.unshift(:seq), s] - elsif args.length == 1 - args + [s] + # [15] O_RANGE ::= '[^' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' + terminal(:O_RANGE, O_RANGE) do |value| + [:range, value[1..-2]] + end + + # Match double quote string + # + # [16] STRING1 ::= '"' (CHAR - '"')* '"' + terminal(:STRING1, STRING1) do |value| + value[1..-2] + end + + # Match single quote string + # + # [17] STRING2 ::= "'" (CHAR - "'")* "'" + terminal(:STRING2, STRING2) do |value| + value[1..-2] + end + + # The `CHAR` and `R_CHAR` productions are not used explicitly + + # Match `POSTFIX` terminal + # + # [20] POSTFIX ::= [?*+] + terminal(:POSTFIX, POSTFIX) + + # The `PASS` productions is not used explicitly + + # ## Non-terminal productions + # Define productions for non-Termainals. This can include `start_production` as well as `production` to hook into rule start and end. In some cases, we need to use sub-productions as generated when turning EBNF into PEG. + # + # Productions are defined with a symbol matching the associated rule name. + # + # The result of the productions is typically the abstract syntax tree matched by the rule, so far, but could be a specific semantic value, or could be ignored with the result being returned via the `callback`. + # + # The `value` parameter is the result returned from child productions + # + # The `data` parameter other data which may be returned by child productions placing information onto their input (unused in this example). + # + # The `callback` parameter provides access to a callback defined in the call to `parse`). + + # Production for end of `declaration` non-terminal. + # + # Look for `@terminals` to change parser state to parsing terminals. + # + # Clears the packrat parser when called. + # + # `@pass` is ignored here. + # + # [2] declaration ::= '@terminals' | pass + production(:declaration, clear_packrat: true) do |value, data, callback| + # value contains a declaration. + # Invoke callback + callback.call(:terminals) if value == '@terminals' + nil + end + + # Production for end of `rule` non-terminal. + # + # By setting `as_hash: true` in the `start_production`, the `value` parameter will be in the form `{LHS: "v", expression: "v"}`. Otherwise, it would be expressed using an array of hashes of the form `[{LHS: "v"}, {expression: "v"}]`. + # + # Clears the packrat parser when called. + # + # Create rule from expression value and pass to callback + # + # [3] rule ::= LHS expression + start_production(:rule, as_hash: true) + production(:rule, clear_packrat: true) do |value, data, callback| + # value contains an expression. + # Invoke callback + id, sym = value[:LHS] + expression = value[:expression] + callback.call(:rule, EBNF::Rule.new(sym.to_sym, id, expression)) + nil + end + + # Production for end of `expression` non-terminal. + # Passes through the optimized value of the alt production as follows: + # + # The `value` parameter, is of the form `[{alt: "v"}]`. + # + # [:alt foo] => foo + # [:alt foo bar] => [:alt foo bar] + # + # [4] expression ::= alt + production(:expression) do |value| + value.first[:alt] + end + + # Production for end of `alt` non-terminal. + # Passes through the optimized value of the seq production as follows: + # + # The `value` parameter, is of the form `{seq: "v", _alt_1: "v"}`. + # + # [:seq foo] => foo + # [:seq foo bar] => [:seq foo bar] + # + # Note that this also may just pass through from `_alt_1` + # + # [5] alt ::= seq ('|' seq)* + start_production(:alt, as_hash: true) + production(:alt) do |value| + if value[:_alt_1].length > 0 + [:alt, value[:seq]] + value[:_alt_1] else - ["", s] + value[:seq] end end - - ## - # parse diff - # - # >>> diff("a - b") - # ((diff a b) '') - def diff(s) - debug("diff") {"(#{s.inspect})"} - e1, s = depth {postfix(s)} - debug {"=> postfix returned #{[e1, s].inspect}"} - unless e1.to_s.empty? - unless s.to_s.empty? - t, ss = depth {terminal(s)} - debug {"diff #{[t, ss].inspect}"} - if t.is_a?(Array) && t.first == :diff - s = ss - e2, s = primary(s) - unless e2.to_s.empty? - return [[:diff, e1, e2], s] - else - error("diff", "Syntax Error") - raise "Syntax Error" - end - end - end - end - [e1, s] + + # Production for end of `_alt_1` non-terminal. + # Used to collect the `('|' seq)*` portion of the `alt` non-terminal: + # + # The `value` parameter, is of the form `[{seq: ["v"]}]`. + # + # [5] _alt_1 ::= ('|' seq)* + production(:_alt_1) do |value| + value.map {|a1| a1.last[:seq]}.compact # Get rid of '|' end - - ## - # parse postfix - # - # >>> postfix("a b c") - # (a ' b c') - # - # >>> postfix("a? b c") - # ((opt, a) ' b c') - def postfix(s) - debug("postfix") {"(#{s.inspect})"} - e, s = depth {primary(s)} - debug {"=> primary returned #{[e, s].inspect}"} - return ["", s] if e.to_s.empty? - if !s.to_s.empty? - t, ss = depth {terminal(s)} - debug {"=> #{[t, ss].inspect}"} - if t.is_a?(Array) && [:opt, :star, :plus].include?(t.first) - return [[t.first, e], ss] - end - end - [e, s] + + # Production for end of `seq` non-terminal. + # Passes through the optimized value of the `diff` production as follows: + # + # The `value` parameter, is an array of values, which cannot be empty. + # + # [:diff foo] => foo + # [:diff foo bar] => [:diff foo bar] + # + # Note that this also may just pass through from `_seq_1` + # + # [6] seq ::= diff+ + production(:seq) do |value| + value.length == 1 ? value.first : ([:seq] + value) end - ## - # parse primary - # - # >>> primary("a b c") - # (a ' b c') - def primary(s) - debug("primary") {"(#{s.inspect})"} - t, s = depth {terminal(s)} - debug {"=> terminal returned #{[t, s].inspect}"} - if t.is_a?(Symbol) || t.is_a?(String) - [t, s] - elsif %w(range hex).map(&:to_sym).include?(t.first) - [t, s] - elsif t.first == :"(" - e, s = depth {expression(s)} - debug {"=> expression returned #{[e, s].inspect}"} - [e, s] + # `Diff` production returns concatenated postfix values + # + # The `value` parameter, is of the form `{postfix: "v", _diff_1: "v"}`. + # + # [7] diff ::= postfix ('-' postfix)? + start_production(:diff, as_hash: true) + production(:diff) do |value| + if value[:_diff_1] + [:diff, value[:postfix], value[:_diff_1]] else - ["", s] + value[:postfix] end end - - ## - # parse one terminal; return the terminal and the remaining string - # - # A terminal is represented as a tuple whose 1st item gives the type; - # some types have additional info in the tuple. - # - # @example - # >>> terminal("'abc' def") - # ('abc' ' def') - # - # >>> terminal("[0-9]") - # ((range '0-9') '') - # >>> terminal("#x00B7") - # ((hex '#x00B7') '') - # >>> terminal ("\[#x0300-#x036F\]") - # ((range '#x0300-#x036F') '') - # >>> terminal("\[^<>'{}|^`\]-\[#x00-#x20\]") - # ((range "^<>'{}|^`") '-\[#x00-#x20\]') - def terminal(s) - s = s.strip - #STDERR.puts s.inspect - case m = s[0,1] - when '"', "'" # STRING1 or STRING2 - l, s = s[1..-1].split(m.rstrip, 2) - [LL1::Lexer.unescape_string(l), s] - when '[' # RANGE, O_RANGE - l, s = s[1..-1].split(/(?<=[^\\])\]/, 2) - [[:range, LL1::Lexer.unescape_string(l)], s] - when '#' # HEX - s.match(/(#x\h+)(.*)$/) - l, s = $1, $2 - [[:hex, l], s] - when /[\w\.]/ # SYMBOL - s.match(/([\w\.]+)(.*)$/) - l, s = $1, $2 - [l.to_sym, s] - when '@' # @pass or @terminals - s.match(/@(#\w+)(.*)$/) - l, s = $1, $2 - [[:"@", l], s] - when '-' - [[:diff], s[1..-1]] - when '?' - [[:opt], s[1..-1]] - when '|' - [[:alt], s[1..-1]] - when '+' - [[:plus], s[1..-1]] - when '*' - [[:star], s[1..-1]] - when /[\(\)]/ # '(' or ')' - [[m.to_sym], s[1..-1]] - else - error("terminal", "unrecognized terminal: #{s.inspect}") - raise "Syntax Error, unrecognized terminal: #{s.inspect}" + + production(:_diff_1) do |value| + value.last[:postfix] if value + end + + # Production for end of `postfix` non-terminal. + # Either returns the `primary` production value, or as modified by the `postfix`. + # + # The `value` parameter, is of the form `{primary: "v", _postfix_1: "v"}`. + # + # [:primary] => [:primary] + # [:primary, '*'] => [:star, :primary] + # [:primary, '+'] => [:plus, :primary] + # [:primary, '?'] => [:opt, :primary] + # + # [8] postfix ::= primary POSTFIX? + start_production(:postfix, as_hash: true) + production(:postfix) do |value| + # Push result onto input stack, as the `diff` production can have some number of `postfix` values that are applied recursively + case value[:_postfix_1] + when "*" then [:star, value[:primary]] + when "+" then [:plus, value[:primary]] + when "?" then [:opt, value[:primary]] + else value[:primary] end + end + + # Production for end of `primary` non-terminal. + # Places `:primary` on the stack + # + # The `value` parameter, is either a string (for a terminal) or an array of the form `['(': '(', expression: "v", ')', ')']`. + # + # This may either be a terminal, or the result of an `expression`. + # + # [9] primary ::= HEX + # | SYMBOL + # | RANGE + # | O_RANGE + # | STRING1 + # | STRING2 + # | '(' expression ')' + production(:primary) do |value| + Array(value).length > 2 ? value[1][:expression] : value + end + + # Production for end of pass non-terminal. + # + # [10] pass ::= '@pass' expression + production(:pass) do |value, data, callback| + # Invoke callback + callback.call(:pass, value.last[:expression]) + end + + # ## Parser invocation. + # On start, yield ourselves if a block is given, otherwise, return this parser instance + # + # @param [#read, #to_s] input + # @param [Hash{Symbol => Object}] options + # @option options [Boolean] :level + # Trace level. 0(debug), 1(info), 2(warn), 3(error). + # @return [EBNFParser] + def initialize(input, **options, &block) + # If the `level` option is set, instantiate a logger for collecting trace information. + if options.has_key?(:level) + options[:logger] = Logger.new(STDERR) + options[:logger].level = options[:level] + options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"} + end + + # Read input, if necessary, which will be used in a Scanner. + @input = input.respond_to?(:read) ? input.read : input.to_s + + parsing_terminals = false + @ast = [] + parse(@input, :ebnf, EBNFMeta::RULES, + # Use an optimized Regexp for whitespace + whitespace: EBNF::Terminals::PASS, + **options + ) do |context, *data| + rule = case context + when :terminals + # After parsing `@terminals` + # This changes the state of the parser to treat subsequent rules as terminals. + parsing_terminals = true + rule = EBNF::Rule.new(nil, nil, data.first, kind: :terminals) + when :pass + # After parsing `@pass` + # This defines a specific rule for whitespace. + rule = EBNF::Rule.new(nil, nil, data.first, kind: :pass) + when :rule + # A rule which has already been turned into a `Rule` object. + rule = data.first + rule.kind = :terminal if parsing_terminals + rule + end + @ast << rule if rule + end + rescue EBNF::PEG::Parser::Error => e + raise SyntaxError, e.message end end end \ No newline at end of file