require_relative 'abnf/core'
require_relative 'abnf/meta'
require 'logger'

# ABNF parser
# Parses ABNF into an array of {EBNF::Rule}.
module EBNF
  class ABNF
    include EBNF::PEG::Parser

    # Regular expressions for both "Core" and ABNF-specific terminals.
    ALPHA   = %r{[\x41-\x5A\x61-\x7A]}
    VCHAR   = %r{[\x20-\x7E]}
    WSP     = %r{[\x20\x09]}
    CRLF    = %r{\x0D?\x0A}
    COMMENT = %r{;(?:#{WSP}|#{VCHAR})*#{CRLF}}
    C_NL    = %r{#{COMMENT}|#{CRLF}}
    C_WSP   = %r{#{WSP}|(?:#{C_NL}#{WSP})}

    ##
    # Hash of generated {EBNF::Rule} objects by symbol
    #
    # @return [Hash{Symbol => EBNF::Rule}]
    attr_reader :parsed_rules

    ##
    # The following ABNF grammar rules are treated as terminals.

    # `rulename ::= ALPHA (ALPHA | DIGIT | "-")*`
    terminal(:rulename, /#{ALPHA}(?:#{ALPHA}|[0-9-])*/) do |value|
      value.to_sym
    end

    # `defined_as ::= c_wsp* ("=" | "=/") c_wsp*`
    #
    # Surrounding whitespace/comments are stripped so the value is "=" or "=/".
    terminal(:defined_as, /#{C_WSP}*=\/?#{C_WSP}*/) { |value| value.strip }

    # `quoted_string::= DQUOTE [#x20-#x21#x23-#x7E]* DQUOTE`
    #
    # The value is the string content without the enclosing double quotes.
    terminal(:quoted_string, /"[\x20-\x21\x23-\x7E]*"/) do |value|
      value[1..-2]
    end

    # `bin_val ::= "b" BIT+ (("." BIT+)+ | ("-" BIT+))?`
    terminal(:bin_val, /b[01]+(?:(?:(?:\.[01]+)+)|(?:-[01]+))?/) do |value|
      digits = value[1..-1] # drop the leading "b"
      if value.include?('.')
        # Interpret segments in binary creating a sequence of hex characters or a string
        hex_or_string(digits.split('.').map { |b| b.to_i(2).chr(Encoding::UTF_8) })
      elsif value.include?('-')
        # Interpret as a range
        [:range, digits.split('-').map { |b| "#x%x" % b.to_i(2) }.join("-")]
      else
        # Interpret as a single HEX character
        [:hex, "#x%x" % digits.to_i(2)]
      end
    end

    # `dec_val ::= "d" DIGIT+ (("." DIGIT+)+ | ("-" DIGIT+))?`
    terminal(:dec_val, /d[0-9]+(?:(?:(?:\.[0-9]+)+)|(?:-[0-9]+))?/) do |value|
      digits = value[1..-1] # drop the leading "d"
      if value.include?('.')
        # Interpret segments in decimal creating a sequence of hex characters or a string
        hex_or_string(digits.split('.').map { |d| d.to_i.chr(Encoding::UTF_8) })
      elsif value.include?('-')
        # Interpret as a range
        [:range, digits.split('-').map { |d| "#x%x" % d.to_i }.join("-")]
      else
        # Interpret as a single HEX character
        [:hex, "#x%x" % digits.to_i]
      end
    end

    # `hex_val ::= "x" HEXDIG+ (("." HEXDIG+)+ | ("-" HEXDIG+))?`
    terminal(:hex_val, /x[0-9A-F]+(?:(?:(?:\.[0-9A-F]+)+)|(?:-[0-9A-F]+))?/i) do |value|
      digits = value[1..-1] # drop the leading "x"
      if value.include?('.')
        # Interpret segments in hexadecimal creating a sequence of hex characters or a string
        hex_or_string(digits.split('.').map { |h| h.to_i(16).chr(Encoding::UTF_8) })
      elsif value.include?('-')
        # Interpret as a range
        [:range, digits.split('-').map { |h| "#x%x" % h.to_i(16) }.join("-")]
      else
        # Interpret as a single HEX character; the digits are passed through as written.
        [:hex, "#x#{digits}"]
      end
    end

    # `c_wsp ::= WSP | (c_nl WSP)`
    terminal(:c_wsp, C_WSP)

    # `c_nl ::= comment | CRLF`
    terminal(:c_nl, C_NL)

    # `DIGIT ::= [#x30-#x39]`
    terminal(:DIGIT, /\d/)

    # ## Non-terminal productions

    # The `start_production` on `:rule` allows the parser to present the value as a single Hash, rather than an array of individual hashes.
    start_production(:rule, as_hash: true)

    # `rule ::= rulename defined_as elements c_nl`
    #
    # Records the rule in `parsed_rules`. A rule defined with "=/" is appended as
    # further alternatives of an already recorded rule; a rule defined with "="
    # must not already exist.
    production(:rule) do |value|
      sym = value[:rulename]
      elements = value[:elements]

      if value[:defined_as] == "=/"
        # append to rule alternate
        rule = parsed_rules.fetch(sym) { raise "No existing rule found for #{sym}" }
        # Make sure the existing expression is an :alt so alternatives can be appended.
        rule.expr = [:alt, rule.expr] unless rule.alt?
        if elements.is_a?(Array) && elements.first == :alt
          # append alternatives to rule
          rule.expr.concat(elements[1..-1])
        else
          # add elements as last alternative
          rule.expr.push(elements)
        end
      else
        # There shouldn't be an existing rule
        raise "Redefining rule #{sym}" if parsed_rules.key?(sym)
        parsed_rules[sym] = EBNF::Rule.new(sym.to_sym, nil, elements)
      end

      progress(:rule, level: 2) { parsed_rules[sym].to_sxp }
      sym
    end

    # `elements ::= alternation c_wsp*`
    production(:elements) do |value|
      value.first[:alternation]
    end

    # `alternation ::= concatenation (c_wsp* "/" c_wsp* concatenation)*`
    #
    # Yields the lone concatenation when there are no further alternatives,
    # otherwise an `[:alt, ...]` expression.
    production(:alternation) do |value|
      alternatives = value.last[:_alternation_1]
      if alternatives.empty?
        value.first[:concatenation]
      else
        [:alt, value.first[:concatenation]] + alternatives
      end
    end

    # The `_alternation_2` rule comes from the expanded PEG grammar and serves as an opportunity to customize the values presented to the `alternation` rule.
    #
    # The concatenation is passed through unchanged. (An earlier form of this
    # block computed a flattened variant for nested `:alt` values and then
    # discarded it; that dead code has been removed without changing the result.)
    production(:_alternation_2) do |value|
      value.last[:concatenation]
    end

    # `concatenation::= repetition (c_wsp+ repetition)*`
    #
    # Yields the lone repetition when nothing follows it, otherwise a
    # `[:seq, ...]` expression.
    production(:concatenation) do |value|
      following = value.last[:_concatenation_1]
      if following.empty?
        value.first[:repetition]
      else
        [:seq, value.first[:repetition]] + following
      end
    end

    start_production(:_concatenation_2, as_hash: true)
    production(:_concatenation_2) do |value|
      value[:repetition]
    end

    # `repetition ::= repeat? element`
    #
    # Maps `*element` to `:star`, `1*element` to `:plus`, a bare element to
    # itself, and any other repeat to `[:rept, min, max, element]`.
    production(:repetition) do |value|
      rept = value.first[:_repetition_1]
      elt = value.last[:element]
      case rept
      when [0, '*'] then [:star, elt]
      when [1, '*'] then [:plus, elt]
      when nil then elt
      else
        [:rept, rept.first, rept.last, elt]
      end
    end

    # `repeat ::= DIGIT+ | (DIGIT* "*" DIGIT*)`
    #
    # Normalizes to a two-element `[min, max]` pair; a single Integer means an
    # exact count.
    production(:repeat) do |value|
      if value.is_a?(Integer)
        [value, value]
      else
        [value.first, value.last]
      end
    end

    start_production(:_repeat_1, as_hash: true)
    production(:_repeat_1) { |value| value.values }
    production(:_repeat_2) { |value| value.join("").to_i }
    production(:_repeat_3) { |value| value.join("").to_i }
    # No digits after "*" is represented by the string '*'.
    production(:_repeat_4) { |value| value.empty? ? '*' : value.join("").to_i }

    # `element ::= rulename | group | option | char_val | num_val | prose_val`
    production(:element) do |value|
      value
    end

    # `group ::= "(" c_wsp* alternation c_wsp* ")"`
    start_production(:group, as_hash: true)
    production(:group) do |value|
      value[:alternation]
    end

    # `option ::= "[" c_wsp* alternation c_wsp* "]"`
    start_production(:option, as_hash: true)
    production(:option) do |value|
      [:opt, value[:alternation]]
    end

    # `case_insensitive_string ::= "%i"? quoted_string`
    production(:case_insensitive_string) do |value|
      str = value.last[:quoted_string]
      if str.match?(/[[:alpha:]]/)
        # Only need to use case-insensitive if there are alphabetic characters in the string.
        [:istr, str.tap { |s| s.quote_style = :dquote }]
      else
        str.tap { |s| s.quote_style = :dquote }
      end
    end

    # `case_sensitive_string ::= "%s" quoted_string`
    production(:case_sensitive_string) do |value|
      value.last[:quoted_string].tap { |s| s.quote_style = :squote }
    end

    # `num_val ::= "%" (bin_val | dec_val | hex_val)`
    production(:num_val) do |value|
      value.last[:_num_val_1]
    end

    # ## Parser invocation.

    ##
    # Reads the input and parses it, starting at `:rulelist`, collecting the
    # resulting rules in {#parsed_rules}.
    #
    # @param  [#read, #to_s] input
    # @param  [Hash{Symbol => Object}] options
    #   Remaining options are passed through to `parse`.
    # @option options [Integer] :level
    #   Trace level. 0(debug), 1(info), 2(warn), 3(error).
    #   When present, a `Logger` writing to STDERR is created and passed as `:logger`.
    # @raise [SyntaxError] if the underlying PEG parser raises `EBNF::PEG::Parser::Error`
    def initialize(input, **options)
      # If the `level` option is set, instantiate a logger for collecting trace information.
      if options.key?(:level)
        options[:logger] = Logger.new(STDERR)
        options[:logger].level = options[:level]
        options[:logger].formatter = lambda { |severity, datetime, progname, msg| "#{severity} #{msg}\n" }
      end

      # Read input, if necessary, which will be used in a Scanner.
      @input = input.respond_to?(:read) ? input.read : input.to_s

      @parsed_rules = {}

      # Parses into `@parsed_rules`
      parse(@input,
            :rulelist,        # Starting rule
            ABNFMeta::RULES,  # PEG rules
            whitespace: '',   # No implicit whitespace
            **options)
    rescue EBNF::PEG::Parser::Error => e
      raise SyntaxError, e.message
    end

    ##
    # The AST includes the parsed rules along with built-in rules for ABNF used within the parsed grammar.
    #
    # @return [Array<EBNF::Rule>]
    def ast
      # Add built-in rules for standard ABNF rules that are referenced by the
      # parsed rules but not defined among them.
      parsed_rules.values.map(&:symbols).flatten.uniq.each do |sym|
        rule = ABNFCore::RULES.find { |r| r.sym == sym }
        parsed_rules[sym] ||= rule if rule
      end

      parsed_rules.values
    end

    private

    # Generate a combination of seq and string to represent a sequence of characters
    #
    # Runs of characters matching VCHAR are collected into strings; every other
    # character becomes a `[:hex, "#x.."]` entry.
    #
    # @param [Array<String>] characters
    # @return [String,Array] a plain String when the whole input is one run of
    #   VCHAR characters, otherwise a `[:seq, ...]` array
    def hex_or_string(characters)
      seq = [:seq]
      str_result = ""

      characters.each do |c|
        if VCHAR.match?(c)
          str_result << c
        else
          # Flush the pending printable run before emitting the hex entry.
          unless str_result.empty?
            seq << str_result
            str_result = ""
          end
          seq << [:hex, "#x%x" % c.codepoints.first]
        end
      end
      seq << str_result unless str_result.empty?

      # Either return the sequence, or a string
      if seq.length == 2 && seq.last.is_a?(String)
        seq.last
      else
        seq
      end
    end
  end
end