require 'strscan' # Extended Bakus-Nour Form (EBNF), being the W3C variation is # originaly defined in the # [W3C XML 1.0 Spec](http://www.w3.org/TR/REC-xml/#sec-notation). # # This version attempts to be less strict than the strict definition # to allow for coloquial variations (such as in the Turtle syntax). # # A rule takes the following form: # \[1\] symbol ::= expression # # Comments include the content between '/*' and '*/' # # @see http://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py # @see http://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3 # # Based on bnf2turtle by Dan Connolly. # # Motivation # ---------- # # Many specifications include grammars that look formal but are not # actually checked, by machine, against test data sets. Debugging the # grammar in the XML specification has been a long, tedious manual # process. Only when the loop is closed between a fully formal grammar # and a large test data set can we be confident that we have an accurate # specification of a language (and even then, only the syntax of the language). # # # The grammar in the [N3 design note][] has evolved based on the original # manual transcription into a python recursive-descent parser and # subsequent development of test cases. Rather than maintain the grammar # and the parser independently, our [goal] is to formalize the language # syntax sufficiently to replace the manual implementation with one # derived mechanically from the specification. # # # [N3 design note]: http://www.w3.org/DesignIssues/Notation3 # # Related Work # ------------ # # Sean Palmer's [n3p announcement][] demonstrated the feasibility of the # approach, though that work did not cover some aspects of N3. # # In development of the [SPARQL specification][], Eric Prud'hommeaux # developed [Yacker][], which converts EBNF syntax to perl and C and C++ # yacc grammars. It includes an interactive facility for checking # strings against the resulting grammars. # Yosi Scharf used it in [cwm Release 1.1.0rc1][], which includes # a SPAQRL parser that is *almost* completely mechanically generated. # # The N3/turtle output from yacker is lower level than the EBNF notation # from the XML specification; it has the ?, +, and * operators compiled # down to pure context-free rules, obscuring the grammar # structure. Since that transformation is straightforwardly expressed in # semantic web rules (see [bnf-rules.n3][]), it seems best to keep the RDF # expression of the grammar in terms of the higher level EBNF # constructs. # # [goal]: http://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs # [n3p announcement]: http://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html # [Yacker]: http://www.w3.org/1999/02/26-modules/User/Yacker # [SPARQL specification]: http://www.w3.org/TR/rdf-sparql-query/ # [Cwm Release 1.1.0rc1]: http://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html # [bnf-rules.n3]: http://www.w3.org/2000/10/swap/grammar/bnf-rules.n3 # # Open Issues and Future Work # --------------------------- # # The yacker output also has the terminals compiled to elaborate regular # expressions. The best strategy for dealing with lexical tokens is not # yet clear. Many tokens in SPARQL are case insensitive; this is not yet # captured formally. # # The schema for the EBNF vocabulary used here (``g:seq``, ``g:alt``, ...) # is not yet published; it should be aligned with [swap/grammar/bnf][] # and the [bnf2html.n3][] rules (and/or the style of linked XHTML grammar # in the SPARQL and XML specificiations). # # It would be interesting to corroborate the claim in the SPARQL spec # that the grammar is LL(1) with a mechanical proof based on N3 rules. # # [swap/grammar/bnf]: http://www.w3.org/2000/10/swap/grammar/bnf # [bnf2html.n3]: http://www.w3.org/2000/10/swap/grammar/bnf2html.n3 # # Background # ---------- # # The [N3 Primer] by Tim Berners-Lee introduces RDF and the Semantic # web using N3, a teaching and scribbling language. Turtle is a subset # of N3 that maps directly to (and from) the standard XML syntax for # RDF. # # [N3 Primer]: http://www.w3.org/2000/10/swap/Primer.html # # @author Gregg Kellogg class EBNF class Rule # Operations which are flattened to seprate rules in to_bnf BNF_OPS = %w{ seq alt diff opt star plus }.map(&:to_sym).freeze # @!attribute [rw] sym for rule # @return [Symbol] attr_accessor :sym # @!attribute [rw] id of rule # @return [String] attr_accessor :id # @!attribute [rw] kind of rule # @return [:rule, :terminal, or :pass] attr_accessor :kind # @!attribute [rw] expr rule expression # @return [Array] attr_accessor :expr # @!attribute [rw] orig original rule # @return [String] attr_accessor :orig # @!attribute [r] first terminals that immediately procede this rule # @return [Array] attr_reader :first # @!attribute [r] follow terminals that immediately follow this rule # @return [Array] attr_reader :follow # @!attribute [rw] start indicates that this is a starting rule # @return [Boolean] attr_accessor :start # @param [Integer] id # @param [Symbol] sym # @param [Array] expr # @param [EBNF] ebnf # @param [Hash{Symbol => Object}] option # @option options [Symbol] :kind # @option options [String] :ebnf def initialize(sym, id, expr, options = {}) @sym, @id = sym, id @expr = expr.is_a?(Array) ? expr : [:seq, expr] @ebnf = options[:ebnf] @kind = case when options[:kind] then options[:kind] when sym.to_s == sym.to_s.upcase then :terminal when !BNF_OPS.include?(@expr.first) then :terminal else :rule end end # Serializes this rule to an S-Expression # @return [String] def to_sxp elements = [sym, id, [:kind, kind]] elements << [:start, true] if start elements << first.sort_by(&:to_s).unshift(:first) if first elements << follow.sort_by(&:to_s).unshift(:follow) if follow elements << expr elements.to_sxp end def to_s; to_sxp; end # Serializes this rule to an Turtle # @return [String] def to_ttl @ebnf.debug("to_ttl") {inspect} comment = orig.strip. gsub(/"""/, '\"\"\"'). gsub("\\", "\\\\"). sub(/^\"/, '\"'). sub(/\"$/m, '\"') statements = [ %{:#{id} rdfs:label "#{id}"; rdf:value "#{sym}";}, %{ rdfs:comment #{comment.inspect};}, ] statements += ttl_expr(expr, kind == :terminal ? "re" : "g", 1, false) "\n" + statements.join("\n") end ## # Transform EBNF rule to BNF rules: # # * Transform (a [n] rule (op1 (op2))) into two rules: # (a [n] rule (op1 a.2)) # (_a_1 [n.1] rule (op2)) # * Transform (a rule (opt b)) into (a rule (alt _empty "foo")) # * Transform (a rule (star b)) into (a rule (alt _empty (seq b a))) # * Transform (a rule (plus b)) into (a rule (seq b (star b) # @return [Array] def to_bnf new_rules = [] return [self] unless kind == :rule # Look for rules containing recursive definition and rewrite to multiple rules. If `expr` contains elements which are in array form, where the first element of that array is a symbol, create a new rule for it. if expr.any? {|e| e.is_a?(Array) && BNF_OPS.include?(e.first)} # * Transform (a [n] rule (op1 (op2))) into two rules: # (a.1 [n.1] rule (op1 a.2)) # (a.2 [n.2] rule (op2)) # duplicate ourselves for rewriting this = dup rule_seq = 1 new_rules << this expr.each_with_index do |e, index| next unless e.is_a?(Array) && e.first.is_a?(Symbol) new_sym, new_id = "_#{sym}_#{rule_seq}".to_sym, "#{id}.#{rule_seq}" rule_seq += 1 this.expr[index] = new_sym new_rule = Rule.new(new_sym, new_id, e, :ebnf => @ebnf) new_rules << new_rule end # Return new rules after recursively applying #to_bnf new_rules = new_rules.map {|r| r.to_bnf}.flatten elsif expr.first == :opt # * Transform (a rule (opt b)) into (a rule (alt _empty "foo")) new_rules = Rule.new(sym, id, [:alt, :_empty, expr.last], :ebnf => @ebnf).to_bnf elsif expr.first == :star # * Transform (a rule (star b)) into (a rule (alt _empty (seq b a))) new_rules = [Rule.new(sym, id, [:alt, :_empty, "_#{sym}_star".to_sym], :ebnf => @ebnf)] + Rule.new("_#{sym}_star".to_sym, "#{id}*", [:seq, expr.last, sym], :ebnf => @ebnf).to_bnf elsif expr.first == :plus # * Transform (a rule (plus b)) into (a rule (seq b (star b) new_rules = Rule.new(sym, id, [:seq, expr.last, [:star, expr.last]], :ebnf => @ebnf).to_bnf else # Otherwise, no further transformation necessary new_rules << self end return new_rules end # Does this rule start with a sym? It does if expr is that sym, # expr starts with alt and contains that sym, or # expr starts with seq and the next element is that sym # @param [Symbol, class] sym # Symbol matching any start element, or if it is String, any start element which is a String # @return [Array] list of symbol (singular), or strings which are start symbol, or nil if there are none def starts_with(sym) if seq? && sym === (v = expr.fetch(1, nil)) [v] elsif alt? && expr.any? {|e| sym === e} expr.select {|e| sym === e} else nil end end # Add terminal as proceding this rule # @param [Array] terminals # @return [Integer] if number of terminals added def add_first(terminals) @first ||= [] terminals -= @first # Remove those already in first @first += terminals terminals.length end # Add terminal as following this rule. Don't add _eps as a follow # # @param [Array] terminals # @return [Integer] if number of terminals added def add_follow(terminals) terminals -= @follow || [] # Remove those already in first terminals -= [:_eps] # Special case, don't add empty string as a follow terminal unless terminals.empty? @follow ||= [] @follow += terminals end terminals.length end # Is this rule of the form (seq ...)? def seq? expr.is_a?(Array) && expr.first == :seq end # Is this rule of the form (alt ...)? def alt? expr.is_a?(Array) && expr.first == :alt end def inspect "# sym, :id => id, :kind => kind, :expr => expr}.inspect + ">" end # Two rules are equal if they have the same {#sym}, {#kind} and {#expr} # @param [Rule] other # @return [Boolean] def ==(other) sym == other.sym && kind == other.kind && expr == other.expr end # Two rules are equivalent if they have the same {#expr} # @param [Rule] other # @return [Boolean] def equivalent?(other) expr == other.expr end # Rewrite the rule substituting src_rule for dst_rule wherever # it is used in the production (first level only). # @param [Rule] src_rule # @param [Rule] dst_rule # @return [Rule] def rewrite(src_rule, dst_rule) case @expr when Array @expr = @expr.map {|e| e == src_rule.sym ? dst_rule.sym : e} else @expr = dst_rule.sym if @expr == src_rule.sym end self end # Rules compare using their ids def <=>(other) if id.to_i == other.id.to_i id <=> other.id else id.to_i <=> other.id.to_i end end private def ttl_expr(expr, pfx, depth, is_obj = true) indent = ' ' * depth @ebnf.debug("ttl_expr", :depth => depth) {expr.inspect} op = expr.shift if expr.is_a?(Array) statements = [] if is_obj bra, ket = "[ ", " ]" else bra = ket = '' end case op when :seq, :alt, :diff statements << %{#{indent}#{bra}#{pfx}:#{op} (} expr.each {|a| statements += ttl_expr(a, pfx, depth + 1)} statements << %{#{indent} )#{ket}} when :opt, :plus, :star statements << %{#{indent}#{bra}#{pfx}:#{op} } statements += ttl_expr(expr.first, pfx, depth + 1) statements << %{#{indent} #{ket}} unless ket.empty? when :_empty, :_eps, :_empty statements << %{#{indent}"g:#{op.to_s[1..-1]}"} when :"'" statements << %{#{indent}"#{esc(expr)}"} when :range statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}} when :hex raise "didn't expect \" in expr" if expr.include?(:'"') statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}} else if is_obj statements << %{#{indent}#{expr.inspect}} else statements << %{#{indent}g:seq ( #{expr.inspect} )} end end statements.last << " ." unless is_obj @ebnf.debug("statements", :depth => depth) {statements.join("\n")} statements end ## # turn an XML BNF character class into an N3 literal for that # character class (less the outer quote marks) # # >>> cclass("^<>'{}|^`") # "[^<>'{}|^`]" # >>> cclass("#x0300-#x036F") # "[\\u0300-\\u036F]" # >>> cclass("#xC0-#xD6") # "[\\u00C0-\\u00D6]" # >>> cclass("#x370-#x37D") # "[\\u0370-\\u037D]" # # as in: ECHAR ::= '\' [tbnrf\"'] # >>> cclass("tbnrf\\\"'") # 'tbnrf\\\\\\"\'' # # >>> cclass("^#x22#x5C#x0A#x0D") # '^\\u0022\\\\\\u005C\\u000A\\u000D' def cclass(txt) '[' + txt.gsub(/\#x[0-9a-fA-F]+/) do |hx| hx = hx[2..-1] if hx.length <= 4 "\\u#{'0' * (4 - hx.length)}#{hx}" elsif hx.length <= 8 "\\U#{'0' * (8 - hx.length)}#{hx}" end end + ']' end end # Abstract syntax tree from parse attr_reader :ast # Parse the string or file input generating an abstract syntax tree # in S-Expressions (similar to SPARQL SSE) # # @param [#read, #to_s] input # @param [Hash{Symbol => Object}] options # @option options [Boolean, Array] :debug # Output debug information to an array or STDOUT. def initialize(input, options = {}) @options = options @lineno, @depth = 1, 0 terminal = false @ast = [] input = input.respond_to?(:read) ? input.read : input.to_s scanner = StringScanner.new(input) eachRule(scanner) do |r| debug("rule string") {r.inspect} case r when /^@terminals/ # Switch mode to parsing terminals terminal = true when /^@pass\s*(.*)$/m rule = depth {ruleParts("[0] " + r)} rule.kind = :pass rule.orig = r @ast << rule else rule = depth {ruleParts(r)} rule.kind = :terminal if terminal # Override after we've parsed @terminals rule.orig = r @ast << rule end end end ## # Transform EBNF Rule set to BNF: # # * Add rule [0] (_empty rule (seq)) # * Transform each rule into a set of rules that are just BNF, using {Rule#to_bnf}. # @return [ENBF] self def make_bnf new_ast = [Rule.new(:_empty, "0", [:seq], :kind => :rule)] ast.each do |rule| debug("make_bnf") {"expand from: #{rule.inspect}"} new_rules = rule.to_bnf debug(" => ") {new_rules.map(&:sym).join(', ')} new_ast += new_rules end # Consolodate equivalent terminal rules to_rewrite = {} new_ast.select {|r| r.kind == :terminal}.each do |src_rule| new_ast.select {|r| r.kind == :terminal}.each do |dst_rule| if src_rule.equivalent?(dst_rule) && src_rule != dst_rule debug("make_bnf") {"equivalent rules: #{src_rule.inspect} and #{dst_rule.inspect}"} (to_rewrite[src_rule] ||= []) << dst_rule end end end # Replace references to equivalent rules with canonical rule to_rewrite.each do |src_rule, dst_rules| dst_rules.each do |dst_rule| new_ast.each do |mod_rule| debug("make_bnf") {"rewrite #{mod_rule.inspect} from #{dst_rule.sym} to #{src_rule.sym}"} mod_rule.rewrite(dst_rule, src_rule) end end end # AST now has just rewritten rules compacted_ast = new_ast - to_rewrite.values.flatten.compact # Sort AST by number @ast = compacted_ast self end # Iterate over each rule or terminal # @param [:termina, :rule] kind # @yield rule # @yieldparam [Rule] rule def each(kind, &block) ast.each {|r| block.call(r) if r.kind == kind} end ## # Create first/follow for each rule using techniques defined for LL(1) parsers. # # @return [EBNF] self # @see http://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table # @param [Array] starts # Set of symbols which are start rules def first_follow(starts) # Add _eof to follow all start rules starts.map(&:to_sym).each do |sym| rule = ast.detect {|r| r.sym == sym} raise "No rule found for start symbol #{sym}" unless rule rule.add_follow([:_eof]) rule.start = true end # Comprehnsion rule, create shorter versions of all non-terminal sequences comprehensions = [] begin comprehensions = [] ast.select {|r| r.seq? && r.kind == :rule && r.expr.length > 2}.each do |rule| new_expr = rule.expr[2..-1].unshift(:seq) unless ast.any? {|r| r.expr == new_expr} debug("first_follow") {"add comprehension rule for #{rule.sym} => #{new_expr.inspect}"} new_rule = Rule.new("_#{rule.sym}_comp".to_sym, "#{rule.id}.comp", new_expr) comprehensions << new_rule end end @ast += comprehensions debug("first_follow") {"comprehensions #{comprehensions.length}"} end while !comprehensions.empty? # Fi(a w' ) = { a } for every terminal a # For each rule who's expr's first element of a seq a terminal, or having any element of alt a terminal, add that terminal to the first set for this rule each(:rule) do |rule| each(:terminal) do |terminal| rule.add_first([terminal.sym]) if rule.starts_with(terminal.sym) end # Add strings to first for strings which are start elements start_strs = rule.starts_with(String) rule.add_first(start_strs) if start_strs end # # Fi(ε) = { ε } # Add _eps as a first of _empty empty = ast.detect {|r| r.sym == :_empty} empty.add_first([:_eps]) # Loop until no more first elements are added firsts, follows = 0, 0 begin firsts, follows = 0, 0 each(:rule) do |rule| each(:rule) do |first_rule| next if first_rule == rule || first_rule.first.nil? # Fi(A w' ) = Fi(A) for every nonterminal A with ε not in Fi(A) # For each rule that starts with another rule having firsts, add the firsts of that rule to this rule, unless it already has those terminals in its first if rule.starts_with(first_rule.sym) depth {debug("FF.1") {"add first #{first_rule.first.inspect} to #{rule.sym}"}} firsts += rule.add_first(first_rule.first) end # Fi(A w' ) = Fi(A) \ { ε } ∪ Fi(w' ) for every nonterminal A with ε in Fi(A) # For each rule starting with eps, add the terminals for the comprehension of this rule if rule.seq? && rule.expr.fetch(1, nil) == first_rule && first_rule.first.include?(:_eps) && (comp = find_comp(rule)) depth {debug("FF.2") {"add first #{first_rule.first.inspect} to #{comp.sym}"}} firsts += comp.add_first(first_rule.first) end end # Only run these rules if the rule is a sequence having two or more elements, whos first element is also a sequence and first_rule is the comprehension of rule if rule.seq? && (comp = find_comp(rule)) #if there is a rule of the form Aj → wAiw' , then # if (ai = find_rule(rule.expr[1])) && ai.kind == :rule && comp.first # * if the terminal a is in Fi(w' ), then add a to Fo(Ai) # # Add follow terminals based on the first terminals # of a comprehension of this rule (having the same # sequence other than the first rule in the sequence) # # @example # rule: (seq a b c) # first_rule: (seq b c) # if first_rule.first == [T] # => a.follow += [T] depth {debug("FF.3") {"add follow #{comp.first.inspect} to #{ai.sym}"}} follows += ai.add_follow(comp.first) end # Follows of a rule are also follows of the comprehension of the rule. if rule.follow depth {debug("FF.4") {"add follow #{rule.follow.inspect} to #{comp.sym}"}} follows += comp.add_follow(rule.follow) end # * if ε is in Fi(w' ), then add Fo(Aj) to Fo(Ai) # # If the comprehension of a sequence has an _eps first, then the follows of the rule also become the follows of the first member of the rule if comp.first && comp.first.include?(:_eps) && rule.first member = find_rule(rule.expr.fetch(1, nil)) depth {debug("FF.4") {"add follow #{rule.follow.inspect} to #{member.sym}"}} follows += member.add_follow(rule.first) if member.kind == :rule end end # Follows of a rule are also follows of the last production in the rule if rule.seq? && rule.follow && (member = find_rule(rule.expr.last)) && member.kind == :rule depth {debug("FF.5") {"add follow #{rule.follow.inspect} to #{member.sym}"}} follows += member.add_follow(rule.follow) end # For alts, anything that follows the rule follows each member of the rule if rule.alt? && rule.follow rule.expr[1..-1].map {|s| find_rule(s)}.each do |mem| if mem && mem.kind == :rule depth {debug("FF.6") {"add follow #{rule.first.inspect} to #{mem.sym}"}} follows += mem.add_follow(rule.follow) end end end end debug("first_follow") {"firsts #{firsts}, follows #{follows}"} end while (firsts + follows) > 0 end ## # Write out parsed syntax string as an S-Expression # @return [String] def to_sxp begin require 'sxp' SXP::Generator.string(ast.sort) rescue LoadError ast.to_sxp end end def to_s; to_sxp; end def dup new_obj = super new_obj.instance_variable_set(:@ast, @ast.dup) new_obj end ## # Find a rule given a symbol # @param [Symbol] sym # @return [Rule] def find_rule(sym) (@find ||= {})[sym] ||= ast.detect {|r| r.sym == sym} end ## # Find the comprehension of a rule # Comprehensions are created in {#first_follow} then the rule is a sequence with more than 1 element. They are named automatically as "_sym_comp" where "sym" is the symbol of the source rule # @param [Rule] source # @return [Rule] def find_comp(source) (@comp ||= {})[source.sym] ||= source.seq? && source.expr.length > 2 && find_rule("_#{source.sym}_comp".to_sym) end ## # Write out syntax tree as Turtle # @param [String] prefix for language # @param [String] ns URI for language # @return [String] def to_ttl(prefix, ns) unless ast.empty? [ "@prefix rdf: .", "@prefix rdfs: .", "@prefix #{prefix}: <#{ns}>.", "@prefix : <#{ns}>.", "@prefix re: .", "@prefix g: .", "", ":language rdfs:isDefinedBy <>; g:start :#{ast.first.id}.", "", ] end.join("\n") + ast.sort. select {|a| [:rule, :terminal].include?(a.kind)}. map(&:to_ttl). join("\n") end ## # Iterate over rule strings. # a line that starts with '\[' or '@' starts a new rule # # @param [StringScanner] scanner # @yield rule_string # @yieldparam [String] rule_string def eachRule(scanner) cur_lineno = 1 r = '' until scanner.eos? case when s = scanner.scan(%r(\s+)m) # Eat whitespace cur_lineno += s.count("\n") #debug("eachRule(ws)") { "[#{cur_lineno}] #{s.inspect}" } when s = scanner.scan(%r(/\*([^\*]|\*[^\/])*\*/)m) # Eat comments cur_lineno += s.count("\n") debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" } when s = scanner.scan(%r(^@terminals)) #debug("eachRule(@terminals)") { "[#{cur_lineno}] #{s.inspect}" } yield(r) unless r.empty? @lineno = cur_lineno yield(s) r = '' when s = scanner.scan(/@pass/) # Found rule start, if we've already collected a rule, yield it #debug("eachRule(@pass)") { "[#{cur_lineno}] #{s.inspect}" } yield r unless r.empty? @lineno = cur_lineno r = s when s = scanner.scan(/\[(?=\w+\])/) # Found rule start, if we've already collected a rule, yield it yield r unless r.empty? #debug("eachRule(rule)") { "[#{cur_lineno}] #{s.inspect}" } @lineno = cur_lineno r = s else # Collect until end of line, or start of comment s = scanner.scan_until(%r((?:/\*)|$)m) cur_lineno += s.count("\n") #debug("eachRule(rest)") { "[#{cur_lineno}] #{s.inspect}" } r += s end end yield r unless r.empty? end ## # Parse a rule into a rule number, a symbol and an expression # # @param [String] rule # @return [Rule] def ruleParts(rule) num_sym, expr = rule.split('::=', 2).map(&:strip) num, sym = num_sym.split(']', 2).map(&:strip) num = num[1..-1] r = Rule.new(sym && sym.to_sym, num, ebnf(expr).first, :ebnf => self) debug("ruleParts") { r.inspect } r end ## # Parse a string into an expression tree and a remaining string # # @example # >>> ebnf("a b c") # ((seq, \[('id', 'a'), ('id', 'b'), ('id', 'c')\]), '') # # >>> ebnf("a? b+ c*") # ((seq, \[(opt, ('id', 'a')), (plus, ('id', 'b')), ('*', ('id', 'c'))\]), '') # # >>> ebnf(" | x xlist") # ((alt, \[(seq, \[\]), (seq, \[('id', 'x'), ('id', 'xlist')\])\]), '') # # >>> ebnf("a | (b - c)") # ((alt, \[('id', 'a'), (diff, \[('id', 'b'), ('id', 'c')\])\]), '') # # >>> ebnf("a b | c d") # ((alt, \[(seq, \[('id', 'a'), ('id', 'b')\]), (seq, \[('id', 'c'), ('id', 'd')\])\]), '') # # >>> ebnf("a | b | c") # ((alt, \[('id', 'a'), ('id', 'b'), ('id', 'c')\]), '') # # >>> ebnf("a) b c") # (('id', 'a'), ' b c') # # >>> ebnf("BaseDecl? PrefixDecl*") # ((seq, \[(opt, ('id', 'BaseDecl')), ('*', ('id', 'PrefixDecl'))\]), '') # # >>> ebnf("NCCHAR1 | diff | [0-9] | #x00B7 | [#x0300-#x036F] | \[#x203F-#x2040\]") # ((alt, \[('id', 'NCCHAR1'), ("'", diff), (range, '0-9'), (hex, '#x00B7'), (range, '#x0300-#x036F'), (range, '#x203F-#x2040')\]), '') # # @param [String] s # @return [Array] def ebnf(s) debug("ebnf") {"(#{s.inspect})"} e, s = depth {alt(s)} debug {"=> alt returned #{[e, s].inspect}"} unless s.empty? t, ss = depth {terminal(s)} debug {"=> terminal returned #{[t, ss].inspect}"} return [e, ss] if t.is_a?(Array) && t.first == :")" end [e, s] end ## # Parse alt # >>> alt("a | b | c") # ((alt, \[('id', 'a'), ('id', 'b'), ('id', 'c')\]), '') # @param [String] s # @return [Array] def alt(s) debug("alt") {"(#{s.inspect})"} args = [] while !s.empty? e, s = depth {seq(s)} debug {"=> seq returned #{[e, s].inspect}"} if e.to_s.empty? break unless args.empty? e = [:seq, []] # empty sequence end args << e unless s.empty? t, ss = depth {terminal(s)} break unless t[0] == :alt s = ss end end args.length > 1 ? [args.unshift(:alt), s] : [e, s] end ## # parse seq # # >>> seq("a b c") # ((seq, \[('id', 'a'), ('id', 'b'), ('id', 'c')\]), '') # # >>> seq("a b? c") # ((seq, \[('id', 'a'), (opt, ('id', 'b')), ('id', 'c')\]), '') def seq(s) debug("seq") {"(#{s.inspect})"} args = [] while !s.empty? e, ss = depth {diff(s)} debug {"=> diff returned #{[e, ss].inspect}"} unless e.to_s.empty? args << e s = ss else break; end end if args.length > 1 [args.unshift(:seq), s] elsif args.length == 1 args + [s] else ["", s] end end ## # parse diff # # >>> diff("a - b") # ((diff, \[('id', 'a'), ('id', 'b')\]), '') def diff(s) debug("diff") {"(#{s.inspect})"} e1, s = depth {postfix(s)} debug {"=> postfix returned #{[e1, s].inspect}"} unless e1.to_s.empty? unless s.empty? t, ss = depth {terminal(s)} debug {"diff #{[t, ss].inspect}"} if t.is_a?(Array) && t.first == :diff s = ss e2, s = primary(s) unless e2.to_s.empty? return [[:diff, e1, e2], s] else raise "Syntax Error" end end end end [e1, s] end ## # parse postfix # # >>> postfix("a b c") # (('id', 'a'), ' b c') # # >>> postfix("a? b c") # ((opt, ('id', 'a')), ' b c') def postfix(s) debug("postfix") {"(#{s.inspect})"} e, s = depth {primary(s)} debug {"=> primary returned #{[e, s].inspect}"} return ["", s] if e.to_s.empty? if !s.empty? t, ss = depth {terminal(s)} debug {"=> #{[t, ss].inspect}"} if t.is_a?(Array) && [:opt, :star, :plus].include?(t.first) return [[t.first, e], ss] end end [e, s] end ## # parse primary # # >>> primary("a b c") # (('id', 'a'), ' b c') def primary(s) debug("primary") {"(#{s.inspect})"} t, s = depth {terminal(s)} debug {"=> terminal returned #{[t, s].inspect}"} if t.is_a?(Symbol) || t.is_a?(String) [t, s] elsif %w(range hex).map(&:to_sym).include?(t.first) [t, s] elsif t.first == :"(" e, s = depth {ebnf(s)} debug {"=> ebnf returned #{[e, s].inspect}"} [e, s] else ["", s] end end ## # parse one terminal; return the terminal and the remaining string # # A terminal is represented as a tuple whose 1st item gives the type; # some types have additional info in the tuple. # # @example # >>> terminal("'abc' def") # (("'", 'abc'), ' def') # # >>> terminal("[0-9]") # ((range, '0-9'), '') # >>> terminal("#x00B7") # ((hex, '#x00B7'), '') # >>> terminal ("\[#x0300-#x036F\]") # ((range, '#x0300-#x036F'), '') # >>> terminal("\[^<>'{}|^`\]-\[#x00-#x20\]") # ((range, "^<>'{}|^`"), '-\[#x00-#x20\]') def terminal(s) s = s.strip case m = s[0,1] when '"', "'" l, s = s[1..-1].split(m, 2) [l, s] when '[' l, s = s[1..-1].split(']', 2) [[:range, l], s] when '#' s.match(/(#\w+)(.*)$/) l, s = $1, $2 [[:hex, l], s] when /[[:alpha:]]/ s.match(/(\w+)(.*)$/) l, s = $1, $2 [l.to_sym, s] when '@' s.match(/@(#\w+)(.*)$/) l, s = $1, $2 [[:"@", l], s] when '-' [[:diff], s[1..-1]] when '?' [[:opt], s[1..-1]] when '|' [[:alt], s[1..-1]] when '+' [[:plus], s[1..-1]] when '*' [[:star], s[1..-1]] when /[\(\)]/ [[m.to_sym], s[1..-1]] else raise "unrecognized terminal: #{s.inspect}" end end def depth @depth += 1 ret = yield @depth -= 1 ret end ## # Progress output when debugging # # @overload debug(node, message) # @param [String] node relative location in input # @param [String] message ("") # # @overload debug(message) # @param [String] message ("") # # @yieldreturn [String] added to message def debug(*args) return unless @options[:debug] options = args.last.is_a?(Hash) ? args.pop : {} depth = options[:depth] || @depth message = args.pop message = message.call if message.is_a?(Proc) args << message if message args << yield if block_given? message = "#{args.join(': ')}" str = "[#{@lineno}]#{' ' * depth}#{message}" @options[:debug] << str if @options[:debug].is_a?(Array) $stderr.puts(str) if @options[:debug] == true end end