require 'strscan'

# Extended Backus-Naur Form (EBNF), in the W3C variation, is
# originally defined in the
# [W3C XML 1.0 Spec](http://www.w3.org/TR/REC-xml/#sec-notation).
#
# This version attempts to be less strict than that definition,
# allowing for colloquial variations (such as in the Turtle syntax).
#
# A rule takes the following form:
#     [1]  symbol ::= expression
#
# Comments include the content between '/*' and '*/'
#
# @see http://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py
# @see http://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3
#
# Based on bnf2turtle by Dan Connolly.
#
# Motivation
# ----------
# 
# Many specifications include grammars that look formal but are not
# actually checked, by machine, against test data sets. Debugging the
# grammar in the XML specification has been a long, tedious manual
# process. Only when the loop is closed between a fully formal grammar
# and a large test data set can we be confident that we have an accurate
# specification of a language [#]_.
# 
# 
# The grammar in the `N3 design note`_ has evolved based on the original
# manual transcription into a python recursive-descent parser and
# subsequent development of test cases. Rather than maintain the grammar
# and the parser independently, our goal_ is to formalize the language
# syntax sufficiently to replace the manual implementation with one
# derived mechanically from the specification.
# 
# 
# .. [#] and even then, only the syntax of the language.
# .. _N3 design note: http://www.w3.org/DesignIssues/Notation3
# 
# Related Work
# ------------
# 
# Sean Palmer's `n3p announcement`_ demonstrated the feasibility of the
# approach, though that work did not cover some aspects of N3.
# 
# In development of the `SPARQL specification`_, Eric Prud'hommeaux
# developed Yacker_, which converts EBNF syntax to perl and C and C++
# yacc grammars. It includes an interactive facility for checking
# strings against the resulting grammars.
# Yosi Scharf used it in `cwm Release 1.1.0rc1`_, which includes
# a SPARQL parser that is *almost* completely mechanically generated.
# 
# The N3/turtle output from yacker is lower level than the EBNF notation
# from the XML specification; it has the ?, +, and * operators compiled
# down to pure context-free rules, obscuring the grammar
# structure. Since that transformation is straightforwardly expressed in
# semantic web rules (see bnf-rules.n3_), it seems best to keep the RDF
# expression of the grammar in terms of the higher level EBNF
# constructs.
# 
# .. _goal: http://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs
# .. _n3p announcement: http://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html
# .. _Yacker: http://www.w3.org/1999/02/26-modules/User/Yacker
# .. _SPARQL specification: http://www.w3.org/TR/rdf-sparql-query/
# .. _Cwm Release 1.1.0rc1: http://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html
# .. _bnf-rules.n3: http://www.w3.org/2000/10/swap/grammar/bnf-rules.n3
# 
# Open Issues and Future Work
# ---------------------------
# 
# The yacker output also has the terminals compiled to elaborate regular
# expressions. The best strategy for dealing with lexical tokens is not
# yet clear. Many tokens in SPARQL are case insensitive; this is not yet
# captured formally.
# 
# The schema for the EBNF vocabulary used here (``g:seq``, ``g:alt``, ...)
# is not yet published; it should be aligned with `swap/grammar/bnf`_
# and the bnf2html.n3_ rules (and/or the style of linked XHTML grammar
# in the SPARQL and XML specifications).
# 
# It would be interesting to corroborate the claim in the SPARQL spec
# that the grammar is LL(1) with a mechanical proof based on N3 rules.
# 
# .. _swap/grammar/bnf: http://www.w3.org/2000/10/swap/grammar/bnf
# .. _bnf2html.n3: http://www.w3.org/2000/10/swap/grammar/bnf2html.n3  
# 
# 
# 
# Background
# ----------
# 
# The `N3 Primer`_ by Tim Berners-Lee introduces RDF and the Semantic
# Web using N3, a teaching and scribbling language. Turtle is a subset
# of N3 that maps directly to (and from) the standard XML syntax for
# RDF.
# 
# 
# 
# .. _N3 Primer: http://www.w3.org/2000/10/swap/Primer.html
# 
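# @example Parse a grammar and emit an S-Expression (grammar string is illustrative)
#   ebnf = EBNF.new("[1] ebnf ::= (declaration | rule)*")
#   puts ebnf.to_sxp
#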
# @author Gregg Kellogg
class EBNF
  class Rule
    # @attr [Symbol] sym
    attr_reader :sym
    # @attr [String] id
    attr_reader :id
    # @attr [Symbol] kind one of :rule, :token, or :pass
    attr_accessor :kind
    # @attr [Array] expr
    attr_reader :expr
    # @attr [String] orig
    attr_accessor :orig

    # @param [String] id
    # @param [Symbol] sym
    # @param [Array] expr
    # @param [EBNF] ebnf
    def initialize(id, sym, expr, ebnf)
      @id, @sym, @expr, @ebnf = id, sym, expr, ebnf
    end
    
    def to_sxp
      [id, sym, kind, expr].to_sxp
    end
    
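    ##
    # Serialize this rule as Turtle.
    #
    # As a sketch, a hypothetical rule `[1] ebnf ::= (declaration | rule)*`
    # would serialize roughly as:
    #
    #     :1 rdfs:label "1"; rdf:value "ebnf";
    #       rdfs:comment "[1] ebnf ::= (declaration | rule)*";
    #       g:star
    #         [ g:alt (
    #           :declaration
    #           :rule
    #          ) ] .
    #
    # @return [String]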
    def to_ttl
      @ebnf.debug("to_ttl") {inspect}
      comment = orig.strip.
        gsub(/"""/, '\"\"\"').
        gsub("\\", "\\\\").
        sub(/^\"/, '\"').
        sub(/\"$/m, '\"')
      statements = [
        %{:#{id} rdfs:label "#{id}"; rdf:value "#{sym}";},
        %{  rdfs:comment #{comment.inspect};},
      ]
      
      statements += ttl_expr(expr, kind == :token ? "re" : "g", 1, false)
      "\n" + statements.join("\n")
    end

    def inspect
      {:sym => sym, :id => id, :kind => kind, :expr => expr}.inspect
    end
    
    private
    def ttl_expr(expr, pfx, depth, is_obj = true)
      indent = '  ' * depth
      @ebnf.debug("ttl_expr", :depth => depth) {expr.inspect}
      op = expr.shift if expr.is_a?(Array)
      statements = []
      
      if is_obj
        bra, ket = "[ ", " ]"
      else
        bra = ket = ''
      end

      case op
      when :seq, :alt, :diff
        statements << %{#{indent}#{bra}#{pfx}:#{op} (}
        expr.each {|a| statements += ttl_expr(a, pfx, depth + 1)}
        statements << %{#{indent} )#{ket}}
      when :opt, :plus, :star
        statements << %{#{indent}#{bra}#{pfx}:#{op} }
        statements += ttl_expr(expr.first, pfx, depth + 1)
        statements << %{#{indent} #{ket}} unless ket.empty?
      when :"'"
        statements << %{#{indent}"#{esc(expr)}"}
      when :range
        statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
      when :hex
        raise "didn't expect \" in expr" if expr.include?(:'"')
        statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
      else
        if is_obj
          statements << %{#{indent}#{expr.inspect}}
        else
          statements << %{#{indent}g:seq ( #{expr.inspect} )}
        end
      end
      
      statements.last << " ." unless is_obj
      @ebnf.debug("statements", :depth => depth) {statements.join("\n")}
      statements
    end
    
    ##
    # turn an XML BNF character class into an N3 literal for that
    # character class (less the outer quote marks)
    #
    #   >>> cclass("^<>'{}|^`")
    #   "[^<>'{}|^`]"
    #   >>> cclass("#x0300-#x036F")
    #   "[\\u0300-\\u036F]"
    #   >>> cclass("#xC0-#xD6")
    #   "[\\u00C0-\\u00D6]"
    #   >>> cclass("#x370-#x37D")
    #   "[\\u0370-\\u037D]"
    # 
    #   as in: ECHAR ::= '\' [tbnrf\"']
    #   >>> cclass("tbnrf\\\"'")
    #   'tbnrf\\\\\\"\''
    # 
    #   >>> cclass("^#x22#x5C#x0A#x0D")
    #   '^\\u0022\\\\\\u005C\\u000A\\u000D'
    def cclass(txt)
      '[' +
      txt.gsub(/\#x[0-9a-fA-F]+/) do |hx|
        hx = hx[2..-1]
        if hx.length <= 4
          "\\u#{'0' * (4 - hx.length)}#{hx}" 
        elsif hx.length <= 8
          "\\U#{'0' * (8 - hx.length)}#{hx}" 
        end
      end +
      ']'
    end
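
    ##
    # Escape a value for embedding in a double-quoted Turtle literal.
    # (A minimal assumed helper: `esc` is referenced in ttl_expr above but
    # not otherwise defined in this file.)
    def esc(value)
      value = value.first if value.is_a?(Array)
      value.to_s.gsub("\\") {"\\\\"}.gsub('"') {"\\\""}
    end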
  end

  # Abstract syntax tree from parse
  attr_reader :ast

  # Parse the string or file input generating an abstract syntax tree
  # in S-Expressions (similar to SPARQL SSE)
  #
  # @param [#read, #to_s] input
  # @param [Hash{Symbol => Object}] options
  # @option options [Boolean, Array] :debug
  #   Debug output; an Array collects trace lines, +true+ writes them to +$stderr+
  def initialize(input, options = {})
    @options = options
    @lineno, @depth = 1, 0
    token = false
    @ast = []

    input = input.respond_to?(:read) ? input.read : input.to_s
    scanner = StringScanner.new(input)

    eachRule(scanner) do |r|
      debug("rule string") {r.inspect}
      case r
      when /^@terminals/
        # Switch mode to parsing tokens
        token = true
      when /^@pass\s*(.*)$/m
        rule = depth {ruleParts("[0] " + r)}
        rule.kind = :pass
        rule.orig = r
        @ast << rule
      else
        rule = depth {ruleParts(r)}
        
        # All-caps symbols are tokens. Once a token has been seen,
        # we don't switch back to parsing rules.
        token ||= !!(rule.sym.to_s =~ /^[A-Z_]+$/)
        rule.kind = token ? :token : :rule
        rule.orig = r
        @ast << rule
      end
    end
  end
  
  ##
  # Write out parsed syntax string as an S-Expression
  def to_sxp
    begin
      require 'sxp'
      SXP::Generator.string(ast)
    rescue LoadError
      ast.to_sxp
    end
  end

  ##
  # Write out the syntax tree as Turtle
  # @param [String] prefix prefix to associate with the grammar namespace
  # @param [String] ns namespace URI for the grammar
  # @return [String]
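  # @example (prefix and namespace values are hypothetical)
  #   ebnf.to_ttl("ttl", "http://example.org/grammar#")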
  def to_ttl(prefix, ns)
    (ast.empty? ? [] : [
      "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.",
      "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.",
      "@prefix #{prefix}: <#{ns}>.",
      "@prefix : <#{ns}>.",
      "@prefix re: <http://www.w3.org/2000/10/swap/grammar/regex#>.",
      "@prefix g: <http://www.w3.org/2000/10/swap/grammar/ebnf#>.",
      "",
      ":language rdfs:isDefinedBy <>; g:start :#{ast.first.id}.",
      "",
    ]).join("\n") +

    ast.
      select {|a| [:rule, :token].include?(a.kind)}.
      map(&:to_ttl).
      join("\n")
  end

  ##
  # Iterate over rule strings.
  # A line that starts with '[' or '@' starts a new rule.
  #
  # @param [StringScanner] scanner
  # @yield rule_string
  # @yieldparam [String] rule_string
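  # @example (grammar fragment is illustrative)
  #   eachRule(StringScanner.new("[1] a ::= 'x'\n[2] b ::= a+")) {|r| puts r}
  #   # yields "[1] a ::= 'x'", then "[2] b ::= a+"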
  def eachRule(scanner)
    cur_lineno = 1
    r = ''
    until scanner.eos?
      case
      when s = scanner.scan(%r(\s+)m)
        # Eat whitespace
        cur_lineno += s.count("\n")
        #debug("eachRule(ws)") { "[#{cur_lineno}] #{s.inspect}" }
      when s = scanner.scan(%r(/\*([^\*]|\*[^\/])*\*/)m)
        # Eat comments
        cur_lineno += s.count("\n")
        debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" }
      when s = scanner.scan(%r(^@terminals))
        #debug("eachRule(@terminals)") { "[#{cur_lineno}] #{s.inspect}" }
        yield(r) unless r.empty?
        @lineno = cur_lineno
        yield(s)
        r = ''
      when s = scanner.scan(/@pass/)
        # Found rule start, if we've already collected a rule, yield it
        #debug("eachRule(@pass)") { "[#{cur_lineno}] #{s.inspect}" }
        yield r unless r.empty?
        @lineno = cur_lineno
        r = s
      when s = scanner.scan(/\[(?=\w+\])/)
        # Found rule start, if we've already collected a rule, yield it
        yield r unless r.empty?
        #debug("eachRule(rule)") { "[#{cur_lineno}] #{s.inspect}" }
        @lineno = cur_lineno
        r = s
      else
        # Collect until end of line, or start of comment
        s = scanner.scan_until(%r((?:/\*)|$)m)
        cur_lineno += s.count("\n")
        #debug("eachRule(rest)") { "[#{cur_lineno}] #{s.inspect}" }
        r += s
      end
    end
    yield r unless r.empty?
  end
  
  ##
  # Parse a rule into a rule number, a symbol and an expression
  #
  # @param [String] rule
  # @return [Rule]
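  # @example (rule string is illustrative)
  #   ruleParts("[1] ebnf ::= (declaration | rule)*")
  #   # => Rule with id "1", sym :ebnf, and
  #   #    expr [:star, [:alt, :declaration, :rule]]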
  def ruleParts(rule)
    num_sym, expr = rule.split('::=', 2).map(&:strip)
    num, sym = num_sym.split(']', 2).map(&:strip)
    num = num[1..-1]
    # Rule.new takes (id, sym, expr, ebnf)
    r = Rule.new(num, sym && sym.to_sym, ebnf(expr).first, self)
    debug("ruleParts") { r.inspect }
    r
  end
  
  ##
  # Parse a string into an expression tree and a remaining string
  #
  # @example
  #     >>> ebnf("a b c")
  #     ((seq, [('id', 'a'), ('id', 'b'), ('id', 'c')]), '')
  #     
  #     >>> ebnf("a? b+ c*")
  #     ((seq, [(opt, ('id', 'a')), (plus, ('id', 'b')), (star, ('id', 'c'))]), '')
  #     
  #     >>> ebnf(" | x xlist")
  #     ((alt, [(seq, []), (seq, [('id', 'x'), ('id', 'xlist')])]), '')
  #     
  #     >>> ebnf("a | (b - c)")
  #     ((alt, [('id', 'a'), (diff, [('id', 'b'), ('id', 'c')])]), '')
  #     
  #     >>> ebnf("a b | c d")
  #     ((alt, [(seq, [('id', 'a'), ('id', 'b')]), (seq, [('id', 'c'), ('id', 'd')])]), '')
  #     
  #     >>> ebnf("a | b | c")
  #     ((alt, [('id', 'a'), ('id', 'b'), ('id', 'c')]), '')
  #     
  #     >>> ebnf("a) b c")
  #     (('id', 'a'), ' b c')
  #     
  #     >>> ebnf("BaseDecl? PrefixDecl*")
  #     ((seq, [(opt, ('id', 'BaseDecl')), (star, ('id', 'PrefixDecl'))]), '')
  #     
  #     >>> ebnf("NCCHAR1 | diff | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]")
  #     ((alt, [('id', 'NCCHAR1'), ("'", diff), (range, '0-9'), (hex, '#x00B7'), (range, '#x0300-#x036F'), (range, '#x203F-#x2040')]), '')
  #     
  # @param [String] s
  # @return [Array]
  def ebnf(s)
    debug("ebnf") {"(#{s.inspect})"}
    e, s = depth {alt(s)}
    debug {"=> alt returned #{[e, s].inspect}"}
    unless s.empty?
      t, ss = depth {token(s)}
      debug {"=> token returned #{[t, ss].inspect}"}
      return [e, ss] if t.is_a?(Array) && t.first == :")"
    end
    [e, s]
  end
  
  ##
  # Parse alt
  #     >>> alt("a | b | c")
  #     ((alt, [('id', 'a'), ('id', 'b'), ('id', 'c')]), '')
  # @param [String] s
  # @return [Array]
  def alt(s)
    debug("alt") {"(#{s.inspect})"}
    args = []
    while !s.empty?
      e, s = depth {seq(s)}
      debug {"=> seq returned #{[e, s].inspect}"}
      if e.to_s.empty?
        break unless args.empty?
        e = [:seq, []] # empty sequence
      end
      args << e
      unless s.empty?
        t, ss = depth {token(s)}
        break unless t[0] == :alt
        s = ss
      end
    end
    args.length > 1 ? [args.unshift(:alt), s] : [e, s]
  end
  
  ##
  # parse seq
  #
  #     >>> seq("a b c")
  #     ((seq, [('id', 'a'), ('id', 'b'), ('id', 'c')]), '')
  #     
  #     >>> seq("a b? c")
  #     ((seq, [('id', 'a'), (opt, ('id', 'b')), ('id', 'c')]), '')
  def seq(s)
    debug("seq") {"(#{s.inspect})"}
    args = []
    while !s.empty?
      e, ss = depth {diff(s)}
      debug {"=> diff returned #{[e, ss].inspect}"}
      unless e.to_s.empty?
        args << e
        s = ss
      else
        break
      end
    end
    if args.length > 1
      [args.unshift(:seq), s]
    elsif args.length == 1
      args + [s]
    else
      ["", s]
    end
  end
  
  ##
  # parse diff
  # 
  #     >>> diff("a - b")
  #     ((diff, [('id', 'a'), ('id', 'b')]), '')
  def diff(s)
    debug("diff") {"(#{s.inspect})"}
    e1, s = depth {postfix(s)}
    debug {"=> postfix returned #{[e1, s].inspect}"}
    unless e1.to_s.empty?
      unless s.empty?
        t, ss = depth {token(s)}
        debug {"diff #{[t, ss].inspect}"}
        if t.is_a?(Array) && t.first == :diff
          s = ss
          e2, s = primary(s)
          unless e2.to_s.empty?
            return [[:diff, e1, e2], s]
          else
            raise "Syntax Error"
          end
        end
      end
    end
    [e1, s]
  end
  
  ##
  # parse postfix
  # 
  #     >>> postfix("a b c")
  #     (('id', 'a'), ' b c')
  #     
  #     >>> postfix("a? b c")
  #     ((opt, ('id', 'a')), ' b c')
  def postfix(s)
    debug("postfix") {"(#{s.inspect})"}
    e, s = depth {primary(s)}
    debug {"=> primary returned #{[e, s].inspect}"}
    return ["", s] if e.to_s.empty?
    if !s.empty?
      t, ss = depth {token(s)}
      debug {"=> #{[t, ss].inspect}"}
      if t.is_a?(Array) && [:opt, :star, :plus].include?(t.first)
        return [[t.first, e], ss]
      end
    end
    [e, s]
  end
  
  ##
  # parse primary
  # 
  #     >>> primary("a b c")
  #     (('id', 'a'), ' b c')
  def primary(s)
    debug("primary") {"(#{s.inspect})"}
    t, s = depth {token(s)}
    debug {"=> token returned #{[t, s].inspect}"}
    if t.is_a?(Symbol) || t.is_a?(String)
      [t, s]
    elsif [:range, :hex].include?(t.first)
      [t, s]
    elsif t.first == :"("
      e, s = depth {ebnf(s)}
      debug {"=> ebnf returned #{[e, s].inspect}"}
      [e, s]
    else
      ["", s]
    end
  end
  
  ##
  # parse one token; return the token and the remaining string
  # 
  # A token is represented as a tuple whose 1st item gives the type;
  # some types have additional info in the tuple.
  # 
  # @example
  #     >>> token("'abc' def")
  #     (("'", 'abc'), ' def')
  #     
  #     >>> token("[0-9]")
  #     ((range, '0-9'), '')
  #     >>> token("#x00B7")
  #     ((hex, '#x00B7'), '')
  #     >>> token ("[#x0300-#x036F]")
  #     ((range, '#x0300-#x036F'), '')
  #     >>> token("[^<>'{}|^`]-[#x00-#x20]")
  #     ((range, "^<>'{}|^`"), '-[#x00-#x20]')
  def token(s)
    s = s.strip
    case m = s[0,1]
    when '"', "'"
      l, s = s[1..-1].split(m, 2)
      [l, s]
    when '['
      l, s = s[1..-1].split(']', 2)
      [[:range, l], s]
    when '#'
      # Hex character reference such as #x00B7; /m keeps any following lines
      s.match(/(#\w+)(.*)$/m)
      l, s = $1, $2
      [[:hex, l], s]
    when /[[:alpha:]]/
      # Symbol (rule or terminal name)
      s.match(/(\w+)(.*)$/m)
      l, s = $1, $2
      [l.to_sym, s]
    when '@'
      # Directive such as @pass or @terminals
      s.match(/@(\w+)(.*)$/m)
      l, s = $1, $2
      [[:"@", l], s]
    when '-'
      [[:diff], s[1..-1]]
    when '?'
      [[:opt], s[1..-1]]
    when '|'
      [[:alt], s[1..-1]]
    when '+'
      [[:plus], s[1..-1]]
    when '*'
      [[:star], s[1..-1]]
    when /[\(\)]/
      [[m.to_sym], s[1..-1]]
    else
      raise "unrecognized token: #{s.inspect}"
    end
  end

  ##
  # Increase the debug depth around the block, restoring it even if the block raises
  def depth
    @depth += 1
    yield
  ensure
    @depth -= 1
  end

  ##
  # Progress output when debugging
  # @param [String] node relative location in input
  # @param [String] message ("")
  # @yieldreturn [String] added to message
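  #
  # @example Collecting trace output (`input` is hypothetical)
  #   EBNF.new(input, :debug => [])    # trace lines accumulate in the Array
  #   EBNF.new(input, :debug => true)  # trace lines are written to $stderr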
  def debug(*args)
    return unless @options[:debug]
    options = args.last.is_a?(Hash) ? args.pop : {}
    depth = options[:depth] || @depth
    message = args.pop
    message = message.call if message.is_a?(Proc)
    args << message if message
    args << yield if block_given?
    message = "#{args.join(': ')}"
    str = "[#{@lineno}]#{' ' * depth}#{message}"
    @options[:debug] << str if @options[:debug].is_a?(Array)
    $stderr.puts(str) if @options[:debug] == true
  end
end