# Simple parser to go through productions without attempting evaluation.
require 'rdf'
require 'stringio'

module RDF::N3
  module Parser
    START = 'http://www.w3.org/2000/10/swap/grammar/n3#document'
    R_WHITESPACE = Regexp.compile('\A\s*(?:#.*$)?')
    R_MLSTRING = Regexp.compile("^.*([^\"\\\\]*)\"\"\"")
    SINGLE_CHARACTER_SELECTORS = %{\t\r\n !\"#$\%&'()*.,+/;<=>?[\\]^`{|}~}
    NOT_QNAME_CHARS = SINGLE_CHARACTER_SELECTORS + "@"
    NOT_NAME_CHARS = NOT_QNAME_CHARS + ":"

    def error(str)
      raise RDF::ReaderError, "\n#{@line}\n#{'-' * @pos}^\nError on line #{@lineno} at offset #{@pos}: #{str}"
    end

    # Walk the grammar from production +prod+, using a stack of pending
    # productions and their unfinished term sequences.
    def parse(prod)
      todo_stack = [{:prod => prod, :terms => nil}]
      while !todo_stack.empty?
        pushed = false
        if todo_stack.last[:terms].nil?
          todo_stack.last[:terms] = []
          tok = self.token
          #puts "parse tok: '#{tok}', prod #{todo_stack.last[:prod]}"

          # Got an opened production
          onStart(abbr(todo_stack.last[:prod]))
          break if tok.nil?

          cur_prod = todo_stack.last[:prod]
          prod_branch = @branches[cur_prod]
          error("No branches found for '#{abbr(cur_prod)}'") if prod_branch.nil?
          sequence = prod_branch[tok]
          if sequence.nil?
            dump_stack(todo_stack) if $verbose
            expected = prod_branch.values.uniq.map {|u| u.map {|v| abbr(v).inspect}.join(",")}
            error("Found '#{tok}' when parsing a #{abbr(cur_prod)}. expected #{expected.join(' | ')}")
          end
          #puts "sequence: #{sequence.inspect}"
          todo_stack.last[:terms] += sequence
        end

        #puts "parse: #{todo_stack.last.inspect}"
        while !todo_stack.last[:terms].to_a.empty?
          term = todo_stack.last[:terms].shift
          if term.is_a?(String)
            # Literal token, or the bare form of a declared keyword
            puts "parse term(string): #{term}" if $verbose
            word = buffer[0, term.length]
            if word == term
              onToken(term, word)
              consume(term.length)
            elsif '@' + word.chop == term && @keywords.include?(word.chop)
              onToken(term, word.chop)
              consume(term.length - 1)
            else
              error("Found '#{buffer[0, 10]}...'; #{term} expected")
            end
          elsif (regexp = @regexps[term])
            if abbr(term) == 'string' && buffer[0, 3] == '"""'
              # Read until the end of the multi-line string, if this is the start of one
              string = '"""'
              consume(3)
              next_line = buffer
              #puts "ml-str(start): #{next_line.dump}" if $verbose
              until (md = R_MLSTRING.match(next_line))
                begin
                  string += next_line
                  next_line = readline
                rescue EOFError
                  error("EOF reached searching for end of multi-line string")
                end
              end
              string += md[0].to_s
              consume(md[0].to_s.length)
              onToken('string', string)
              #puts "ml-str now #{buffer.dump}"
            else
              md = regexp.match(buffer)
              error("Token(#{abbr(term)}) '#{buffer[0, 10]}...' should match #{regexp}") unless md
              puts "parse term(#{abbr(term)}:regexp): #{term}, #{regexp}.match('#{buffer[0, 10]}...') => '#{md.inspect}'" if $verbose
              onToken(abbr(term), md.to_s)
              consume(md[0].length)
            end
          else
            # Non-terminal: push the production and start over with a fresh term list
            puts "parse term(push): #{term}" if $verbose
            todo_stack << {:prod => term, :terms => nil}
            pushed = true
            break
          end
          self.token
        end

        # Pop completed productions
        while !pushed && !todo_stack.empty? && todo_stack.last[:terms].to_a.empty?
          todo_stack.pop
          self.onFinish
        end
      end

      while !todo_stack.empty?
        todo_stack.pop
        self.onFinish
      end
    end

    # Memoizer for get_token
    def token
      unless @memo.has_key?(@pos)
        tok = self.get_token
        @memo[@pos] = tok
        puts "token: '#{tok}'('#{buffer[0, 10]}...')" if buffer && $verbose
      end
      @memo[@pos]
    end

    # Return a selector for the next token (used to choose a branch), without
    # consuming any input.
    def get_token
      whitespace
      return nil if buffer.nil?

      ch2 = buffer[0, 2]
      return ch2 if %w(=> <= ^^).include?(ch2)

      ch = buffer[0, 1]
      @keyword_mode = false if ch == '.' && @keyword_mode
      return ch if SINGLE_CHARACTER_SELECTORS.include?(ch)
      return ":" if ch == ":"
      return "0" if "+-0123456789".include?(ch)

      if ch == '@'
        # Either a language tag (directly after a closing quote) or a directive/keyword
        return '@' if @pos > 0 && @line[@pos-1, 1] == '"'
        j = 0
        j += 1 while buffer[j+1, 1] && !NOT_NAME_CHARS.include?(buffer[j+1, 1])
        name = buffer[1, j]
        if name == 'keywords'
          @keywords = []
          @keyword_mode = true
        end
        return '@' + name
      end

      # Otherwise scan a qname or bare word
      j = 0
      j += 1 while buffer[j, 1] && !NOT_QNAME_CHARS.include?(buffer[j, 1])
      word = buffer[0, j]
      error("Tokenizer expected qname, found #{buffer[0, 10]}") unless word
      if @keyword_mode
        @keywords << word
      elsif @keywords.include?(word)
        if word == 'keywords'
          @keywords = []
          @keyword_mode = true
        end
        return '@' + word.to_s # implicit keyword
      end
      'a'
    end

    # Skip over whitespace and comments
    def whitespace
      while buffer && (md = R_WHITESPACE.match(buffer))
        return unless md[0].length > 0
        consume(md[0].length)
        #puts "ws: '#{md[0]}', pos=#{@pos}"
      end
    end

    def readline
      @line = @input.readline
      @lineno += 1
      @line.force_encoding(Encoding::UTF_8) if @line.respond_to?(:force_encoding) # for Ruby 1.9+
      puts "readline[#{@lineno}]: #{@line.dump}" if $verbose
      @pos = 0
      @line
    rescue EOFError
      @line, @pos = nil, 0
    end

    # Return data from the current offset to the end of the line
    def buffer
      @line[@pos, @line.length - @pos] unless @line.nil?
    end

    # Consume n characters of the current line, reading new lines while the
    # line is exhausted, or until EOF
    def consume(n)
      @memo = {}
      @pos += n
      readline while @line && @line.length <= @pos
      #puts "consume[#{n}]: '#{buffer}'" if $verbose
    end

    # Strip the grammar namespace, leaving just the production or terminal name
    def abbr(prodURI)
      prodURI.to_s.split('#').last
    end

    # Trace callbacks: print an indented trace of productions and tokens
    def onStart(prod)
      $stdout.puts ' ' * @productions.length + prod
      @productions << prod
    end

    def onFinish
      prod = @productions.pop
      $stdout.puts ' ' * @productions.length + '/' + prod
    end

    def onToken(prod, tok)
      $stdout.puts ' ' * @productions.length + "#{prod}(#{tok})"
    end

    def dump_stack(stack)
      STDERR.puts "\nstack trace:"
      stack.reverse.each do |se|
        STDERR.puts "#{se[:prod]}"
        STDERR.puts "  " + case se[:terms]
          when nil then "nil"
          when []  then "empty"
          else se[:terms].join(",\n  ")
        end
      end
    end

    def test(input, branches, regexps)
      # FIXME: for now, read in the entire doc; eventually, process as a stream
      @input = input.respond_to?(:read) ? (input.rewind; input) : StringIO.new(input.to_s)
      @lineno = 0
      readline # Prime the pump

      $stdout ||= STDOUT
      @memo = {}
      @keyword_mode = false
      @keywords = %w(a is of this has)
      @productions = []
      @branches = branches
      @regexps = regexps
      parse(START.to_sym)
    end
  end
end
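
# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the gem): drives Parser#test with a
# tiny invented grammar so the trace output can be seen without the real SWAP
# n3 branch/regexp tables. The statement/qname identifiers, the toy input, and
# the ToyN3Dump class are assumptions made only for this illustration.
# ---------------------------------------------------------------------------
if __FILE__ == $0
  class ToyN3Dump
    include RDF::N3::Parser
  end

  # Productions and terminals are identified by URI symbols. The start
  # production must be Parser::START; the other two are made up here.
  doc   = RDF::N3::Parser::START.to_sym
  stmt  = :'http://example.org/toy-grammar#statement'
  qname = :'http://example.org/toy-grammar#qname'

  # branches: production => {lookahead selector => sequence of terms}, where a
  # term is a literal String, a terminal keyed in regexps, or a production.
  branches = {
    doc  => {':' => [stmt, '.', doc]},       # document ::= (statement '.')*, via right recursion
    stmt => {':' => [qname, qname, qname]}   # statement ::= qname qname qname
  }

  # regexps: terminal => anchored Regexp used to scan its text from the buffer.
  regexps = {qname => /^:\w+/}

  # Prints an indented trace such as:
  #   document
  #    statement
  #     qname(:a)
  #     ...
  #   /document
  ToyN3Dump.new.test(':a :b :c .', branches, regexps)
end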