#! /your/favourite/path/to/racc
# -*- coding: utf-8 -*-

# Copyright (c) 2014 Urabe, Shyouhei.  All rights reserved.
#
# Redistribution  and  use  in  source   and  binary  forms,  with  or  without
# modification, are  permitted provided that the following  conditions are met:
#
#     - Redistributions  of source  code must  retain the  above copyright
#       notice, this list of conditions and the following disclaimer.
#
#     - Redistributions in binary form  must reproduce the above copyright
#       notice, this  list of conditions  and the following  disclaimer in
#       the  documentation  and/or   other  materials  provided  with  the
#       distribution.
#
#     - Neither the name of Internet  Society, IETF or IETF Trust, nor the
#       names of specific contributors, may  be used to endorse or promote
#       products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
# AND ANY  EXPRESS OR  IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED  TO, THE
# IMPLIED WARRANTIES  OF MERCHANTABILITY AND  FITNESS FOR A  PARTICULAR PURPOSE
# ARE  DISCLAIMED. IN NO  EVENT SHALL  THE COPYRIGHT  OWNER OR  CONTRIBUTORS BE
# LIABLE  FOR   ANY  DIRECT,  INDIRECT,  INCIDENTAL,   SPECIAL,  EXEMPLARY,  OR
# CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT   NOT  LIMITED  TO,  PROCUREMENT  OF
# SUBSTITUTE  GOODS OR SERVICES;  LOSS OF  USE, DATA,  OR PROFITS;  OR BUSINESS
# INTERRUPTION)  HOWEVER CAUSED  AND ON  ANY  THEORY OF  LIABILITY, WHETHER  IN
# CONTRACT,  STRICT  LIABILITY, OR  TORT  (INCLUDING  NEGLIGENCE OR  OTHERWISE)
# ARISING IN ANY  WAY OUT OF THE USE  OF THIS SOFTWARE, EVEN IF  ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

# This is an almost one-to-one translation of RFC7159 sections 2 through 7, from
# Augmented BNF to Racc BNF.  It should be the easiest implementation to verify
# against the spec.
#
# @note This parser has several shift/reduce conflicts.  They are all around
#   the handling of white spaces (called "ws"), so they can silently be
#   ignored.  I also checked the parser internals and made sure they are OK.
class RFC7159::Parser

	options no_result_var
	expect 28
	rule

	# Notes  about nonterminal's  names: in  order to  make manual  verification
	# easy, all the nonterminals that appear in the RFC are named as such.  ABNF
	# is much more concise than plain  BNF, so here  we added  several  helper
	# nonterminals; they  are prefixed  with "__"  so you  can distinguish  if a
	# nonterminal is RFC-origin or not.
	#
	# Notes about semantic values: the quoted terminals below are the
	# single-character tokens handed over by next_token (see the inner section);
	# the braces hold the action that builds the AST node for a rule.

	# RFC7159 section 2
	#
	# JSON_text keeps only the AST of the enclosed value (val[1]); the white
	# spaces on both sides are dropped.

	JSON_text       : ws value  ws                              { val[1] }
	begin_array     : ws "\x5B" ws # [ left square bracket
	begin_object    : ws "\x7B" ws # { left curly bracket
	end_array       : ws "\x5D" ws # ] right square bracket
	end_object      : ws "\x7D" ws # } right curly bracket
	name_separator  : ws "\x3A" ws # : colon
	value_separator : ws "\x2C" ws # , comma
	ws              :              # <- this is the '*' in the ABNF
	                | ws "\x20"    # Space
	                | ws "\x09"    # Horizontal tab
	                | ws "\x0A"    # Line feed or New line
	                | ws "\x0D"    # Carriage return

	# RFC7159 section 3
	#
	# The three literals are spelled out character by character; each one
	# yields a one-element array holding its tag.

	value           : false | null | true | object | array | number | string
	false           : "\x66" "\x61" "\x6c" "\x73" "\x65"        { [ :false ] } # false
	null            : "\x6e" "\x75" "\x6c" "\x6c"               { [ :null  ] } # null
	true            : "\x74" "\x72" "\x75" "\x65"               { [ :true  ] } # true

	# RFC7159 section 4
	#
	# An object yields [ :object, member, ... ], where every member is a
	# two-element [ name, value ] pair; the separators are dropped.

	object          : begin_object             end_object       { [ :object ] }
	                | begin_object __members__ end_object       { [ :object, *val[1] ] }
	__members__     : member                                    { val }
	                | __members__ value_separator member        { [ *val[0],  val[2] ] }
	member          : string name_separator value               { [  val[0],  val[2] ] }

	# RFC7159 section 5
	#
	# An array yields [ :array, value, ... ]; the separators are dropped.

	array           : begin_array          end_array            { [ :array ] }
	                | begin_array __list__ end_array            { [ :array, *val[1] ] }
	__list__        : value                                     { val }
	                | __list__ value_separator value            { [ *val[0], val[2] ] }

	# RFC7159 section 6
	#
	# A number yields [ :number, minus, int, frac, exp ], each slot holding
	# whatever the corresponding (possibly empty) rule produced.  The digits
	# are kept as lists of characters; no numeric conversion happens here.

	number          : __minus_p__ int __frac_p__ __exp_p__      { [ :number, *val ] }
	__minus_p__     : | minus
	__frac_p__      : | frac
	__exp_p__       : | exp
	decimal_point   : "\x2E" # .
	digit1_9        : "\x31" | "\x32" | "\x33" | "\x34" | "\x35"
	                | "\x36" | "\x37" | "\x38" | "\x39"
	e               : "\x65" | "\x45"    # e E
	exp             : e __sign__ __digit_plus__                 { val }
	frac            : decimal_point __digit_plus__              { val }
	int             : zero                                      { val }
	                | digit1_9                                  { val }
	                | digit1_9 __digit_plus__                   { [ val[0], *val[1] ] }
	minus           : "\x2D" # -
	plus            : "\x2B" # +
	zero            : "\x30" # 0
	DIGIT           : zero  | digit1_9
	__sign__        : | plus  | minus
	__digit_plus__  : DIGIT                                     { val }
	                | __digit_plus__ DIGIT                      { [ *val[0], val[1] ] }

	# RFC7159 section 7
	#
	# A string yields [ :string, char, ... ].  The action on "char" belongs to
	# its second alternative only: an escape sequence becomes the flattened
	# list of the reverse solidus and what follows it (for \uXXXX that is the
	# "u" plus the four hex digits).  Nothing is unescaped at this stage.

	string          : quotation_mark           quotation_mark   { [ :string ] }
	                | quotation_mark __chars__ quotation_mark   { [ :string, *val[1] ] }
	__chars__       : char                                      { val }
	                | __chars__ char                            { [ *val[0],  val[1] ] }
	char            : unescaped | escape __ctrl__               { val.flatten }
	__ctrl__        : "\x22" # "    quotation mark  U+0022
	                | "\x5C" # \    reverse solidus U+005C
	                | "\x2F" # /    solidus         U+002F
	                | "\x62" # b    backspace       U+0008
	                | "\x66" # f    form feed       U+000C
	                | "\x6E" # n    line feed       U+000A
	                | "\x72" # r    carriage return U+000D
	                | "\x74" # t    tab             U+0009
	                | "\x75" # uXXXX                U+XXXX
	                         HEXDIG HEXDIG HEXDIG HEXDIG        { val }
	escape          : "\x5C" # \
	quotation_mark  : "\x22" # "
	HEXDIG          : DIGIT
	                | "\x61" | "\x62" | "\x63" | "\x64" | "\x65" | "\x66"
	                | "\x41" | "\x42" | "\x43" | "\x44" | "\x45" | "\x46"

	# "unescaped" is too much to list up here; use lexer instead.
	# unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
end

---- inner

	# Creates a parser.  Both settings are merely stored here; accept_bom is
	# consulted by #parse, and yydebug is kept in @yydebug (presumably read by
	# the Racc runtime -- verify against Racc::Parser).
	#
	# @param accept_bom [true, false]  Whether a leading BOM is tolerated
	# @param yydebug    [true, false]  Whether to enable debug mode
	def initialize accept_bom: false, yydebug: false
		@accept_bom, @yydebug = accept_bom, yydebug
	end

	# Parses str and generates AST.  The str must consist of _a_ valid JSON
	# text, otherwise an exception is raised.  All per-run lexer state (state
	# machine, position counters, line-break memory) is reset here, so the
	# counters of an earlier run do not leak into this one.
	#
	# @param  [#each_char] str    IO or String or something to parse
	# @return [::Array]           Parsed AST
	# @raise  [Racc::ParseError]  The input is invalid (this includes an
	#                             empty input)
	# @raise  [Encoding::CompatibilityError] The input is not in any of the
	#                             encodings RFC7159 section 8.1 allows
	def parse str
		@state      = :init
		@enum       = str.enum_for(:each_char)
		@lineno     = 1
		@column     = 0   # next_token counts a character _before_ reporting it,
		                  # so the first character ends up at column 1
		@newline    = nil # forget the line break that may have ended a prior run

		begin
			firstchar = @enum.peek
		rescue StopIteration
			# Nothing to read at all.  An empty text is not JSON; say so with
			# the documented exception instead of leaking StopIteration.
			raise Racc::ParseError,
				'Syntax error near line 1, char 0: the input is empty.'
		end

		case @enc = firstchar.encoding
		when Encoding::UTF_8,
		     Encoding::US_ASCII, # true subset of UTF-8
		     Encoding::UTF8_MAC, # true subset of UTF-8
		     Encoding::UTF_16LE,
		     Encoding::UTF_16BE,
		     Encoding::UTF_32LE,
		     Encoding::UTF_32BE
			# RFC7159 section 8.1 explicitly states  that the input string must be
			# either UTF  8, 16, or  32 -encoded.  That point  is as clear  as the
			# sky.  All other  encodings are NG.  However, what we  call the ASCII
			# encoding is the  true subset of UTF-8.  A string  of ASCII must also
			# be valid as UTF-8.  So we allow this.
			#
			# There are discussions about parsing BOMs.  The original RFC4627 said
			# nothing about  BOMs, however  its section  3 ("Encoding")  cannot be
			# read  as  if  it  expected BOMs.   Current  RFC7159  _prohibits_  to
			# generate JSON texts with BOMs but _allows_ to accept.
			#
			# This parser can control whether to accept BOMs.
			#
			# The BOM is detected by  its code point rather than  by transcoding a
			# literal into  @enc: U+FEFF has no  US-ASCII representation, so  such
			# transcoding would raise for ASCII inputs.
			if @accept_bom and firstchar.valid_encoding? and firstchar.ord == 0xFEFF
				@enum.next # consume
			end
			return do_parse
		else
			raise Encoding::CompatibilityError, <<-"end".gsub(/[\n\s]+/, ' ')
				``JSON text  SHALL be encoded  in UTF-8, UTF-16, or  UTF-32'', said
				RFC7159  section 8.1.   The given  string is  NOT in  any of  those
				encodings (but #{@enc.inspect}).
			end
		end
	end

	private
	# Regexp matching a line break character (CR or LF), built in the encoding
	# of the text being parsed.  Memoized per encoding, so a parser that is
	# reused for a text in another encoding does not get a stale pattern.
	#
	# @return [Regexp]
	def nl
		(@nl_by_enc ||= {})[@enc] ||= Regexp.new('[\r\n]'.encode(@enc))
	end

	# Regexp matching a white space character, built in the encoding of the
	# text being parsed.  It has a cache of its own (memoized per encoding);
	# sharing @nl with the line break pattern made this return that pattern.
	#
	# @return [Regexp]
	def sp
		(@sp_by_enc ||= {})[@enc] ||= Regexp.new('\s'.encode(@enc))
	end

	# Regexp matching a decimal digit, built in the encoding of the text being
	# parsed.  It has a cache of its own (memoized per encoding); sharing @nl
	# with the line break pattern made this never match a digit.
	#
	# @return [Regexp]
	def nm
		(@nm_by_enc ||= {})[@enc] ||= Regexp.new('\d'.encode(@enc))
	end

	# The lexer: reads one character and turns it into a token.
	#
	# The token is the character itself (transcoded to UTF-8 so that it can
	# match the literals in the grammar), except for a character that may
	# appear unescaped inside a string, which becomes :unescaped.  The value is
	# always the character as read.  A small state machine (@state) tracks
	# whether we are outside a string, inside one, right after a reverse
	# solidus, or inside the four hex digits of a \uXXXX.
	#
	# Also maintains @lineno / @column for error messages.  A position is
	# advanced to the next line when the _previous_ character was a line
	# break; CR immediately followed by LF counts as a single line break.
	#
	# @return [Array] token and its value, or false and @enum at end of input
	def next_token
		chr = @enum.next
		tok = chr.encode(Encoding::UTF_8) # default
		after_newline, after_cr = @newline, @after_cr
		@newline  = nl.match(chr)
		@after_cr = (tok == "\r")
		# The LF of a CRLF pair still belongs to the line that the CR ended.
		if after_newline && !(after_cr && tok == "\n")
			@lineno += 1
			@column = 1
		else
			@column += 1
		end
		case @state
		when :string  then # recap: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
			case chr.ord
			when 0x20..0x21     then    tok = :unescaped
			when 0x22           then @state = :init      # "
			when 0x23..0x5B     then    tok = :unescaped
			when 0x5C           then @state = :escaped   # \
			when 0x5D..0x10FFFF then    tok = :unescaped
			else                     @state = :string    # NG unicode
			end
		when :init    then @state = (chr.ord == '"'.ord) ? :string : :init
		when :escaped then @state = (chr.ord == 'u'.ord) ? :u1     : :string
		when :u1      then @state = :u2
		when :u2      then @state = :u3
		when :u3      then @state = :u4
		when :u4      then @state = :string
		end
		return tok, chr
	rescue StopIteration
		# End of input.
		return false, @enum
	end

	# Error hook: composes a diagnostic with the current position and a guess
	# at what went wrong, then raises.
	#
	# The guess is based on the lexer state (inside a string / inside \uXXXX)
	# or, outside of those, on the offending value.  The value arrives in the
	# encoding of the input, so it is transcoded to UTF-8 before being compared
	# with the literals below; otherwise none of them could match a UTF-16 or
	# UTF-32 input.  The sp / nm patterns are in the input's encoding and are
	# therefore matched against the value as is.
	#
	# @param id    [Object]  not used here
	# @param val   [Object]  value of the offending token
	# @param stack [Object]  not used here
	# @raise [Racc::ParseError] always
	def on_error id, val, stack
		reason = case @state
		when :string
			'this character is not allowed in a string; escape it.'
		when :u1, :u2, :u3, :u4
			'\uXXXX must exactly be a four-letter hexadecimal sequence.'
		else
			# At end of input the value is not a string; leave it alone.
			glyph = val.respond_to?(:encode) ? val.encode(Encoding::UTF_8) : val
			case glyph
			when "'"
				'you must use " to quote strings'
			when '}', ']', ','
				'possible extra (dangling) comma?'
			when ':'
				'possible confusion of {} vs []?'
			else
				case val
				when sp
					'possible space inside of a number?'
				when nm
					'possible lack of +/- in exponent?'
				else
					'unexpected character'
				end
			end
		end
		msg = sprintf 'Syntax error near line %d, char %d (%p) @ %p: %s',
			@lineno, @column, val, @enum, reason
		raise Racc::ParseError, msg
	end

---- footer

# 
# Local Variables:
# mode: ruby
# coding: utf-8-unix
# indent-tabs-mode: t
# tab-width: 3
# ruby-indent-level: 3
# fill-column: 79
# default-justification: full
# End: