#! /your/favourite/path/to/racc
# -*- coding: utf-8 -*-

# Copyright (c) 2014 Urabe, Shyouhei.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     - Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#     - Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in
#       the documentation and/or other materials provided with the
#       distribution.
#
#     - Neither the name of Internet Society, IETF or IETF Trust, nor the
#       names of specific contributors, may be used to endorse or promote
#       products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

# This is an almost one-to-one translation of RFC7159 sections 2 through 7,
# from Augmented BNF to Racc BNF.  It should be the easiest implementation to
# verify against the spec.
#
# @note This parser has several shift/reduce conflicts.  They are all around
#   the handling of white space (called "ws"), so they can safely be ignored.
#   I also checked the parser internals and made sure they are OK.
# Racc grammar for a JSON text as defined by RFC7159.  Every terminal is a
# single character written as a string literal, except "unescaped", which is a
# symbolic token supplied by the lexer (see next_token in the inner section).
# The semantic actions build a nested Array AST whose first element tags the
# node: [:false], [:null], [:true], [:object, *members], [:array, *values],
# [:number, minus, int, frac, exp] and [:string, *chars].
#
# "options no_result_var" makes racc use the value of an action's last
# expression as the value of the rule; every action below relies on that.
# "expect 28" is the number of shift/reduce conflicts racc is told to expect;
# changing any rule may change that count, so edit the grammar with care.
class RFC7159::Parser
options no_result_var
expect 28
rule
	# Notes about nonterminal's names: in order to make manual verification
	# easy, all the nonterminals that appear in the RFC are named as such.  ABNF
	# is much more concise than plain BNF, so here we added several helper
	# nonterminals; they are prefixed with "__" so you can distinguish if a
	# nonterminal is RFC-origin or not.

	# RFC7159 section 2
	# A JSON text is a single value; surrounding white space is dropped.
	JSON_text       : ws value ws { val[1] }
	begin_array     : ws "\x5B" ws  # [ left square bracket
	begin_object    : ws "\x7B" ws  # { left curly bracket
	end_array       : ws "\x5D" ws  # ] right square bracket
	end_object      : ws "\x7D" ws  # } right curly bracket
	name_separator  : ws "\x3A" ws  # : colon
	value_separator : ws "\x2C" ws  # , comma
	ws              :               # <- this is the '*' in the ABNF
	                | ws "\x20"     # Space
	                | ws "\x09"     # Horizontal tab
	                | ws "\x0A"     # Line feed or New line
	                | ws "\x0D"     # Carriage return

	# RFC7159 section 3
	# The three literal names are spelled out character by character.
	value           : false | null | true | object | array | number | string
	false           : "\x66" "\x61" "\x6c" "\x73" "\x65" { [ :false ] } # false
	null            : "\x6e" "\x75" "\x6c" "\x6c"        { [ :null ]  } # null
	true            : "\x74" "\x72" "\x75" "\x65"        { [ :true ]  } # true

	# RFC7159 section 4
	# Each member becomes a two-element [ name, value ] Array.
	object          : begin_object end_object             { [ :object ] }
	                | begin_object __members__ end_object { [ :object, *val[1] ] }
	__members__     : member                              { val }
	                | __members__ value_separator member  { [ *val[0], val[2] ] }
	member          : string name_separator value         { [ val[0], val[2] ] }

	# RFC7159 section 5
	array           : begin_array end_array               { [ :array ] }
	                | begin_array __list__ end_array      { [ :array, *val[1] ] }
	__list__        : value                               { val }
	                | __list__ value_separator value      { [ *val[0], val[2] ] }

	# RFC7159 section 6
	# The "__xxx_p__" helpers are the optional ([ ... ]) parts of the ABNF;
	# each is either empty or the named nonterminal.
	number          : __minus_p__ int __frac_p__ __exp_p__ { [ :number, *val ] }
	__minus_p__     : | minus
	__frac_p__      : | frac
	__exp_p__       : | exp
	decimal_point   : "\x2E"                              # .
	digit1_9        : "\x31" | "\x32" | "\x33" | "\x34" | "\x35"
	                | "\x36" | "\x37" | "\x38" | "\x39"
	e               : "\x65" | "\x45"                     # e E
	exp             : e __sign__ __digit_plus__           { val }
	frac            : decimal_point __digit_plus__        { val }
	int             : zero                                { val }
	                | digit1_9                            { val }
	                | digit1_9 __digit_plus__             { [ val[0], *val[1] ] }
	minus           : "\x2D"                              # -
	plus            : "\x2B"                              # +
	zero            : "\x30"                              # 0
	DIGIT           : zero | digit1_9
	__sign__        : | plus | minus
	# One or more DIGITs, collected into a flat Array.
	__digit_plus__  : DIGIT                               { val }
	                | __digit_plus__ DIGIT                { [ *val[0], val[1] ] }

	# RFC7159 section 7
	string          : quotation_mark quotation_mark           { [ :string ] }
	                | quotation_mark __chars__ quotation_mark { [ :string, *val[1] ] }
	__chars__       : char                                { val }
	                | __chars__ char                      { [ *val[0], val[1] ] }
	# A char is either one "unescaped" token or a flat Array holding the
	# backslash followed by the escape designator (and, for \u, 4 hex digits).
	char            : unescaped
	                | escape __ctrl__                     { val.flatten }
	__ctrl__        : "\x22"        # "    quotation mark  U+0022
	                | "\x5C"        # \    reverse solidus U+005C
	                | "\x2F"        # /    solidus         U+002F
	                | "\x62"        # b    backspace       U+0008
	                | "\x66"        # f    form feed       U+000C
	                | "\x6E"        # n    line feed       U+000A
	                | "\x72"        # r    carriage return U+000D
	                | "\x74"        # t    tab             U+0009
	                | "\x75"        # uXXXX                U+XXXX
	                  HEXDIG HEXDIG HEXDIG HEXDIG         { val }
	escape          : "\x5C"                              # \
	quotation_mark  : "\x22"                              # "
	HEXDIG          : DIGIT
	                | "\x61" | "\x62" | "\x63" | "\x64" | "\x65" | "\x66"
	                | "\x41" | "\x42" | "\x43" | "\x44" | "\x45" | "\x46"

	# "unescaped" is too much to list up here; use lexer instead.
	# unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
end

---- inner

# Creates a parser.  Both options are merely stored in instance variables
# here; @accept_bom is consulted by #parse.
#
# @param [true, false] accept_bom  Whether to accept BOMs
# @param [true, false] yydebug     Whether to enable debug mode
def initialize accept_bom: false, yydebug: false
	@accept_bom = accept_bom
	@yydebug    = yydebug
end

# Parses str and generates AST.  The str must consist of _a_ valid JSON
# text, otherwise an exception shall raise.
#
# The encoding of the input is taken from its first character.  All lexer
# state is reset on every call, so one parser instance can be reused for
# several inputs, including inputs in different encodings.
#
# @param [#each_char] str  IO or String or something to parse
# @return [::Array]  Parsed AST
# @raise [Racc::ParseError]  The input is invalid (this includes empty input)
# @raise [Encoding::CompatibilityError]  The input is invalid
def parse str
	# Reset everything a previous #parse may have left behind.  The cached
	# regexps are bound to the previous input's encoding and @newline to its
	# last character, so neither may leak into this run.
	@state   = :init
	@newline = nil
	@nl      = @sp = @nm = nil
	@lineno  = 1
	@column  = 1
	@enum    = str.enum_for(:each_char)

	begin
		firstchar = @enum.peek
	rescue StopIteration
		# An empty input is not a JSON text.  Report it as a syntax error
		# rather than leaking StopIteration to the caller.
		raise Racc::ParseError,
			'Syntax error near line 1, char 1: unexpected end of input (empty JSON text)'
	end

	@enc = firstchar.encoding
	case @enc
	when Encoding::UTF_8,
	     Encoding::US_ASCII,  # true subset of UTF-8
	     Encoding::UTF8_MAC,  # true subset of UTF-8
	     Encoding::UTF_16LE,
	     Encoding::UTF_16BE,
	     Encoding::UTF_32LE,
	     Encoding::UTF_32BE
		# RFC7159 section 8.1 explicitly states that the input string must be
		# either UTF 8, 16, or 32 -encoded.  That point is as clear as the
		# sky.  All other encodings are NG.  However, what we call the ASCII
		# encoding is the true subset of UTF-8.  A string of ASCII must also
		# be valid as UTF-8.  So we allow this.
		#
		# There are discussions about parsing BOMs.  The original RFC4627 said
		# nothing about BOMs, however its section 3 ("Encoding") cannot be
		# read as if it expected BOMs.  Current RFC7159 _prohibits_ to
		# generate JSON texts with BOMs but _allows_ to accept.
		#
		# This parser can control whether to accept BOMs.
		#
		# The BOM is detected by code point, not by transcoding U+FEFF into
		# the input encoding: U+FEFF has no US-ASCII representation, so
		# transcoding would raise for plain ASCII inputs.
		if @accept_bom && firstchar.valid_encoding? && firstchar.ord == 0xFEFF
			@enum.next # consume
		end
		do_parse
	else
		raise Encoding::CompatibilityError, <<-"MESSAGE".gsub(/[\n\s]+/, ' ')
			``JSON text SHALL be encoded in UTF-8, UTF-16, or UTF-32'', said
			RFC7159 section 8.1.  The given string is NOT in any of those
			encodings (but #{@enc.inspect}).
		MESSAGE
	end
end

private

# @return [Regexp]  Matches CR or LF, compiled in the input's encoding.
def nl
	@nl ||= Regexp.new('[\r\n]'.encode(@enc))
end

# @return [Regexp]  Matches a white space, compiled in the input's encoding.
def sp
	# Must have its own cache slot; sharing @nl would return the newline
	# regexp that next_token has already cached.
	@sp ||= Regexp.new('\s'.encode(@enc))
end

# @return [Regexp]  Matches a decimal digit, compiled in the input's encoding.
def nm
	@nm ||= Regexp.new('\d'.encode(@enc))
end

# The lexer, called by the Racc runtime.  Every input character is one token.
# Outside of strings the token is the character itself (as a UTF-8 String),
# which is what the string-literal terminals of the grammar match.  Inside a
# string, characters allowed to appear unescaped are reported as :unescaped.
#
# @return [::Array]  [ token, character ], or [ false, enumerator ] when the
#   input is exhausted.
def next_token
	chr = @enum.next
	tok = chr.encode(Encoding::UTF_8) # default: the character is its own token

	# The line counter is bumped on the character that *follows* a newline.
	newline, @newline = @newline, nl.match(chr)
	if newline
		@lineno += 1
		@column  = 1
	else
		@column += 1
	end

	case @state
	when :string then
		# recap: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
		case chr.ord
		when 0x20..0x21     then tok    = :unescaped
		when 0x22           then @state = :init    # closing quotation mark
		when 0x23..0x5B     then tok    = :unescaped
		when 0x5C           then @state = :escaped # reverse solidus
		when 0x5D..0x10FFFF then tok    = :unescaped
		else                     @state = :string  # NG unicode
		end
	when :init    then @state = (chr.ord == '"'.ord) ? :string : :init
	when :escaped then @state = (chr.ord == 'u'.ord) ? :u1     : :string
	# :u1 .. :u4 count the four hex digits that follow "\u".
	when :u1      then @state = :u2
	when :u2      then @state = :u3
	when :u3      then @state = :u4
	when :u4      then @state = :string
	end

	return tok, chr
rescue StopIteration
	return false, @enum
end

# Error hook, called by the Racc runtime on a syntax error.  Builds a message
# with the current position and a best-effort hint about the likely mistake.
#
# @param [Object]  id     Token id of the offending token (unused)
# @param [Object]  val    Value of the offending token, i.e. the second
#   element returned by next_token
# @param [::Array] stack  Value stack of the parser (unused)
# @raise [Racc::ParseError]  Always
def on_error id, val, stack
	reason = case @state
		when :string
			'this character is not allowed in a string; escape it.'
		when :u1, :u2, :u3, :u4
			'\uXXXX must exactly be a four-letter hexadecimal sequence.'
		else
			case val
			when "'"           then 'you must use " to quote strings'
			when '}', ']', ',' then 'possible extra (dangling) comma?'
			when ':'           then 'possible confusion of {} vs []?'
			when sp            then 'possible space inside of a number?'
			when nm            then 'possible lack of +/- in exponent?'
			else                    'unexpected character'
			end
		end
	msg = sprintf 'Syntax error near line %d, char %d (%p) @ %p: %s',
		@lineno, @column, val, @enum, reason
	raise Racc::ParseError, msg
end

---- footer

#
# Local Variables:
# mode: ruby
# coding: utf-8-unix
# indent-tabs-mode: t
# tab-width: 3
# ruby-indent-level: 3
# fill-column: 79
# default-justification: full
# End: