# Notation3 in Notation3
# Context Free Grammar without tokenization
#
# Each @prefix needs a namespace IRI; a bare "@prefix rdf: ." is not valid N3
# and leaves every cfg:/rul:/: name in this grammar unresolvable.
# NOTE(review): the swap/grammar IRIs below are the published W3C locations
# of the BNF vocabulary and this grammar -- confirm against the deployed
# bnf.n3 / bnf-rules.n3 before relying on them.
@prefix rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs:   <http://www.w3.org/2000/01/rdf-schema#> .
@prefix cfg:    <http://www.w3.org/2000/10/swap/grammar/bnf#> .
@prefix rul:    <http://www.w3.org/2000/10/swap/grammar/bnf-rules#> .
@prefix :       <http://www.w3.org/2000/10/swap/grammar/n3#> .
@prefix n3:     <http://www.w3.org/2000/10/swap/grammar/n3#> .
@prefix list:   <http://www.w3.org/2000/10/swap/list#> .
@prefix string: <http://www.w3.org/2000/10/swap/string#> .

# With this declaration, bare words other than a/is/of (e.g. "document",
# "statement") are names in the default namespace.
@keywords a, is, of.

# Issues:
#   - string token regexp not right  FIXED
#   - tokenizing rules in general: whitespace are not defined in n3.n3
#     and it would be nice for the *entire* syntax description to be in RDF.
#   - encoding really needs specifying
#   - @keywords affects tokenizing
#   - comments (tokenizer deals with)
#   - We assume ASCII, in fact should use not notNameChars for i18n

# tokenizing:
#   Absorb anything until end of regexp, then still white space
#   period followed IMMEDIATELY by an opener or name char is taken as "!".
#   Except after a "." used instead of in those circumstances,
#     ws may be inserted between tokens.
#   WS MUST be inserted between tokens where ambiguity would arise.
#   (possible ending characters of one and beginning characters overlap)
#

# NOTE(review): the media-type IRI in this commented-out statement was
# missing and has been reconstructed -- confirm before re-enabling.
#<> cfg:syntaxFor [ cfg:internetMediaType
#       <http://www.w3.org/2003/mediatypes#application/n3> ].

# __________________________________________________________________
#
# The N3 Full Grammar

# The language as a whole: its start production and whitespace placeholder.
language a cfg:Language;
    cfg:document document;
    cfg:whiteSpace "@@@@@".

# Start production: optional statements followed by end of file.
document a rul:Used;
    cfg:mustBeOneSequence(
        (
#           [ cfg:zeroOrMore declaration ]
#           [ cfg:zeroOrMore universal ]
#           [ cfg:zeroOrMore existential ]
            statements_optional
            cfg:eof
        )
    ).

# At document level every statement is terminated by a period.
statements_optional cfg:mustBeOneSequence (
        ( )
        ( statement "." statements_optional )
    ).

# Formula does NOT need period on last statement

formulacontent cfg:mustBeOneSequence (
        ( statementlist )
    ).

statementlist cfg:mustBeOneSequence (
        ( )
        ( statement statementtail )
    ).

statementtail cfg:mustBeOneSequence (
        ( )
        ( "." statementlist )
    ).

# A statement is a directive, a quantifier declaration or a simple statement.
statement cfg:mustBeOneSequence (
        ( declaration )
        ( universal )
        ( existential )
        ( simpleStatement )
    ).

universal cfg:mustBeOneSequence (
        ( "@forAll" [ cfg:commaSeparatedListOf symbol ] )
    ).

existential cfg:mustBeOneSequence (
        ( "@forSome" [ cfg:commaSeparatedListOf symbol ] )
    ).
# Directives: base IRI, namespace prefix binding, keyword list.
declaration cfg:mustBeOneSequence (
        ( "@base" explicituri )
        ( "@prefix" prefix explicituri )
        ( "@keywords" [ cfg:commaSeparatedListOf barename ] )
    ).

# subject followed by a (possibly empty) property list
simpleStatement cfg:mustBeOneSequence (
        ( subject propertylist )
    ).

# predicate/object groups, separated by ";"
propertylist cfg:mustBeOneSequence (
        ( )
        ( predicate object objecttail propertylisttail )
    ).

propertylisttail cfg:mustBeOneSequence (
        ( )
        ( ";" propertylist )
    ).

# further objects for the same predicate, separated by ","
objecttail cfg:mustBeOneSequence (
        ( )
        ( "," object objecttail )
    ).

# A predicate is an expression, optionally wrapped in @has / @is ... @of,
# or one of the shorthand tokens.
predicate cfg:mustBeOneSequence (
        ( expression )
        ( "@has" expression )
        ( "@is" expression "@of" )
        ( "@a" )
        ( "=" )
        ( "=>" )
        ( "<=" )
    ).

subject cfg:mustBeOneSequence (
        ( expression )
    ).

object cfg:mustBeOneSequence (
        ( expression )
    ).

# An expression is a path item followed by an optional "!" / "^" path step.
expression cfg:mustBeOneSequence (
        ( pathitem pathtail )
    ).

pathtail cfg:mustBeOneSequence (
        ( )
        ( "!" expression )
        ( "^" expression )
    ).

pathitem cfg:mustBeOneSequence (
        ( symbol )
        ( "{" formulacontent "}" )
        ( quickvariable )
        ( numericliteral )
        ( literal )
        ( "[" propertylist "]" )
        ( "(" pathlist ")" )
        ( boolean )
#       ( "@this" )   # Deprecated. Was allowed for this log:forAll x
    ).

boolean cfg:mustBeOneSequence (
        ( "@true" )
        ( "@false" )
    ) .

# contents of a "( ... )" list: zero or more expressions
pathlist cfg:mustBeOneSequence (
        ( )
        ( expression pathlist )
    ).

symbol cfg:mustBeOneSequence (
        ( explicituri )
        ( qname )
    ).

numericliteral cfg:mustBeOneSequence (
        ( integer )
        ( rational )
        ( double )
        ( decimal )
    ) .

rational cfg:mustBeOneSequence (
        ( integer "/" unsignedint )
    ).

# a string with an optional language tag or datatype
literal cfg:mustBeOneSequence (
        ( string dtlang )
    ).

dtlang cfg:mustBeOneSequence (
        ( )
        ( "@" langcode )
        ( "^^" symbol )
    ).

#______________________________________________________________________
#
#  TERMINALS
#
#  "canStartWith" actually gives "a" for the whole class of alpha characters
#  and "0" for any of the digits 0-9. This is used to build the branching
#  tables.
#

# optionally signed run of digits
integer cfg:matches """[-+]?[0-9]+""";
    cfg:canStartWith "0", "-", "+".

# run of digits with no sign
unsignedint cfg:matches """[0-9]+""";
    cfg:canStartWith "0".

# the exponent part is mandatory; the fraction part is optional
double cfg:matches """[-+]?[0-9]+(\\.[0-9]+)?([eE][-+]?[0-9]+)""";
    cfg:canStartWith "0", "-", "+".
# A decimal is an optionally signed digit run with an optional fraction.
# A fraction, when present, needs at least one digit after the point:
# with "\\.[0-9]*" the token "1." would match and swallow the period that
# terminates a statement such as ":a :b 1.". The fraction sub-pattern is the
# same one "double" uses.
decimal cfg:matches """[-+]?[0-9]+(\\.[0-9]+)?""";
    cfg:canStartWith "0", "-", "+".

#numericliteral cfg:matches """[-+]?[0-9]+(\\.[0-9]+)?(e[-+]?[0-9]+)?""";
#   cfg:canStartWith "0", "-", "+".

# an IRI written between angle brackets
explicituri cfg:matches "<[^>]*>";
    cfg:canStartWith "<".

# Namespace prefix: an optional name followed by a colon.
prefix cfg:matches "([A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff][\\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff]*)?:";
    cfg:canStartWith "a", "_", ":". # @@ etc unicode

# Qualified name: an optional "prefix:" part followed by a local name.
qname cfg:matches "(([A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff][\\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff]*)?:)?[A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff][\\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff]*";
    cfg:canStartWith "a", "_", ":". # @@ etc unicode

# ASCII version:
#barename cfg:matches "[a-zA-Z_][a-zA-Z0-9_]*";   # subset of qname
#   cfg:canStartWith "a", "_".
# @@ etc

# This is the XML1.1
# Bare name: a name-start character followed by any number of name characters.
barename cfg:matches "[A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff][\\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff]*";
    cfg:canStartWith "a", "_". # @@ etc .

# as far as I can tell, the regexp should be
# barename cfg:matches "[A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff][\\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff]*" .
#

# Quick variable: "?" immediately followed by a barename.
quickvariable cfg:matches "\\?[A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff][\\-0-9A-Z_a-z\u00b7\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u037d\u037f-\u1fff\u200c-\u200d\u203f-\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\U00010000-\U000effff]*"; # ? barename
    cfg:canStartWith "?". #

#  Maybe dtlang should just be part of string regexp?
#  Whitespace is not allowed

# Language tag: lower-case letters, then any number of "-"-separated
# lower-case alphanumeric subtags.
#   was: "[a-zA-Z][a-zA-Z0-9]*(-[a-zA-Z0-9]+)?";
langcode cfg:matches "[a-z]+(-[a-z0-9]+)*"; # http://www.w3.org/TR/rdf-testcases/#language
    cfg:canStartWith "a".

# raw regexp single quoted would be "([^"]|(\\"))*"
# See:
# $ PYTHONPATH=$SWAP python
# >>> import tokenize
# >>> import notation3
# >>> print notation3.stringToN3(tokenize.Double3)
# "[^\"\\\\]*(?:(?:\\\\.|\"(?!\"\"))[^\"\\\\]*)*\"\"\""
# >>> print notation3.stringToN3(tokenize.Double)
# "[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""
# After that we have to prefix with one or three opening \" which
# the python regexp doesn't have.
#
# The two alternatives of the string terminal, shown separately:
# string3 cfg:matches "\"\"\"[^\"\\\\]*(?:(?:\\\\.|\"(?!\"\"))[^\"\\\\]*)*\"\"\"".
# string1 cfg:matches "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"".

# String literal: the triple-quoted form is tried first, then the
# single-quoted form; both allow backslash escapes.
string cfg:matches "(\"\"\"[^\"\\\\]*(?:(?:\\\\.|\"(?!\"\"))[^\"\\\\]*)*\"\"\")|(\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\")";
    cfg:canStartWith "\"".

#ends