# -*- encoding: utf-8 -*- require 'strscan' module RDF::NTriples ## # N-Triples parser. # # @example Obtaining an NTriples reader class # RDF::Reader.for(:ntriples) #=> RDF::NTriples::Reader # RDF::Reader.for("etc/doap.nt") # RDF::Reader.for(file_name: "etc/doap.nt") # RDF::Reader.for(file_extension: "nt") # RDF::Reader.for(content_type: "application/n-triples") # # @example Parsing RDF statements from an NTriples file # RDF::NTriples::Reader.open("etc/doap.nt") do |reader| # reader.each_statement do |statement| # puts statement.inspect # end # end # # @example Parsing RDF statements from an NTriples string # data = StringIO.new(File.read("etc/doap.nt")) # RDF::NTriples::Reader.new(data) do |reader| # reader.each_statement do |statement| # puts statement.inspect # end # end # # ** RDF=star # # Supports statements as resources using `<<s p o>>`. # # @see http://www.w3.org/TR/rdf-testcases/#ntriples # @see http://www.w3.org/TR/n-triples/ class Reader < RDF::Reader include RDF::Util::Logger format RDF::NTriples::Format # @see http://www.w3.org/TR/rdf-testcases/#ntrip_strings ESCAPE_CHARS = ["\b", "\f", "\t", "\n", "\r", "\"", "'", "\\"].freeze UCHAR4 = /\\u([0-9A-Fa-f]{4,4})/.freeze UCHAR8 = /\\U([0-9A-Fa-f]{8,8})/.freeze UCHAR = Regexp.union(UCHAR4, UCHAR8).freeze # Terminals from rdf-turtle. # # @see http://www.w3.org/TR/n-triples/ # @see http://www.w3.org/TR/turtle/ ## # Unicode regular expressions. U_CHARS1 = Regexp.compile(<<-EOS.gsub(/\s+/, '')) [\\u00C0-\\u00D6]|[\\u00D8-\\u00F6]|[\\u00F8-\\u02FF]| [\\u0370-\\u037D]|[\\u037F-\\u1FFF]|[\\u200C-\\u200D]| [\\u2070-\\u218F]|[\\u2C00-\\u2FEF]|[\\u3001-\\uD7FF]| [\\uF900-\\uFDCF]|[\\uFDF0-\\uFFFD]|[\\u{10000}-\\u{EFFFF}] EOS U_CHARS2 = Regexp.compile("\\u00B7|[\\u0300-\\u036F]|[\\u203F-\\u2040]").freeze IRI_RANGE = Regexp.compile("[[^<>\"{}\|\^`\\\\]&&[^\\x00-\\x20]]").freeze PN_CHARS_BASE = /[A-Z]|[a-z]|#{U_CHARS1}/.freeze PN_CHARS_U = /_|#{PN_CHARS_BASE}/.freeze PN_CHARS = /-|[0-9]|#{PN_CHARS_U}|#{U_CHARS2}/.freeze ECHAR = /\\[tbnrf"'\\]/.freeze IRIREF = /<((?:#{IRI_RANGE}|#{UCHAR})*)>/.freeze BLANK_NODE_LABEL = /_:((?:[0-9]|#{PN_CHARS_U})(?:(?:#{PN_CHARS}|\.)*#{PN_CHARS})?)/.freeze LANG_DIR = /@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*(?:--[a-zA-Z]+)?)/.freeze STRING_LITERAL_QUOTE = /"((?:[^\"\\\n\r]|#{ECHAR}|#{UCHAR})*)"/.freeze ST_START = /^<</.freeze ST_END = /^\s*>>/.freeze # @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar COMMENT = /^#\s*(.*)$/.freeze NODEID = /^#{BLANK_NODE_LABEL}/.freeze URIREF = /^#{IRIREF}/.freeze LITERAL_PLAIN = /^#{STRING_LITERAL_QUOTE}/.freeze LITERAL_WITH_LANGUAGE = /^#{STRING_LITERAL_QUOTE}#{LANG_DIR}/.freeze LITERAL_WITH_DATATYPE = /^#{STRING_LITERAL_QUOTE}\^\^#{IRIREF}/.freeze DATATYPE_URI = /^\^\^#{IRIREF}/.freeze LITERAL = Regexp.union(LITERAL_WITH_LANGUAGE, LITERAL_WITH_DATATYPE, LITERAL_PLAIN).freeze SUBJECT = Regexp.union(URIREF, NODEID).freeze PREDICATE = Regexp.union(URIREF).freeze OBJECT = Regexp.union(URIREF, NODEID, LITERAL).freeze END_OF_STATEMENT = /^\s*\.\s*(?:#.*)?$/.freeze # LANGTAG is deprecated LANGTAG = LANG_DIR ## # Reconstructs an RDF value from its serialized N-Triples # representation. # # @param [String] input # @param [{Symbol => Object}] options # From {RDF::Reader#initialize} # @option options [RDF::Util::Logger] :logger ([]) # @return [RDF::Term] def self.unserialize(input, **options) case input when nil then nil else self.new(input, logger: [], **options).read_value end end ## # (see unserialize) # @return [RDF::Resource] def self.parse_subject(input, **options) parse_uri(input, **options) || parse_node(input, **options) end ## # (see unserialize) # @return [RDF::URI] def self.parse_predicate(input, **options) parse_uri(input, intern: true) end ## # (see unserialize) def self.parse_object(input, **options) parse_uri(input, **options) || parse_node(input, **options) || parse_literal(input, **options) end ## # (see unserialize) # @return [RDF::Node] def self.parse_node(input, **options) if input =~ NODEID RDF::Node.new($1) end end ## # (see unserialize) # @param [Boolean] intern (false) Use Interned URI # @return [RDF::URI] def self.parse_uri(input, intern: false, **options) if input =~ URIREF RDF::URI.send(intern ? :intern : :new, unescape($1)) end end ## # (see unserialize) # @return [RDF::Literal] def self.parse_literal(input, **options) case input when LITERAL_WITH_LANGUAGE RDF::Literal.new(unescape($1), language: $4) when LITERAL_WITH_DATATYPE RDF::Literal.new(unescape($1), datatype: $4) when LITERAL_PLAIN RDF::Literal.new(unescape($1)) end end # cache constants to optimize escaping the escape chars in self.unescape ESCAPE_CHARS_ESCAPED = { "\\b" => "\b", "\\f" => "\f", "\\t" => "\t", "\\n" => "\n", "\\r" => "\r", "\\\"" => "\"", "\\'" => "'", "\\\\" => "\\" } .freeze ESCAPE_CHARS_ESCAPED_REGEXP = Regexp.union( ESCAPE_CHARS_ESCAPED.keys ).freeze ## # @param [String] string # @return [String] # @see http://www.w3.org/TR/rdf-testcases/#ntrip_strings # @see http://blog.grayproductions.net/articles/understanding_m17n # @see http://yehudakatz.com/2010/05/17/encodings-unabridged/ def self.unescape(string) # Note: avoiding copying the input string when no escaping is needed # greatly reduces the number of allocations and the processing time. string = string.dup.force_encoding(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8 scanner = StringScanner.new(string) buffer = "" while !scanner.eos? buffer << if scanner.scan(ESCAPE_CHARS_ESCAPED_REGEXP) ESCAPE_CHARS_ESCAPED[scanner.matched] elsif scanner.scan(UCHAR) scanner.matched.sub(UCHAR) {[($1 || $2).hex].pack('U*')} else # Scan one character scanner.getch end end buffer end ## # @return [RDF::Term] def read_value begin read_statement rescue RDF::ReaderError value = read_uriref || read_node || read_literal || read_quotedTriple log_recover value end end ## # @return [Array] # @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar def read_triple loop do readline.strip! # EOFError thrown on end of input line = @line # for backtracking input in case of parse error begin unless blank? || read_comment subject = read_uriref || read_node || read_quotedTriple || fail_subject predicate = read_uriref(intern: true) || fail_predicate object = read_uriref || read_node || read_literal || read_quotedTriple || fail_object if validate? && !read_eos log_error("Expected end of statement (found: #{current_line.inspect})", lineno: lineno, exception: RDF::ReaderError) end return [subject, predicate, object] end rescue RDF::ReaderError => e @line = line # this allows #read_value to work raise e end end end ## # @return [RDF::Statement] def read_quotedTriple if @options[:rdfstar] && match(ST_START) subject = read_uriref || read_node || read_quotedTriple || fail_subject predicate = read_uriref(intern: true) || fail_predicate object = read_uriref || read_node || read_literal || read_quotedTriple || fail_object if !match(ST_END) log_error("Expected end of statement (found: #{current_line.inspect})", lineno: lineno, exception: RDF::ReaderError) end RDF::Statement.new(subject, predicate, object, quoted: true) end end ## # @return [Boolean] # @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (comment) def read_comment match(COMMENT) end ## # @param [Boolean] intern (false) Use Interned Node # @return [RDF::URI] # @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (uriref) def read_uriref(intern: false, **options) if uri_str = match(URIREF) uri_str = self.class.unescape(uri_str) uri = RDF::URI.send(intern? && intern ? :intern : :new, uri_str) uri.validate! if validate? uri.canonicalize! if canonicalize? uri end rescue ArgumentError log_error("Invalid URI (found: \"<#{uri_str}>\")", lineno: lineno, token: "<#{uri_str}>", exception: RDF::ReaderError) end ## # @return [RDF::Node] # @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (nodeID) def read_node if node_id = match(NODEID) @nodes ||= {} @nodes[node_id] ||= RDF::Node.new(node_id) end end ## # @return [RDF::Literal] # @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (literal) def read_literal if literal_str = match(LITERAL_PLAIN) literal_str = self.class.unescape(literal_str) literal = case when lang_dir = match(LANG_DIR) language, direction = lang_dir.split('--') raise ArgumentError if direction && !@options[:rdfstar] RDF::Literal.new(literal_str, language: language, direction: direction) when datatype = match(/^(\^\^)/) # FIXME RDF::Literal.new(literal_str, datatype: read_uriref || fail_object) else RDF::Literal.new(literal_str) # plain string literal end literal.validate! if validate? literal.canonicalize! if canonicalize? literal end rescue ArgumentError v = literal_str v += "@#{lang_dir}" if lang_dir log_error("Invalid Literal (found: \"#{v}\")", lineno: lineno, token: "#v", exception: RDF::ReaderError) end ## # @return [Boolean] # @see http://www.w3.org/TR/rdf-testcases/#ntrip_grammar (triple) def read_eos match(END_OF_STATEMENT) end end # Reader end # RDF::NTriples