lib/json/ld/writer.rb in json-ld-0.0.3 vs lib/json/ld/writer.rb in json-ld-0.0.4

- old
+ new

module JSON::LD
  ##
  # A JSON-LD serializer in Ruby.
  #
  # Note that the natural interface is to write a whole graph at a time.
  # Writing statements or Triples will create a graph to add them to
  # and then serialize the graph.
  #
  # @example Obtaining a JSON-LD writer class
  #   RDF::Writer.for(:jsonld)         #=> JSON::LD::Writer
  #   RDF::Writer.for("etc/test.json")
  #   RDF::Writer.for(:file_name      => "etc/test.json")
  #   RDF::Writer.for(:file_extension => "json")
  #   RDF::Writer.for(:content_type   => "application/json")
  #
  # @example Serializing RDF graph into an JSON-LD file
  #   JSON::LD::Writer.open("etc/test.json") do |writer|
  #     writer << graph
  #   end
  #
  # @example Serializing RDF statements into an JSON-LD file
  #   JSON::LD::Writer.open("etc/test.json") do |writer|
  #     graph.each_statement do |statement|
  #       writer << statement
  #     end
  #   end
  #
  # @example Serializing RDF statements into an JSON-LD string
  #   JSON::LD::Writer.buffer do |writer|
  #     graph.each_statement do |statement|
  #       writer << statement
  #     end
  #   end
  #
  # The writer will add prefix definitions, and use them for creating @context definitions, and minting CURIEs
  #
  # @example Creating @base, @vocab and @context prefix definitions in output
  #   JSON::LD::Writer.buffer(
  #     :base_uri => "http://example.com/",
  #     :vocab => "http://example.net/",
  #     :prefixes => {
  #       nil => "http://example.com/ns#",
  #       :foaf => "http://xmlns.com/foaf/0.1/"}
  #   ) do |writer|
  #     graph.each_statement do |statement|
  #       writer << statement
  #     end
  #   end
  #
  # Select the :canonicalize option to output JSON-LD in canonical form
  #
  # @see http://json-ld.org/spec/ED/20110507/
  # @see http://json-ld.org/spec/ED/20110507/#the-normalization-algorithm
  # @author [Gregg Kellogg](http://greggkellogg.net/)
  class Writer < RDF::Writer
    format Format

    # @attr [Graph] Graph of statements serialized
    attr_reader :graph
    # @attr [URI] Base IRI used for relativizing IRIs
    attr_reader :base_uri
    # @attr [String] Vocabulary prefix used for relativizing IRIs
    attr_reader :vocab

    # Type coercion to use for serialization. Defaults to DEFAULT_COERCE
    #
    # Maintained as a reverse mapping of `property` => `type`.
    #
    # @attr [Hash{RDF::URI => RDF::URI}]
    attr_accessor :coerce

    ##
    # Return the pre-serialized Hash before turning into JSON
    #
    # A Hash is handed to the writer as its output; #write_epilogue merges
    # the generated structure into it instead of writing JSON text.
    #
    # NOTE(review): this class method replaces the `hash` method that the
    # class object would otherwise inherit; kept for interface compatibility.
    #
    # @return [Hash]
    def self.hash(*args, &block)
      result = {}
      self.new(result, *args, &block)
      result
    end

    ##
    # Initializes the RDF-LD writer instance.
    #
    # @param  [IO, File] output
    #   the output stream
    # @param  [Hash{Symbol => Object}] options
    #   any additional options
    # @option options [Encoding] :encoding     (Encoding::UTF_8)
    #   the encoding to use on the output stream (Ruby 1.9+)
    # @option options [Boolean]  :canonicalize (false)
    #   whether to canonicalize literals when serializing
    # @option options [Hash]     :prefixes     (Hash.new)
    #   the prefix mappings to use (not supported by all writers)
    # @option options [#to_s]    :base_uri     (nil)
    #   Base IRI used for relativizing IRIs
    # @option options [#to_s]    :vocab        (nil)
    #   Vocabulary prefix used for relativizing IRIs
    # @option options [Boolean]  :standard_prefixes   (false)
    #   Add standard prefixes to @prefixes, if necessary.
    # @option options [Hash]     :coerce       (nil)
    #   Additional coercion rules, merged over DEFAULT_COERCE
    # @yield  [writer] `self`
    # @yieldparam  [RDF::Writer] writer
    # @yieldreturn [void]
    def initialize(output = $stdout, options = {}, &block)
      super do
        @graph = RDF::Graph.new
        # Reverse lookup (IRI => prefix) of the default context, without the coercion entry
        @iri_to_prefix = DEFAULT_CONTEXT.dup.delete_if {|k, _v| k == COERCE}.invert
        @coerce = DEFAULT_COERCE.merge(options[:coerce] || {})
        if block_given?
          case block.arity
            when 0 then instance_eval(&block)
            else block.call(self)
          end
        end
      end
    end

    ##
    # Write whole graph
    #
    # Replaces the graph to be serialized with the given one.
    #
    # @param  [Graph] graph
    # @return [void]
    def write_graph(graph)
      add_debug "Add graph #{graph.inspect}"
      @graph = graph
    end

    ##
    # Adds a statement to be serialized
    # @param  [RDF::Statement] statement
    # @return [void]
    def write_statement(statement)
      @graph.insert(statement)
    end

    ##
    # Adds a triple to be serialized
    # @param  [RDF::Resource] subject
    # @param  [RDF::URI]      predicate
    # @param  [RDF::Value]    object
    # @return [void]
    def write_triple(subject, predicate, object)
      # Fully qualified: a bare `Statement` is not resolvable from within JSON::LD
      @graph.insert(RDF::Statement.new(subject, predicate, object))
    end

    ##
    # Outputs the Serialized JSON-LD representation of all stored triples.
    #
    # If the output is a Hash, the generated structure is merged into it;
    # otherwise it is written to the output as JSON text (compact when
    # canonicalizing, indented otherwise). Nothing is emitted when there are
    # no subjects to serialize.
    #
    # @return [void]
    # @see    #write_triple
    def write_epilogue
      # Base IRI and vocabulary are not used for canonical output
      @base_uri = RDF::URI(@options[:base_uri]) if @options[:base_uri] && !@options[:canonicalize]
      @vocab = @options[:vocab] unless @options[:canonicalize]
      @debug = @options[:debug]

      reset

      add_debug "\nserialize: graph: #{@graph.size}"

      preprocess

      # Don't generate context for canonical output
      json_hash = @options[:canonicalize] ? {} : start_document

      elements = []
      order_subjects.each do |subj|
        # Subjects already emitted nested inside another subject are skipped.
        # json_hash is handed over as the options argument of #subject.
        elements << subject(subj, json_hash) unless is_done?(subj)
      end

      return if elements.empty?

      # A single subject definition is merged into the top-level object,
      # anything else is listed under the subject key
      if elements.length == 1 && elements.first.is_a?(Hash)
        json_hash.merge!(elements.first)
      else
        json_hash[SUBJECT] = elements
      end

      if @output.is_a?(Hash)
        @output.merge!(json_hash)
      else
        json_state = if @options[:canonicalize]
          JSON::State.new(
            :indent       => "",
            :space        => "",
            :space_before => "",
            :object_nl    => "",
            :array_nl     => ""
          )
        else
          JSON::State.new(
            :indent       => "  ",
            :space        => " ",
            :space_before => "",
            :object_nl    => "\n",
            :array_nl     => "\n"
          )
        end
        @output.write(json_hash.to_json(json_state))
      end
    end

    ##
    # Returns the representation of a IRI reference.
    #
    # Subjects are shortened against the base IRI, predicates against the
    # vocabulary (rdf:type becomes TYPE); when neither applies a CURIE is
    # attempted, falling back to the full IRI string. Any other position is
    # encoded like a subject, wrapped in `{:iri => ...}` unless the property
    # has an IRI range.
    #
    # Spec confusion: should a subject URI be normalized?
    #
    # @param  [RDF::URI] value
    # @param  [Hash{Symbol => Object}] options
    # @option options [:subject, :predicate, :object] position
    #   Useful when determining how to serialize.
    # @option options [RDF::URI] property
    #   Property for object reference, which can be used to return
    #   bare strings, rather than {"iri":}
    # @return [Object]
    def format_uri(value, options = {})
      result = case options[:position]
      when :subject
        # attempt base_uri replacement (leading prefix only)
        short = remove_iri_prefix(value.to_s, base_uri.to_s)
        short == value.to_s ? (get_curie(value) || value.to_s) : short
      when :predicate
        # attempt vocab replacement (leading prefix only)
        short = TYPE if value == RDF.type
        short ||= remove_iri_prefix(value.to_s, @vocab.to_s)
        short == value.to_s ? (get_curie(value) || value.to_s) : short
      else
        # Encode like a subject
        iri_range?(options[:property]) ?
          format_uri(value, :position => :subject) :
          {:iri => format_uri(value, :position => :subject)}
      end

      add_debug("format_uri(#{options.inspect}, #{value.inspect}) => #{result.inspect}")
      result
    end

    ##
    # Returns the representation of a blank node; delegates to #format_uri.
    #
    # @param  [RDF::Node] value
    # @param  [Hash{Symbol => Object}] options
    # @return [Object]
    def format_node(value, options = {})
      format_uri(value, options)
    end

    ##
    # Returns the representation of a literal.
    #
    # Canonical form is a Hash with :literal and, when present, :datatype and
    # :language. Otherwise integers and booleans become native values, and
    # other literals become their lexical value when plain or when the
    # property's coercion covers the datatype.
    #
    # @param  [RDF::Literal] literal
    # @param  [Hash{Symbol => Object}] options
    # @option options [RDF::URI] property
    #   Property referencing literal for type coercion
    # @option options [Boolean] canonical
    #   Force the expanded Hash form
    # @return [Object]
    def format_literal(literal, options = {})
      if options[:canonical] || @options[:canonicalize]
        return {
          :literal => literal.value,
          :datatype => (format_uri(literal.datatype, :position => :subject) if literal.has_datatype?),
          :language => (literal.language.to_s if literal.has_language?)
        }.delete_if {|_k, v| v.nil?}
      end

      case literal
      when RDF::Literal::Integer, RDF::Literal::Boolean
        literal.object
      when RDF::Literal
        if datatype_range?(options[:property]) || !(literal.has_datatype? || literal.has_language?)
          # Datatype coercion where literal has the same datatype
          literal.value
        else
          format_literal(literal, :canonical => true)
        end
      end
    end

    ##
    # Serialize an RDF list
    #
    # Walks the rdf:first/rdf:rest chain starting at `object`, marking every
    # list node as done. Traversal stops at the end of the chain, or when a
    # node is met a second time (cyclic rdf:rest).
    #
    # @param  [RDF::URI] object
    # @param  [Hash{Symbol => Object}] options
    # @option options [RDF::URI] property
    #   Property referencing literal for type coercion
    # @return [Array<Array<Object>>]
    def format_list(object, options = {})
      predicate = options[:property]
      list = []
      visited = {}

      add_debug "format_list(#{object}, #{predicate})"

      @depth += 1
      while object && !visited[object]
        visited[object] = true
        subject_done(object)
        props = @graph.properties(object)
        item = props.fetch(RDF.first.to_s, []).first
        if item
          add_debug "format_list serialize #{item.inspect}"
          list << if predicate || item.literal?
            property(predicate, item)
          else
            subject(item)
          end
        end
        object = props.fetch(RDF.rest.to_s, []).first
      end
      @depth -= 1

      # Returns
      add_debug "format_list => #{[list].inspect}"
      [list]
    end

    private
    ##
    # Generate @context
    #
    # Includes base, vocab, the prefixes collected so far and any coercion
    # rules differing from the defaults.
    #
    # @return [Hash] Hash with the context entry, or an empty Hash
    def start_document
      ctx = {}
      ctx[BASE] = base_uri.to_s if base_uri
      ctx[VOCAB] = vocab.to_s if vocab

      # Prefixes
      prefixes.keys.sort_by(&:to_s).each do |k|
        next if DEFAULT_CONTEXT.has_key?(k.to_s)
        add_debug "prefix[#{k}] => #{prefixes[k]}"
        ctx[k.to_s] = prefixes[k].to_s
      end

      # Coerce
      add_debug "start_doc: coerce= #{coerce.inspect}"
      unless coerce == DEFAULT_COERCE
        c_h = {}
        coerce.keys.sort.each do |k|
          # Skip defaults, "no coercion" markers and the natively represented types
          next if coerce[k] == DEFAULT_COERCE[k] ||
                  coerce[k] == false ||
                  coerce[k] == RDF::XSD.integer ||
                  coerce[k] == RDF::XSD.boolean
          k_iri = format_uri(k, :position => :predicate)
          d_iri = format_uri(coerce[k], :position => :subject)
          add_debug "coerce[#{k_iri}] => #{d_iri}"
          # Datatype maps to a single property, or an array of properties
          case c_h[d_iri]
          when nil
            c_h[d_iri] = k_iri
          when Array
            c_h[d_iri] << k_iri
          else
            c_h[d_iri] = [c_h[d_iri], k_iri]
          end
        end

        ctx[COERCE] = c_h unless c_h.empty?
      end

      add_debug "start_doc: context=#{ctx.inspect}"
      # Return hash with @context, or empty
      ctx.empty? ? {} : {CONTEXT => ctx}
    end

    # Perform any preprocessing of statements required
    def preprocess
      # Load defined prefixes
      (@options[:prefixes] || {}).each_pair do |k, v|
        @iri_to_prefix[v.to_s] = k
      end
      @options[:prefixes] = {}  # Will define actual used when matched

      @graph.each {|statement| preprocess_statement(statement)}
    end

    # Perform any statement preprocessing required. This is used to perform reference counts and determine required
    # prefixes.
    # @param [Statement] statement
    def preprocess_statement(statement)
      add_debug "preprocess: #{statement.inspect}"
      references = ref_count(statement.object) + 1
      @references[statement.object] = references
      @subjects[statement.subject] = true

      # Pre-fetch qnames, to fill prefixes
      get_curie(statement.subject)
      get_curie(statement.predicate)
      if statement.object.literal?
        datatype_range?(statement.predicate)  # To figure out coercion requirements
      else
        iri_range?(statement.predicate)
        get_curie(statement.object)
      end

      @references[statement.predicate] = ref_count(statement.predicate) + 1
    end

    # Serialize a subject
    # Option contains referencing property, if this is recursive
    # @param [RDF::Resource] subject
    # @param [Hash] options
    # @option options [RDF::URI] property Referencing property
    # @return [Hash, Array] definition of the subject; a bare list when the
    #   subject is a list without further properties
    # @raise [RDF::WriterError] if the subject is not a resource, or one of
    #   its predicates is not a URI
    def subject(subject, options = {})
      defn = {}

      raise RDF::WriterError, "Illegal use of subject #{subject.inspect}, not supported" unless subject.resource?

      subject_done(subject)
      properties = @graph.properties(subject)
      add_debug "subject: #{subject.inspect}, props: #{properties.inspect}"

      @graph.query(:subject => subject).each do |st|
        raise RDF::WriterError, "Illegal use of predicate #{st.predicate.inspect}, not supported in JSON-LD" unless st.predicate.uri?
      end

      # NOTE(review): `options` is this method's own argument, not the writer's
      # @options, so this only fires when a caller passes :canonicalize
      # explicitly -- confirm whether @options[:canonicalize] was intended.
      if subject.node? && ref_count(subject) > (options[:property] ? 1 : 0) && options[:canonicalize]
        raise RDF::WriterError, "Can't serialize named node when normalizing"
      end

      # Subject may be a list
      if is_valid_list?(subject)
        add_debug "subject is a list"
        defn[SUBJECT] = format_list(subject)
        properties.delete(RDF.first.to_s)
        properties.delete(RDF.rest.to_s)

        # Special case, if there are no properties, then we can just serialize the list itself
        return defn[SUBJECT].first if properties.empty?
      elsif subject.uri? || ref_count(subject) > 1
        add_debug "subject is a uri"
        # Don't need to set subject if it's a Node without references
        defn[SUBJECT] = format_uri(subject, :position => :subject)
      else
        add_debug "subject is an unreferenced BNode"
      end

      prop_list = order_properties(properties)
      #add_debug "=> property order: #{prop_list.to_sentence}"

      prop_list.each do |prop|
        predicate = RDF::URI.intern(prop)

        p_iri = format_uri(predicate, :position => :predicate)
        @depth += 1
        defn[p_iri] = property(predicate, properties[prop])
        add_debug "prop(#{p_iri}) => #{properties[prop]} => #{defn[p_iri].inspect}"
        @depth -= 1
      end

      add_debug "subject: #{subject} has defn: #{defn.inspect}"
      defn
    end

    ##
    # Serialize objects for a property
    #
    # A single-element array is unwrapped; multiple values are serialized in
    # order of their string form. Resources that are subjects in the graph and
    # not yet serialized are embedded, others are emitted as references.
    #
    # Spec confusion: sorting of multi-valued properties not adequately specified.
    #
    # @param [RDF::URI] predicate
    # @param [Array<RDF::URI>, RDF::URI] objects
    # @param [Hash{Symbol => Object}] options
    # @return [Array, Hash, Object]
    def property(predicate, objects, options = {})
      objects = objects.first if objects.is_a?(Array) && objects.length == 1
      case objects
      when Array
        objects.sort_by(&:to_s).map {|o| property(predicate, o, options)}
      when RDF::Literal
        format_literal(objects, options.merge(:property => predicate))
      else
        if is_valid_list?(objects)
          format_list(objects, :property => predicate)
        elsif is_done?(objects) || !@subjects.include?(objects)
          format_uri(objects, :position => :object, :property => predicate)
        else
          subject(objects, :property => predicate)
        end
      end
    end

    ##
    # Return a CURIE for the IRI, or nil. Adds namespace of CURIE to defined prefixes
    #
    # Blank nodes are returned in their string form. When canonicalizing, the
    # full IRI string is returned. Results (including nil) are cached per IRI.
    #
    # @param [RDF::Resource] resource
    # @return [String, nil] value to use to identify IRI
    # @raise [RDF::WriterError] if the IRI is invalid
    def get_curie(resource)
      add_debug "get_curie(#{resource.inspect})"
      case resource
      when RDF::Node
        return resource.to_s
      when RDF::URI
        iri = resource.to_s
        return iri if options[:canonicalize]
      else
        return nil
      end

      return @iri_to_curie[iri] if @iri_to_curie.has_key?(iri)

      curie = if (ns = @iri_to_prefix.keys.detect {|k| iri.index(k.to_s) == 0})
        # Use a defined prefix
        pfx = @iri_to_prefix[ns]
        prefix(pfx, ns)  # Define for output
        iri.sub(ns.to_s, "#{pfx}:")
      elsif @options[:standard_prefixes] && (voc = RDF::Vocabulary.detect {|v| iri.index(v.to_uri.to_s) == 0})
        # Derive the prefix from the vocabulary's class name
        pfx = voc.__name__.to_s.split('::').last.downcase
        @iri_to_prefix[voc.to_uri.to_s] = pfx
        prefix(pfx, voc.to_uri) # Define for output
        iri.sub(voc.to_uri.to_s, "#{pfx}:")
      end

      @iri_to_curie[iri] = curie
    rescue Addressable::URI::InvalidURIError => e
      raise RDF::WriterError, "Invalid IRI #{resource.inspect}: #{e.message}"
    end

    ##
    # Take a hash from predicate IRIs to lists of values.
    # Return the property names ordered by their formatted predicate form.
    # @param [Hash{String => Array<Resource>}] properties A hash of Property to Resource mappings
    # @return [Array<String>] Ordered list of properties.
    def order_properties(properties)
      properties.keys.
        sort_by {|prop| format_uri(prop, :position => :predicate)}.
        map(&:to_s)
    end

    # Order subjects for output. Override this to output subjects in another order.
    #
    # Canonical output orders all subjects by their formatted form. Otherwise
    # the base IRI (when it is a subject) comes first, followed by resources
    # before blank nodes, then by reference count and the subject itself.
    #
    # Uses #base_uri.
    # @return [Array<Resource>] Ordered list of subjects
    def order_subjects
      if @options[:canonicalize]
        return @subjects.keys.sort_by {|s| format_uri(s, :position => :subject)}
      end

      seen = {}
      subjects = []

      # Start with base_uri
      if base_uri && @subjects.keys.include?(base_uri)
        subjects << base_uri
        seen[base_uri] = true
      end

      # Sort subjects by resources over bnodes, ref_counts and the subject URI itself
      recursable = @subjects.keys.
        select {|s| !seen.include?(s)}.
        map {|r| [r.is_a?(RDF::Node) ? 1 : 0, ref_count(r), r]}.
        sort

      subjects + recursable.map {|r| r.last}
    end

    # Return the number of times this node has been referenced in the object position
    # @return [Integer]
    def ref_count(node)
      @references.fetch(node, 0)
    end

    ##
    # Does predicate have a range of IRI?
    #
    # The answer is inferred from the graph on first use and remembered in
    # #coerce. Always false without a predicate or when canonicalizing.
    #
    # @param [RDF::URI] predicate
    # @return [Boolean]
    def iri_range?(predicate)
      return false if predicate.nil? || @options[:canonicalize]

      unless coerce.has_key?(predicate)
        # objects of all statements with the predicate may not be literal
        coerce[predicate] = @graph.query(:predicate => predicate).to_a.any? {|st| st.object.literal?} ?
          false : RDF::XSD.anyURI
      end

      add_debug "iri_range(#{predicate}) = #{coerce[predicate].inspect}"
      coerce[predicate] == RDF::XSD.anyURI
    end

    ##
    # Does predicate have a range of specific typed literal?
    #
    # The answer is inferred from the graph on first use and remembered in
    # #coerce.
    #
    # @param [RDF::URI] predicate
    # @return [RDF::URI, false, nil] the coercion recorded for the predicate
    def datatype_range?(predicate)
      unless coerce.has_key?(predicate)
        # objects of all statements with the predicate must be literal
        # and have the same non-nil datatype
        dt = nil
        @graph.query(:predicate => predicate) do |st|
          if st.object.literal? && st.object.has_datatype?
            dt = st.object.datatype if dt.nil?
            dt = false unless dt == st.object.datatype
          else
            dt = false
          end
        end
        add_debug "range(#{predicate}) = #{dt.inspect}"
        coerce[predicate] = dt
      end

      coerce[predicate]
    end

    # Reset internal helper instance variables
    def reset
      @depth = 0
      @references = {}
      @serialized = {}
      @subjects = {}
      @iri_to_curie = {}
    end

    # Add debug event to debug array, if specified
    #
    # Messages are indented by the current nesting depth. Debug output may be
    # requested before #reset has initialized the depth, so nil counts as 0.
    #
    # @param [String] message
    def add_debug(message)
      msg = "#{" " * (@depth || 0) * 2}#{message}"
      STDERR.puts msg if ::JSON::LD::debug?
      @debug << msg if @debug.is_a?(Array)
    end

    # Checks if l is a valid RDF list, i.e. no nodes have other properties.
    #
    # A chain whose rdf:rest leads back to an already visited node is not a
    # valid list.
    #
    # @param [RDF::Resource] l head of the candidate list
    # @return [Boolean]
    def is_valid_list?(l)
      props = @graph.properties(l)
      unless l.node? && props.has_key?(RDF.first.to_s) || l == RDF.nil
        add_debug "is_valid_list: false, #{l.inspect}: #{props.inspect}"
        return false
      end

      seen = {}
      while l && l != RDF.nil do
        # Guard against cyclic rdf:rest chains, which would never terminate
        if seen[l]
          add_debug "is_valid_list: false, cycle at #{l.inspect}"
          return false
        end
        seen[l] = true

        #add_debug "is_valid_list(length): #{props.length}"
        return false unless props.has_key?(RDF.first.to_s) && props.has_key?(RDF.rest.to_s)
        n = props[RDF.rest.to_s]
        unless n.is_a?(Array) && n.length == 1
          add_debug "is_valid_list: false, #{n.inspect}"
          return false
        end
        l = n.first
        unless l.node? || l == RDF.nil
          add_debug "is_valid_list: false, #{l.inspect}"
          return false
        end
        props = @graph.properties(l)
      end
      add_debug "is_valid_list: valid"
      true
    end

    # Has the subject already been serialized?
    # @return [Boolean]
    def is_done?(subject)
      @serialized.include?(subject)
    end

    # Mark a subject as done.
    def subject_done(subject)
      @serialized[subject] = true
    end

    # Strip `prefix` from the start of `iri`. The string is returned unchanged
    # when the prefix is empty or does not lead the IRI.
    # @param [String] iri
    # @param [String] prefix
    # @return [String]
    def remove_iri_prefix(iri, prefix)
      return iri if prefix.empty? || iri.index(prefix) != 0
      iri[prefix.length..-1]
    end
  end
end