require File.join(File.dirname(__FILE__), 'parser')

module RdfContext
  ##
  # An RDFa parser in Ruby
  #
  # Based on processing rules described here: http://www.w3.org/TR/rdfa-syntax/#s_model
  #
  # Ben Adida
  # 2008-05-07
  # Gregg Kellogg
  # 2009-08-04
  class RdfaParser < Parser
    attr_reader :namespace

    # The Recursive Baggage
    #
    # Holds the state handed from an element to its children during traversal:
    # the base, the parent subject/object, the in-scope URI mappings, the list
    # of incomplete triples and the current language.
    class EvaluationContext # :nodoc:
      attr_accessor :base, :parent_subject, :parent_object,
                    :uri_mappings, :incomplete_triples, :language

      # Initialize the evaluation context, [5.1]
      #
      # @param base:: value stored as both the base and the initial parent subject
      def initialize(base)
        @base = base
        @parent_subject = @base
        @parent_object = nil
        @uri_mappings = {}
        @incomplete_triples = []
        @language = nil
      end

      # Copy this Evaluation Context
      #
      # Gives the copy its own uri_mappings and incomplete_triples containers so
      # that changes made through the copy do not leak back into the original.
      def initialize_copy(from)
        @uri_mappings = from.uri_mappings.clone
        @incomplete_triples = from.incomplete_triples.clone
      end

      # Compact one-line summary used in debug messages.
      def inspect
        fields = %w(base parent_subject parent_object language).map do |name|
          value = send(name)
          "#{name}='#{value.nil? ? 'nil' : value}'"
        end
        fields << "uri_mappings[#{uri_mappings.keys.length}]"
        fields << "incomplete_triples[#{incomplete_triples.length}]"
        fields.join(",")
      end
    end

    # Parse XHTML+RDFa document from a string or input stream to closure or graph.
    #
    # If the parser is called with a block, triples are passed to the block rather
    # than added to the graph.
    #
    # Optionally, the stream may be a Nokogiri::HTML::Document or Nokogiri::XML::Document
    # With a block, yields each statement with URIRef, BNode or Literal elements
    #
    # @param  [IO] stream:: the HTML+RDFa IO stream, string, Nokogiri::HTML::Document or Nokogiri::XML::Document
    # @param [String] uri:: the URI of the document
    # @param [Hash] options:: Parser options, one of
    # <em>options[:debug]</em>:: Array to place debug messages
    # <em>options[:strict]</em>:: Raise Error if true, continue with lax parsing, otherwise
    # @return [Graph]:: Returns the graph containing parsed triples
    # @raise [Error]:: Raises RdfError if _strict_
    def parse(stream, uri = nil, options = {}, &block) # :yields: triple
      # NOTE(review): @strict, @uri and @graph are not assigned in this file;
      # presumably the superclass implementation sets them up -- confirm.
      super

      @doc = case stream
             when Nokogiri::HTML::Document then stream
             when Nokogiri::XML::Document then stream
             else Nokogiri::XML.parse(stream, uri.to_s)
             end

      if @doc.nil?
        raise ParserException, "Empty document" if @strict
        # Lax mode: there is nothing to traverse, so hand back the graph as it
        # stands rather than failing with NoMethodError on the nil document.
        return @graph
      end

      @callback = block

      # If the doc has a default, use that as "html"
      ns = @doc.namespaces["xmlns"]
      ns ||= "http://www.w3.org/1999/xhtml" # FIXME: intuit from DOCTYPE, or however
      @namespace = Namespace.new(ns, "html")

      # parse
      parse_whole_document(@doc, @uri)

      @graph
    end

    private

    # Parsing an RDFa document (this is *not* the recursive method)
    #
    # Uses the href of an html>head>base element (minus any fragment) as the
    # base when the document has one, then starts the recursive traversal at
    # the document root.
    #
    # @param doc:: the parsed document
    # @param base:: base to use when the document has no base element
    def parse_whole_document(doc, base)
      # find if the document has a base element
      base_el = doc.css('html>head>base').first
      if base_el
        base = base_el.attributes['href']
        # Strip any fragment from base
        base = base.to_s.split("#").first
        add_debug(base_el, "parse_whole_doc: base='#{base}'")
      end

      # initialize the evaluation context with the appropriate base
      evaluation_context = EvaluationContext.new(base)

      traverse(doc.root, evaluation_context)
    end

    # Extract the XMLNS mappings from an element
    #
    # Every namespace declaration whose name starts with "xmlns" is bound into
    # the graph and recorded under its prefix. A bare "xmlns" declaration has
    # no suffix and is therefore recorded under the nil key.
    #
    # @param element:: element whose namespace declarations are inspected
    # @return [Hash]:: prefix => result of binding the namespace into the graph
    # @raise [RdfException]:: re-raised from binding only when _strict_
    def extract_mappings(element)
      mappings = {}

      # look for xmlns
      element.namespaces.each do |attr_name, attr_value|
        begin
          abbr, suffix = attr_name.split(":")
          mappings[suffix] = @graph.bind(Namespace.new(attr_value, suffix)) if abbr == "xmlns"
        rescue RdfException => e
          add_debug(element, "extract_mappings raised #{e.class}: #{e.message}")
          raise if @strict
        end
      end

      add_debug(element, "mappings: #{mappings.keys.join(", ")}")
      mappings
    end

    # The recursive helper function
    #
    # Processes a single element following the numbered steps referenced in the
    # comments below, emits triples through add_triple, and then recurses into
    # the child elements with a new (or cloned) evaluation context.
    #
    # @param element:: the element to process; nil is reported and skipped
    # @param [EvaluationContext] evaluation_context:: state inherited from the parent
    # @raise [ParserException]:: if element is nil and _strict_
    def traverse(element, evaluation_context)
      if element.nil?
        add_debug(element, "traverse nil element")
        raise ParserException, "Can't parse nil element" if @strict
        return nil
      end

      # local variables [5.5 Step 1]
      recurse = true
      skip = false
      new_subject = nil
      current_object_resource = nil
      uri_mappings = evaluation_context.uri_mappings.clone
      incomplete_triples = []
      language = evaluation_context.language

      # shortcut
      attrs = element.attributes

      about = attrs['about']
      src = attrs['src']
      resource = attrs['resource']
      href = attrs['href']

      # Pull out the attributes needed for the skip test.
      property = attrs['property'].to_s if attrs['property']
      typeof = attrs['typeof'].to_s if attrs['typeof']
      datatype = attrs['datatype'].to_s if attrs['datatype']
      content = attrs['content'].to_s if attrs['content']
      rel = attrs['rel'].to_s if attrs['rel']
      rev = attrs['rev'].to_s if attrs['rev']

      # SPEC CONFUSION: not sure what to initialize this value to
      current_object_literal = nil

      # XMLNS mappings [5.5 Step 2]
      uri_mappings.merge!(extract_mappings(element))

      # Language information [5.5 Step 3]
      add_debug(element, "traverse, lang: #{attrs['lang']}") if attrs['lang']
      language = attrs['lang'] || language

      # rels and revs
      rels = parse_curies(rel, uri_mappings, evaluation_context.base, true)
      revs = parse_curies(rev, uri_mappings, evaluation_context.base, true)
      # "valid" here means only that a rel or rev attribute is present, even if
      # none of its values resolved.
      valid_rel_or_rev = !(rel.nil? && rev.nil?)

      add_debug(element, "traverse, ec: #{evaluation_context.inspect}")
      add_debug(element, "traverse, about: #{about.nil? ? 'nil' : about}, src: #{src.nil? ? 'nil' : src}, resource: #{resource.nil? ? 'nil' : resource}, href: #{href.nil? ? 'nil' : href}")
      add_debug(element, "traverse, property: #{property.nil? ? 'nil' : property}, typeof: #{typeof.nil? ? 'nil' : typeof}, datatype: #{datatype.nil? ? 'nil' : datatype}, content: #{content.nil? ? 'nil' : content}")
      add_debug(element, "traverse, rels: #{rels.join(" ")}, revs: #{revs.join(" ")}")

      if !valid_rel_or_rev
        # Establishing a new subject if no valid rel/rev [5.5 Step 4]
        if about
          new_subject = uri_or_safe_curie(about, evaluation_context, uri_mappings)
        elsif src
          new_subject = URIRef.new(src, evaluation_context.base)
        elsif resource
          new_subject = uri_or_safe_curie(resource, evaluation_context, uri_mappings)
        elsif href
          new_subject = URIRef.new(href, evaluation_context.base)
        end

        # SPEC CONFUSION: not sure what "If no URI is provided by a resource attribute" means, I assume
        # it means that new_subject is still null
        if new_subject.nil?
          if element.name =~ /^(head|body)$/ && evaluation_context.base
            new_subject = URIRef.new(evaluation_context.base)
          elsif element.attributes['typeof']
            new_subject = BNode.new
          else
            # if it's null, it's null and nothing changes
            new_subject = evaluation_context.parent_object
            skip = true unless property
          end
        end
        add_debug(element, "new_subject: #{new_subject}, skip = #{skip}")
      else
        # Establish both new subject and current object resource [5.5 Step 5]
        if about
          new_subject = uri_or_safe_curie(about, evaluation_context, uri_mappings)
        elsif src
          # NOTE(review): Step 4 resolves @src with URIRef.new while this branch
          # accepts a safe CURIE as well -- confirm which is intended.
          new_subject = uri_or_safe_curie(src, evaluation_context, uri_mappings)
        end

        # If no URI is provided then the first match from the following rules will apply
        if new_subject.nil?
          # NOTE(review): unlike Step 4, the base is not checked for nil here.
          if element.name =~ /^(head|body)$/
            new_subject = URIRef.new(evaluation_context.base)
          elsif element.attributes['typeof']
            new_subject = BNode.new
          else
            # if it's null, it's null and nothing changes
            new_subject = evaluation_context.parent_object
            # no skip flag set this time
          end
        end

        if resource
          current_object_resource = uri_or_safe_curie(resource, evaluation_context, uri_mappings)
        elsif href
          current_object_resource = URIRef.new(href, evaluation_context.base)
        end

        add_debug(element, "new_subject: #{new_subject}, current_object_resource = #{current_object_resource.nil? ? 'nil' : current_object_resource}")
      end

      # Process @typeof if there is a subject [Step 6]
      if new_subject && typeof
        types = parse_curies(typeof, uri_mappings, evaluation_context.base, false)
        add_debug(element, "typeof: #{typeof}")
        types.each do |one_type|
          add_triple(element, new_subject, RDF_TYPE, one_type)
        end
      end

      # Generate triples with given object [Step 7]
      if current_object_resource
        rels.each do |forward_predicate|
          add_triple(element, new_subject, forward_predicate, current_object_resource)
        end

        revs.each do |reverse_predicate|
          add_triple(element, current_object_resource, reverse_predicate, new_subject)
        end
      else
        # Incomplete triples and bnode creation [Step 8]
        add_debug(element, "step 8: valid: #{valid_rel_or_rev}, rels: #{rels}, revs: #{revs}")
        current_object_resource = BNode.new if valid_rel_or_rev

        rels.each do |forward_predicate|
          # SPEC CONFUSION: we don't store the subject here?
          incomplete_triples << {:predicate => forward_predicate, :direction => :forward}
        end

        revs.each do |reverse_predicate|
          # SPEC CONFUSION: we don't store the object here?
          incomplete_triples << {:predicate => reverse_predicate, :direction => :reverse}
        end
      end

      # Establish current object literal [Step 9]
      if property
        properties = parse_curies(property, uri_mappings, evaluation_context.base, false)

        # get the literal datatype
        type = datatype
        children_node_types = element.children.collect {|c| c.class}.uniq

        # the following 3 IF clauses should be mutually exclusive. Written as is to prevent extensive indentation.

        # SPEC CONFUSION: have to special case XML Literal, not clear right away.
        # SPEC CONFUSION: specify that the conditions are in order of priority
        type_resource = curie_to_resource_or_bnode(type, uri_mappings, evaluation_context.base) if type
        if type && !type.empty? && (type_resource.to_s != XML_LITERAL.to_s)
          # typed literal
          add_debug(element, "typed literal")
          current_object_literal = Literal.typed(content || element.inner_text, type_resource, :language => language)
        elsif content || (children_node_types == [Nokogiri::XML::Text]) || (element.children.length == 0) || (type == '')
          # plain literal
          add_debug(element, "plain literal")
          current_object_literal = Literal.untyped(content || element.inner_text, language)
        elsif children_node_types != [Nokogiri::XML::Text] && (type.nil? || type_resource.to_s == XML_LITERAL.to_s)
          # XML Literal
          add_debug(element, "XML Literal: #{element.inner_html}")
          current_object_literal = Literal.typed(element.children, XML_LITERAL, :language => language, :namespaces => uri_mappings)
          recurse = false
        end

        # add each property
        properties.each do |predicate|
          add_triple(element, new_subject, predicate, current_object_literal)
        end

        # SPEC CONFUSION: "the triple has been created" ==> there may be more than one
        # set the recurse flag above in the IF about xmlliteral, as it is the only place that can happen
      end

      # Complete the incomplete triples from the evaluation context [Step 10]
      add_debug(element, "10: skip=#{skip}, new_subject=#{new_subject}")
      if !skip && new_subject
        evaluation_context.incomplete_triples.each do |trip|
          if trip[:direction] == :forward
            add_triple(element, evaluation_context.parent_subject, trip[:predicate], new_subject)
          elsif trip[:direction] == :reverse
            add_triple(element, new_subject, trip[:predicate], evaluation_context.parent_subject)
          end
        end
      end

      # Create a new evaluation context and proceed recursively [Step 11]
      if recurse
        # SPEC CONFUSION: new evaluation context for each child? Probably not necessary,
        # but maybe needs to be pointed out?

        if skip
          # Children inherit the parent's context; only the language and the
          # URI mappings gathered on this element are updated.
          new_ec = evaluation_context.clone
          new_ec.language = language
          new_ec.uri_mappings = uri_mappings
          add_debug(element, "skip: cloned ec: #{evaluation_context.inspect}")
        else
          # create a new evaluation context
          new_ec = EvaluationContext.new(evaluation_context.base)
          new_ec.parent_subject = new_subject || evaluation_context.parent_subject
          new_ec.parent_object = current_object_resource || new_subject || evaluation_context.parent_subject
          new_ec.uri_mappings = uri_mappings
          new_ec.incomplete_triples = incomplete_triples
          new_ec.language = language
          #add_debug(element, "new ec: #{new_ec.inspect}")
        end

        element.children.each do |child|
          # recurse only if it's an element
          traverse(child, new_ec) if child.class == Nokogiri::XML::Element
        end
      end
    end

    # space-separated CURIEs or Link Types
    #
    # Tokens containing a colon are resolved as CURIEs. When with_link_types is
    # true, the remaining tokens are lower-cased and resolved against
    # XH_MAPPING if they appear in LINK_TYPES; otherwise they are ignored.
    #
    # @param value:: attribute value to split on whitespace; nil yields []
    # @param [Hash] uri_mappings:: prefix mappings in scope
    # @param base:: passed through to curie_to_resource_or_bnode
    # @param [Boolean] with_link_types:: also accept reserved link types
    # @return [Array]:: resolved resources; a CURIE that failed to resolve in
    #   lax mode contributes a nil entry
    def parse_curies(value, uri_mappings, base, with_link_types=false)
      return [] unless value

      resource_array = []
      value.to_s.split(' ').each do |curie|
        if curie.include?(":")
          resource_array << curie_to_resource_or_bnode(curie, uri_mappings, base)
        elsif with_link_types
          # Reserved words are all mapped to lower case
          curie = curie.to_s.downcase
          link_type_curie = curie_to_resource_or_bnode(":#{curie}", XH_MAPPING, base) if LINK_TYPES.include?(curie)
          resource_array << link_type_curie if link_type_curie
        end
      end
      resource_array
    end

    # Resolve a single CURIE to a resource or BNode.
    #
    # @param curie:: the CURIE text, without surrounding brackets
    # @param [Hash] uri_mappings:: prefix mappings in scope
    # @param subject:: only used in the debug message for an empty CURIE
    # @return a BNode for the "_" prefix, nil for an empty CURIE or (in lax
    #   mode) an unmapped prefix, otherwise the mapped namespace plus the reference
    # @raise [ParserException]:: if the prefix has no mapping and _strict_
    def curie_to_resource_or_bnode(curie, uri_mappings, subject)
      # URI mappings for CURIEs default to XH_MAPPING, rather than the default doc namespace
      uri_mappings = uri_mappings.merge(XH_MAPPING)

      # Split on the first colon only: everything after it is the reference,
      # which may itself contain colons and must not be truncated.
      prefix, suffix = curie.to_s.split(":", 2)

      # consider the bnode situation
      if prefix == "_"
        # we force a non-nil name, otherwise it generates a new name
        BNode.new(suffix || "", @named_bnodes)
      elsif curie.to_s.empty?
        add_debug(nil, "curie_to_resource_or_bnode #{URIRef.new(subject)}")
        # Empty curie resolves to current subject (No, an empty curie should be ignored)
        # URIRef.new(subject)
        nil
      else
        ns = uri_mappings[prefix.to_s]
        unless ns
          add_debug(nil, "curie_to_resource_or_bnode No namespace mapping for #{prefix}")
          raise ParserException, "No namespace mapping for #{prefix}" if @strict
          return nil
        end
        ns + suffix
      end
    end

    # Resolve an attribute value that is either a URI or a safe CURIE.
    #
    # A value of the form "[foo:bar]" is resolved as a CURIE; anything else is
    # turned into a URIRef relative to the evaluation context's base.
    #
    # @param value:: attribute value; nil yields nil
    # @param [EvaluationContext] evaluation_context:: supplies base and parent subject
    # @param [Hash] uri_mappings:: prefix mappings in scope
    def uri_or_safe_curie(value, evaluation_context, uri_mappings)
      return nil if value.nil?

      # check if the value is [foo:bar]
      safe_curie = value.to_s.match(/^\[(.*)\]$/)
      if safe_curie
        curie_to_resource_or_bnode(safe_curie[1], uri_mappings, evaluation_context.parent_subject)
      else
        URIRef.new(value, evaluation_context.base)
      end
    end
  end
end