require 'xml'
require File.join(File.dirname(__FILE__), 'parser')
module RdfContext
class RdfXmlParser < Parser
CORE_SYNTAX_TERMS = %w(RDF ID about parseType resource nodeID datatype).map {|n| "http://www.w3.org/1999/02/22-rdf-syntax-ns##{n}"}
OLD_TERMS = %w(aboutEach aboutEachPrefix bagID).map {|n| "http://www.w3.org/1999/02/22-rdf-syntax-ns##{n}"}
# The Recursive Baggage
class EvaluationContext # :nodoc:
attr_reader :base
attr :subject, true
attr :uri_mappings, true
attr :language, true
attr :graph, true
attr :li_counter, true
def initialize(base, element, graph)
# Initialize the evaluation context, [5.1]
self.base = base
@uri_mappings = {}
@language = nil
@graph = graph
@li_counter = 0
@uri_mappings = {}
extract_from_element(element) if element
end
# Clone existing evaluation context adding information from element
def clone(element, options = {})
new_ec = EvaluationContext.new(@base, nil, @graph)
new_ec.uri_mappings = self.uri_mappings.clone
new_ec.language = self.language
new_ec.extract_from_element(element) if element
options.each_pair {|k, v| new_ec.send("#{k}=", v)}
new_ec
end
# Extract Evaluation Context from an element by looking at ancestors recurively
def extract_from_ancestors(el)
ancestors = el.ancestors
while ancestors.length > 0
a = ancestors.pop
next unless a.element?
extract_from_element(a)
end
extract_from_element(el)
end
# Extract Evaluation Context from an element
def extract_from_element(el)
b = el.attribute_with_ns("base", XML_NS.uri.to_s)
lang = el.attribute_with_ns("lang", XML_NS.uri.to_s)
self.base = URIRef.new(b, self.base) if b
self.language = lang if lang
self.uri_mappings.merge!(extract_mappings(el))
end
# Extract the XMLNS mappings from an element
def extract_mappings(element)
mappings = {}
# look for xmlns
element.namespaces.each do |attr_name,attr_value|
abbr, suffix = attr_name.to_s.split(":")
if abbr == "xmlns"
mappings[suffix] = Namespace.new(attr_value, suffix)
@graph.bind(mappings[suffix])
end
end
mappings
end
# Produce the next list entry for this context
def li_next(predicate)
@li_counter += 1
predicate = Addressable::URI.parse(predicate.to_s)
predicate.fragment = "_#{@li_counter}"
predicate = URIRef.new(predicate)
end
# Set XML base. Ignore any fragment
def base=(b)
b = Addressable::URI.parse(b.to_s)
b.fragment = nil
@base = b.to_s
end
def inspect
v = %w(base subject language).map {|a| "#{a}='#{self.send(a).nil? ? 'nil' : self.send(a)}'"}
v << "uri_mappings[#{uri_mappings.keys.length}]"
v.join(",")
end
end
# Parse RDF/XML document from a string or input stream to closure or graph.
#
# If the parser is called with a block, triples are passed to the block rather
# than added to the graph.
#
# Optionally, the stream may be a string or Nokogiri::XML::Document
#
# @param [IO] stream:: the RDF/XML IO stream, string or Nokogiri::XML::Document
# @param [String] uri:: the URI of the document
# @param [Hash] options:: Parser options, one of
# options[:debug]:: Array to place debug messages
# options[:strict]:: Raise Error if true, continue with lax parsing, otherwise
# @return [Graph]:: Returns the graph containing parsed triples
# @raise [Error]:: Raises RdfError if _strict_
def parse(stream, uri = nil, options = {}, &block) # :yields: triple
super
@doc = case stream
when Nokogiri::XML::Document then stream
else Nokogiri::XML.parse(stream, uri.to_s)
end
raise ParserException, "Synax errors:\n#{@doc.errors}" unless @doc.errors.empty?
@id_mapping = Hash.new
raise ParserException, "Empty document" if @doc.nil? || @doc.root.nil?
@callback = block
root = @doc.root
# Look for rdf:RDF elements and process each.
rdf_nodes = root.xpath("//rdf:RDF", RDF_NS.prefix => RDF_NS.uri.to_s)
if rdf_nodes.length == 0
# If none found, root element may be processed as an RDF Node
ec = EvaluationContext.new(@uri, root, @graph)
nodeElement(root, ec)
else
rdf_nodes.each do |node|
# XXX Skip this element if it's contained within another rdf:RDF element
# Extract base, lang and namespaces from parents to create proper evaluation context
ec = EvaluationContext.new(@uri, nil, @graph)
ec.extract_from_ancestors(node)
node.children.each {|el|
next unless el.elem?
new_ec = ec.clone(el)
nodeElement(el, new_ec)
}
end
end
@graph
end
private
# XML nodeElement production
#
# @param [XML Element] el:: XMl Element to parse
# @param [EvaluationContext] ec:: Evaluation context
# @return [URIRef] subject:: The subject found for the node
# @raise [RdfException]:: Raises Exception if _strict_
def nodeElement(el, ec)
# subject
subject = ec.subject || parse_subject(el, ec)
add_debug(el, "nodeElement, ec: #{ec.inspect}")
add_debug(el, "nodeElement, el: #{el.uri}")
add_debug(el, "nodeElement, subject: #{subject.nil? ? 'nil' : subject.to_s}")
unless el.uri == RDF_NS.Description.to_s
add_triple(el, subject, RDF_TYPE, el.uri)
end
# produce triples for attributes
el.attribute_nodes.each do |attr|
add_debug(el, "propertyAttr: #{attr.uri}='#{attr.value}'")
if attr.uri == RDF_TYPE
# If there is an attribute a in propertyAttr with a.URI == rdf:type
# then u:=uri(identifier:=resolve(a.string-value))
# and the following tiple is added to the graph:
u = URIRef.new(attr.value, ec.base)
add_triple(attr, subject, RDF_TYPE, u)
elsif is_propertyAttr?(attr)
# Attributes not RDF_TYPE
predicate = attr.uri
lit = Literal.untyped(attr.value, ec.language)
add_triple(attr, subject, predicate, lit)
end
end
# Handle the propertyEltList children events in document order
li_counter = 0 # this will increase for each li we iterate through
el.children.each do |child|
next unless child.elem?
child_ec = ec.clone(child)
predicate = child.uri
add_debug(child, "propertyElt, predicate: #{predicate}")
propertyElementURI_check(child)
# Determine the content type of this property element
text_nodes = child.children.select {|e| e.text? && !e.blank?}
element_nodes = child.children.select(&:element?)
add_debug(child, "#{text_nodes.length} text nodes, #{element_nodes.length} element nodes")
if element_nodes.length > 1
element_nodes.each do |node|
add_debug(child, " node: #{node.to_s}")
end
end
# List expansion
predicate = ec.li_next(predicate) if predicate == RDF_NS.li
# Productions based on set of attributes
# All remaining reserved XML Names (See Name in XML 1.0) are now removed from the set.
# These are, all attribute information items in the set with property [prefix] beginning with xml
# (case independent comparison) and all attribute information items with [prefix] property having
# no value and which have [local name] beginning with xml (case independent comparison) are removed.
# Note that the [base URI] accessor is computed by XML Base before any xml:base attribute information item
# is deleted.
attrs = {}
id = datatype = parseType = resourceAttr = nodeID = nil
child.attribute_nodes.each do |attr|
if attr.namespace.to_s.empty?
# The support for a limited set of non-namespaced names is REQUIRED and intended to allow
# RDF/XML documents specified in [RDF-MS] to remain valid;
# new documents SHOULD NOT use these unqualified attributes and applications
# MAY choose to warn when the unqualified form is seen in a document.
add_debug(el, "Unqualified attribute '#{attr}'")
#attrs[attr.to_s] = attr.value unless attr.to_s.match?(/^xml/)
elsif attr.namespace.href == XML_NS.uri.to_s
# No production. Lang and base elements already extracted
elsif attr.namespace.href == RDF_NS.uri.to_s
case attr.name
when "ID" then id = attr.value
when "datatype" then datatype = attr.value
when "parseType" then parseType = attr.value
when "resource" then resourceAttr = attr.value
when "nodeID" then nodeID = attr.value
else attrs[attr] = attr.value
end
else
attrs[attr] = attr.value
end
end
if nodeID && resourceAttr
add_debug(el, "Cannot have rdf:nodeID and rdf:resource.")
raise ParserException.new("Cannot have rdf:nodeID and rdf:resource.") if @strict
end
# Apply character transformations
id = id_check(el, id.rdf_escape, nil) if id
resourceAttr = resourceAttr.rdf_escape if resourceAttr
nodeID = nodeID_check(el, nodeID.rdf_escape) if nodeID
add_debug(child, "attrs: #{attrs.inspect}")
add_debug(child, "datatype: #{datatype}") if datatype
add_debug(child, "parseType: #{parseType}") if parseType
add_debug(child, "resource: #{resourceAttr}") if resourceAttr
add_debug(child, "nodeID: #{nodeID}") if nodeID
add_debug(child, "id: #{id}") if id
if attrs.empty? && datatype.nil? && parseType.nil? && element_nodes.length == 1
# Production resourcePropertyElt
new_ec = child_ec.clone(nil)
new_node_element = element_nodes.first
add_debug(child, "resourcePropertyElt: #{node_path(new_node_element)}")
new_subject = nodeElement(new_node_element, new_ec)
add_triple(child, subject, predicate, new_subject)
elsif attrs.empty? && parseType.nil? && element_nodes.length == 0 && text_nodes.length > 0
# Production literalPropertyElt
add_debug(child, "literalPropertyElt")
literal = datatype ? Literal.typed(child.inner_html, datatype) : Literal.untyped(child.inner_html, child_ec.language)
add_triple(child, subject, predicate, literal)
reify(id, child, subject, predicate, literal, ec) if id
elsif parseType == "Resource"
# Production parseTypeResourcePropertyElt
add_debug(child, "parseTypeResourcePropertyElt")
unless attrs.empty?
warn = "Resource Property with extra attributes: '#{attrs.inspect}'"
add_debug(child, warn)
raise ParserException.new(warn) if @strict
end
# For element e with possibly empty element content c.
n = BNode.new
add_triple(child, subject, predicate, n)
# Reification
reify(id, child, subject, predicate, n, child_ec) if id
# If the element content c is not empty, then use event n to create a new sequence of events as follows:
#
# start-element(URI := rdf:Description,
# subject := n,
# attributes := set())
# c
# end-element()
add_debug(child, "compose new sequence with rdf:Description")
node = child.clone
pt_attr = node.attribute("parseType")
node.namespace = pt_attr.namespace
node.attributes.keys.each {|a| node.remove_attribute(a)}
node.node_name = "Description"
new_ec = child_ec.clone(nil, :subject => n)
nodeElement(node, new_ec)
elsif parseType == "Collection"
# Production parseTypeCollectionPropertyElt
add_debug(child, "parseTypeCollectionPropertyElt")
unless attrs.empty?
warn = "Resource Property with extra attributes: '#{attrs.inspect}'"
add_debug(child, warn)
raise ParserException.new(warn) if @strict
end
# For element event e with possibly empty nodeElementList l. Set s:=list().
# For each element event f in l, n := bnodeid(identifier := generated-blank-node-id()) and append n to s to give a sequence of events.
s = element_nodes.map { BNode.new }
n = s.first || RDF_NS.send("nil")
add_triple(child, subject, predicate, n)
reify(id, child, subject, predicate, n, child_ec) if id
# Add first/rest entries for all list elements
s.each_index do |i|
n = s[i]
o = s[i+1]
f = element_nodes[i]
new_ec = child_ec.clone(nil)
object = nodeElement(f, new_ec)
add_triple(child, n, RDF_NS.first, object)
add_triple(child, n, RDF_NS.rest, o ? o : RDF_NS.nil)
end
elsif parseType # Literal or Other
# Production parseTypeResourcePropertyElt
add_debug(child, parseType == "Literal" ? "parseTypeResourcePropertyElt" : "parseTypeOtherPropertyElt (#{parseType})")
unless attrs.empty?
warn = "Resource Property with extra attributes: '#{attrs.inspect}'"
add_debug(child, warn)
raise ParserException.new(warn) if @strict
end
if resourceAttr
warn = "illegal rdf:resource"
add_debug(child, warn)
raise ParserException.new(warn) if @strict
end
object = Literal.typed(child.children, XML_LITERAL, :namespaces => child_ec.uri_mappings)
add_triple(child, subject, predicate, object)
elsif text_nodes.length == 0 && element_nodes.length == 0
# Production emptyPropertyElt
add_debug(child, "emptyPropertyElt")
if attrs.empty? && resourceAttr.nil? && nodeID.nil?
literal = Literal.untyped("", ec.language)
add_triple(child, subject, predicate, literal)
# Reification
reify(id, child, subject, predicate, literal, child_ec) if id
else
if resourceAttr
resource = URIRef.new(resourceAttr, ec.base)
elsif nodeID
resource = BNode.new(nodeID, @named_bnodes)
else
resource = BNode.new
end
# produce triples for attributes
attrs.each_pair do |attr, val|
add_debug(el, "attr: #{attr.name}='#{val}'")
if attr.uri.to_s == RDF_TYPE
add_triple(child, resource, RDF_TYPE, val)
else
# Check for illegal attributes
next unless is_propertyAttr?(attr)
# Attributes not in RDF_TYPE
lit = Literal.untyped(val, child_ec.language)
add_triple(child, resource, attr.uri.to_s, lit)
end
end
add_triple(child, subject, predicate, resource)
# Reification
reify(id, child, subject, predicate, resource, child_ec) if id
end
end
end
# Return subject
subject
end
private
# Reify subject, predicate, and object given the EvaluationContext (ec) and current XMl element (el)
def reify(id, el, subject, predicate, object, ec)
add_debug(el, "reify, id: #{id}")
rsubject = URIRef.new("#" + id, ec.base)
add_triple(el, rsubject, RDF_NS.subject, subject)
add_triple(el, rsubject, RDF_NS.predicate, predicate)
add_triple(el, rsubject, RDF_NS.object, object)
add_triple(el, rsubject, RDF_TYPE, RDF_NS.Statement)
end
# Figure out the subject from the element.
def parse_subject(el, ec)
old_property_check(el)
nodeElementURI_check(el)
about = el.attribute("about")
id = el.attribute("ID")
nodeID = el.attribute("nodeID")
if nodeID && about
add_debug(el, "Cannot have rdf:nodeID and rdf:about.")
raise ParserException.new("Cannot have rdf:nodeID and rdf:about.") if @strict
elsif nodeID && id
add_debug(el, "Cannot have rdf:nodeID and rdf:ID.")
raise ParserException.new("Cannot have rdf:nodeID and rdf:ID.") if @strict
end
case
when id
add_debug(el, "parse_subject, id: '#{id.value.rdf_escape}'")
id_check(el, id.value.rdf_escape, ec.base) # Returns URI
when nodeID
# The value of rdf:nodeID must match the XML Name production
nodeID = nodeID_check(el, nodeID.value.rdf_escape)
add_debug(el, "parse_subject, nodeID: '#{nodeID}")
BNode.new(nodeID, @named_bnodes)
when about
about = about.value.rdf_escape
add_debug(el, "parse_subject, about: '#{about}'")
URIRef.new(about, ec.base)
else
add_debug(el, "parse_subject, BNode")
BNode.new
end
end
# ID attribute must be an NCName
def id_check(el, id, base)
if NC_REGEXP.match(id)
# ID may only be specified once for the same URI
if base
uri = URIRef.new("##{id}", base)
if @id_mapping[id] && @id_mapping[id] == uri
warn = "ID addtribute '#{id}' may only be defined once for the same URI"
add_debug(el, warn)
raise RdfContext::ParserException.new(warn) if @strict
end
@id_mapping[id] = uri
# Returns URI, in this case
else
id
end
else
warn = "ID addtribute '#{id}' must be a NCName"
add_debug(el, "ID addtribute '#{id}' must be a NCName")
add_debug(el, warn)
raise RdfContext::ParserException.new(warn) if @strict
nil
end
end
# nodeID must be an XML Name
# nodeID must pass Production rdf-id
def nodeID_check(el, nodeID)
if NC_REGEXP.match(nodeID)
nodeID
else
add_debug(el, "nodeID addtribute '#{nodeID}' must be an XML Name")
raise RdfContext::ParserException.new("nodeID addtribute '#{nodeID}' must be a NCName") if @strict
nil
end
end
# Is this attribute a Property Attribute?
def is_propertyAttr?(attr)
if ([RDF_NS.Description.to_s, RDF_NS.li.to_s] + OLD_TERMS).include?(attr.uri.to_s)
warn = "Invalid use of rdf:#{attr.name}"
add_debug(attr, warn)
raise InvalidPredicate.new(warn) if @strict
return false
end
!CORE_SYNTAX_TERMS.include?(attr.uri.to_s) && attr.namespace && attr.namespace.href != XML_NS.uri.to_s
end
# Check Node Element name
def nodeElementURI_check(el)
if (CORE_SYNTAX_TERMS + [RDF_NS.li.to_s] + OLD_TERMS).include?(el.uri.to_s)
warn = "Invalid use of rdf:#{el.name}"
add_debug(el, warn)
raise InvalidSubject.new(warn) if @strict
end
end
# Check Property Element name
def propertyElementURI_check(el)
if (CORE_SYNTAX_TERMS + [RDF_NS.Description.to_s] + OLD_TERMS).include?(el.uri.to_s)
warn = "Invalid use of rdf:#{el.name}"
add_debug(el, warn)
raise InvalidPredicate.new(warn) if @strict
end
end
# Check for the use of an obsolete RDF property
def old_property_check(el)
el.attribute_nodes.each do |attr|
if OLD_TERMS.include?(attr.uri.to_s)
add_debug(el, "Obsolete attribute '#{attr.uri}'")
raise InvalidPredicate.new("Obsolete attribute '#{attr.uri}'") if @strict
end
end
end
end
end