lib/rdf/rdfa/reader.rb in rdf-rdfa-0.2.1 vs lib/rdf/rdfa/reader.rb in rdf-rdfa-0.2.2

- old
+ new

@@ -2,14 +2,31 @@ module RDF::RDFa ## # An RDFa parser in Ruby # + # Based on processing rules described here: + # @see http://www.w3.org/TR/rdfa-syntax/#s_model RDFa 1.0 + # @see http://www.w3.org/2010/02/rdfa/drafts/2010/ED-rdfa-core-20100803/ RDFa 1.1 + # # @author [Gregg Kellogg](http://kellogg-assoc.com/) class Reader < RDF::Reader format Format + SafeCURIEorCURIEorURI = { + :rdfa_1_0 => [:term, :safe_curie, :uri, :bnode], + :rdfa_1_1 => [:safe_curie, :curie, :term, :uri, :bnode], + } + TERMorCURIEorAbsURI = { + :rdfa_1_0 => [:term, :curie], + :rdfa_1_1 => [:term, :curie, :absuri], + } + TERMorCURIEorAbsURIprop = { + :rdfa_1_0 => [:curie], + :rdfa_1_1 => [:term, :curie, :absuri], + } + NC_REGEXP = Regexp.new( %{^ (?!\\\\u0301) # &#x301; is a non-spacing acute accent. # It is legal within an XML Name, but not as the first character. ( [a-zA-Z_] @@ -19,66 +36,97 @@ | \\\\u([0-9a-fA-F]{4}) )* $}, Regexp::EXTENDED) - # Host language, One of: - # :xhtml_rdfa_1_0 - # :xhtml_rdfa_1_1 + # Host language + # @return [:xhtml] attr_reader :host_language # The Recursive Baggage + # @private class EvaluationContext # :nodoc: - # The base. This will usually be the URL of the document being processed, + # The base. + # + # This will usually be the URL of the document being processed, # but it could be some other URL, set by some other mechanism, # such as the (X)HTML base element. The important thing is that it establishes # a URL against which relative paths can be resolved. + # + # @return [URI] attr :base, true # The parent subject. + # # The initial value will be the same as the initial value of base, # but it will usually change during the course of processing. + # + # @return [URI] attr :parent_subject, true # The parent object. + # # In some situations the object of a statement becomes the subject of any nested statements, # and this property is used to convey this value. # Note that this value may be a bnode, since in some situations a number of nested statements # are grouped together on one bnode. # This means that the bnode must be set in the containing statement and passed down, # and this property is used to convey this value. + # + # @return URI attr :parent_object, true # A list of current, in-scope URI mappings. + # + # @return [Hash{Symbol => String}] attr :uri_mappings, true - # A list of incomplete triples. A triple can be incomplete when no object resource + # A list of incomplete triples. + # + # A triple can be incomplete when no object resource # is provided alongside a predicate that requires a resource (i.e., @rel or @rev). # The triples can be completed when a resource becomes available, # which will be when the next subject is specified (part of the process called chaining). + # + # @return [Array<Array<URI, Resource>>] attr :incomplete_triples, true # The language. Note that there is no default language. + # + # @return [Symbol] attr :language, true # The term mappings, a list of terms and their associated URIs. + # # This specification does not define an initial list. # Host Languages may define an initial list. # If a Host Language provides an initial list, it should do so via an RDFa Profile document. + # + # @return [Hash{Symbol => URI}] attr :term_mappings, true - # The default vocabulary, a value to use as the prefix URI when a term is used. + # The default vocabulary + # + # A value to use as the prefix URI when a term is used. # This specification does not define an initial setting for the default vocabulary. # Host Languages may define an initial setting. + # + # @return [URI] attr :default_vocabulary, true + # @param [RDF::URI] base + # @param [Hash] host_defaults + # @option host_defaults [Hash{String => URI}] :term_mappings Hash of NCName => URI + # @option host_defaults [Hash{String => URI}] :vocabulary Hash of prefix => URI def initialize(base, host_defaults) # Initialize the evaluation context, [5.1] @base = base @parent_subject = @base @parent_object = nil @incomplete_triples = [] @language = nil @uri_mappings = host_defaults.fetch(:uri_mappings, {}) @term_mappings = host_defaults.fetch(:term_mappings, {}) - @default_voabulary = host_defaults.fetch(:voabulary, nil) + @default_vocabulary = host_defaults.fetch(:vocabulary, nil) end # Copy this Evaluation Context + # + # @param [EvaluationContext] from def initialize_copy(from) # clone the evaluation context correctly @uri_mappings = from.uri_mappings.clone @incomplete_triples = from.incomplete_triples.clone end @@ -93,33 +141,40 @@ end ## # Initializes the RDFa reader instance. # - # @param [Nokogiri::HTML::Document, Nokogiri::XML::Document, IO, File, String] input + # @param [Nokogiri::HTML::Document, Nokogiri::XML::Document, #read, #to_s] input # @option options [Array] :debug (nil) Array to place debug messages + # @option options [Graph] :processor_graph (nil) Graph to record information, warnings and errors. # @option options [Boolean] :strict (false) Raise Error if true, continue with lax parsing, otherwise # @option options [Boolean] :base_uri (nil) Base URI to use for relative URIs. + # @option options [:rdfa_1_0, :rdfa_1_1] :version (:rdfa_1_1) Parser version information + # @option options [:xhtml] :host_language (:xhtml) Host Language # @return [reader] # @yield [reader] - # @yieldparam [Reader] reader + # @yieldparam [RDF::Reader] reader # @raise [RDF::ReaderError]:: Raises RDF::ReaderError if _strict_ def initialize(input = $stdin, options = {}, &block) super do @debug = options[:debug] @strict = options[:strict] @base_uri = RDF::URI.intern(options[:base_uri]) @@vocabulary_cache ||= {} + @version = options[:version] ? options[:version].to_sym : :rdfa_1_1 + @host_language = options[:host_language] || :xhtml + @doc = case input when Nokogiri::HTML::Document then input when Nokogiri::XML::Document then input else Nokogiri::XML.parse(input, @base_uri.to_s) end - raise RDF::ReaderError, "Synax errors:\n#{@doc.errors}" if !@doc.errors.empty? && @strict - raise RDF::ReaderError, "Empty document" if (@doc.nil? || @doc.root.nil?) && @strict + add_error(nil, "Empty document", RDF::RDFA.HostLanguageMarkupError) if (@doc.nil? || @doc.root.nil?) + add_warning(nil, "Synax errors:\n#{@doc.errors}", RDF::RDFA.HostLanguageMarkupError) unless @doc.errors.empty? + block.call(self) if block_given? end end ## @@ -129,20 +184,15 @@ # @yieldparam [RDF::Statement] statement # @return [void] def each_statement(&block) @callback = block - # Determine host language - # XXX - right now only XHTML defined - @host_language = case @doc.root.attributes["version"].to_s - when /XHTML+RDFa/ then :xhtml - end - - # If none found, assume xhtml - @host_language ||= :xhtml - - @host_defaults = {} + # Section 4.2 RDFa Host Language Conformance + # + # The Host Language may define a default RDFa Profile. If it does, the RDFa Profile triples that establish term or + # URI mappings associated with that profile must not change without changing the profile URI. RDFa Processors may + # embed, cache, or retrieve the RDFa Profile triples associated with that profile. @host_defaults = case @host_language when :xhtml { :vocabulary => RDF::XHV.to_s, :prefix => "xhv", @@ -154,10 +204,14 @@ } else {} end + @host_defaults.delete(:vocabulary) if @version == :rdfa_1_0 + + add_debug(@doc, "version = #{@version}, host_language = #{@host_language}") + # parse parse_whole_document(@doc, @base_uri) end ## @@ -183,36 +237,61 @@ end # Figure out the document path, if it is a Nokogiri::XML::Element or Attribute def node_path(node) case node - when Nokogiri::XML::Element, Nokogiri::XML::Attr then "#{node_path(node.parent)}/#{node.name}" - when String then node - else "" + when Nokogiri::XML::Node then node.display_path + else node.to_s end end # Add debug event to debug array, if specified # # @param [XML Node, any] node:: XML Node or string for showing context # @param [String] message:: def add_debug(node, message) - puts "#{node_path(node)}: #{message}" if $DEBUG - @debug << "#{node_path(node)}: #{message}" if @debug.is_a?(Array) + add_processor_message(node, message, RDF::RDFA.InformationalMessage) end + def add_info(node, message, process_class = RDF::RDFA.InformationalMessage) + add_processor_message(node, message, process_class) + end + + def add_warning(node, message, process_class = RDF::RDFA.MiscellaneousWarning) + add_processor_message(node, message, process_class) + end + + def add_error(node, message, process_class = RDF::RDFA.MiscellaneousError) + add_processor_message(node, message, process_class) + raise ParserException, message if @strict + end + + def add_processor_message(node, message, process_class) + puts "#{node_path(node)}: #{message}" if ::RDF::RDFa::debug? + @debug << "#{node_path(node)}: #{message}" if @debug.is_a?(Array) + if @processor_graph + @processor_sequence ||= 0 + n = RDF::Node.new + @processor_graph << RDF::Statement.new(n, RDF["type"], process_class) + @processor_graph << RDF::Statement.new(n, RDF::DC.description, message) + @processor_graph << RDF::Statement.new(n, RDF::DC.date, RDF::Literal::Date.new(DateTime.now.to_date)) + @processor_graph << RDF::Statement.new(n, RDF::RDFA.sequence, RDF::Literal::Integer.new(@processor_sequence += 1)) + @processor_graph << RDF::Statement.new(n, RDF::RDFA.source, node_path(node)) + end + end + # add a statement, object can be literal or URI or bnode # # @param [Nokogiri::XML::Node, any] node:: XML Node or string for showing context # @param [URI, BNode] subject:: the subject of the statement # @param [URI] predicate:: the predicate of the statement # @param [URI, BNode, Literal] object:: the object of the statement # @return [Statement]:: Added statement # @raise [ReaderError]:: Checks parameter types and raises if they are incorrect if parsing mode is _strict_. def add_triple(node, subject, predicate, object) statement = RDF::Statement.new(subject, predicate, object) - add_debug(node, "statement: #{statement}") + add_debug(node, "statement: #{statement.to_ntriples}") @callback.call(statement) end # Parsing an RDFa document (this is *not* the recursive method) @@ -227,103 +306,100 @@ @base_uri = RDF::URI.intern(base) add_debug(base_el, "parse_whole_doc: base='#{base}'") end # initialize the evaluation context with the appropriate base - evaluation_context = EvaluationContext.new(base, @host_defaults) + evaluation_context = EvaluationContext.new(@base_uri, @host_defaults) traverse(doc.root, evaluation_context) end - # Extract the XMLNS mappings from an element - def extract_mappings(element, uri_mappings, term_mappings) - # Process @profile - # Next the current element is parsed for any updates to the local term mappings and - # local list of URI mappings via @profile. - # If @profile is present, its value is processed as defined in RDFa Profiles. - element.attributes['profile'].to_s.split(/\s/).each do |profile| - if node_path(element) == "/html/head" - # Don't try to open ourselves! - add_debug(element, "extract_mappings: skip head profile <#{profile}>") - next - elsif @@vocabulary_cache[profile] - add_debug(element, "extract_mappings: cached profile <#{profile}>") - @@vocabulary_cache[profile] - elsif @base_uri.to_s == profile - # Don't try to open ourselves! - add_debug(element, "extract_mappings: skip recursive profile <#{profile}>") - next + # Parse and process URI mappings, Term mappings and a default vocabulary from @profile + # + # Yields each mapping + def process_profile(element) + element.attributes['profile'].to_s.split(/\s/).reverse.each do |profile| + # Don't try to open ourselves! + if @uri == profile + add_debug(element, "process_profile: skip recursive profile <#{profile}>") elsif @@vocabulary_cache.has_key?(profile) - add_debug(element, "extract_mappings: skip previously parsed profile <#{profile}>") + add_debug(element, "process_profile: skip previously parsed profile <#{profile}>") else begin - add_debug(element, "extract_mappings: parse profile <#{profile}>") @@vocabulary_cache[profile] = { :uri_mappings => {}, - :term_mappings => {} + :term_mappings => {}, + :default_vocabulary => nil } um = @@vocabulary_cache[profile][:uri_mappings] tm = @@vocabulary_cache[profile][:term_mappings] - add_debug(element, "extract_mappings: profile open <#{profile}>") - - old_debug, old_verbose, = $DEBUG, $verbose - $DEBUG, $verbose = false, false - # FIXME: format shouldn't need to be specified here - p_graph = RDF::Graph.load(profile, :base_uri => profile, :format => RDF::Format.for(profile) || :rdfa) - puts p_graph.inspect if old_debug - $DEBUG, $verbose = old_debug, old_verbose - p_graph.each_subject do |subject| - # If one of the objects is not a Literal no mapping is created. + add_debug(element, "process_profile: parse profile <#{profile}>") + + # Parse profile, and extract mappings from graph + old_debug, old_verbose, = ::RDF::RDFa::debug?, $verbose + ::RDF::RDFa::debug, $verbose = false, false + # Fixme, RDF isn't smart enough to figure this out from MIME-Type + load_opts = {:base_uri => profile} + load_opts[:format] = :rdfa unless RDF::Format.for(:file_name => profile) + p_graph = RDF::Graph.load(profile, load_opts) + ::RDF::RDFa::debug, $verbose = old_debug, old_verbose + p_graph.subjects.each do |subject| + # If one of the objects is not a Literal or if there are additional rdfa:uri or rdfa:term + # predicates sharing the same subject, no mapping is created. uri = p_graph.first_object([subject, RDF::RDFA['uri'], nil]) term = p_graph.first_object([subject, RDF::RDFA['term'], nil]) prefix = p_graph.first_object([subject, RDF::RDFA['prefix'], nil]) - add_debug(element, "extract_mappings: uri=#{uri.inspect}, term=#{term.inspect}, prefix=#{prefix.inspect}") + vocab = p_graph.first_object([subject, RDF::RDFA['vocabulary'], nil]) + add_debug(element, "process_profile: uri=#{uri.inspect}, term=#{term.inspect}, prefix=#{prefix.inspect}, vocabulary=#{vocab.inspect}") - next if !uri || (!term && !prefix) - raise RDF::ReaderError, "rdf:uri must be a Literal" unless uri.is_a?(RDF::Literal) - raise RDF::ReaderError, "rdf:term must be a Literal" unless term.nil? || term.is_a?(RDF::Literal) - raise RDF::ReaderError, "rdf:prefix must be a Literal" unless prefix.nil? || prefix.is_a?(RDF::Literal) - + raise RDF::ReaderError, "rdf:uri #{uri.inspect} must be a Literal" unless uri.nil? || uri.is_a?(RDF::Literal) + raise RDF::ReaderError, "rdf:term #{term.inspect} must be a Literal" unless term.nil? || term.is_a?(RDF::Literal) + raise RDF::ReaderError, "rdf:prefix #{prefix.inspect} must be a Literal" unless prefix.nil? || prefix.is_a?(RDF::Literal) + raise RDF::ReaderError, "rdf:vocabulary #{vocab.inspect} must be a Literal" unless vocab.nil? || vocab.is_a?(RDF::Literal) + + @@vocabulary_cache[profile][:default_vocabulary] = vocab.value if vocab + # For every extracted triple that is the common subject of an rdfa:prefix and an rdfa:uri # predicate, create a mapping from the object literal of the rdfa:prefix predicate to the # object literal of the rdfa:uri predicate. Add or update this mapping in the local list of # URI mappings after transforming the 'prefix' component to lower-case. # For every extracted - um[prefix.value.downcase] = uri.value if prefix + um[prefix.value.downcase] = uri.value if prefix && prefix.value != "_" # triple that is the common subject of an rdfa:term and an rdfa:uri predicate, create a # mapping from the object literal of the rdfa:term predicate to the object literal of the # rdfa:uri predicate. Add or update this mapping in the local term mappings. - tm[term.value] = RDF::URI.intern(uri.value) if term + tm[term.value.downcase] = RDF::URI.intern(uri.value) if term end - # FIXME: subject isn't in scope here - #rescue RDF::ReaderError - # add_debug(element, "extract_mappings: profile subject #{subject.to_s}: #{e.message}") - # raise if @strict - rescue RuntimeError => e - add_debug(element, "extract_mappings: profile: #{e.message}") - raise if @strict + rescue RDF::ReaderError => e + add_error(element, e.message, RDF::RDFA.ProfileReferenceError) + raise # Incase we're not in strict mode, we need to be sure processing stops end end - - # Merge mappings from this vocabulary - uri_mappings.merge!(@@vocabulary_cache[profile][:uri_mappings]) - term_mappings.merge!(@@vocabulary_cache[profile][:term_mappings]) + profile_mappings = @@vocabulary_cache[profile] + yield :uri_mappings, profile_mappings[:uri_mappings] unless profile_mappings[:uri_mappings].empty? + yield :term_mappings, profile_mappings[:term_mappings] unless profile_mappings[:term_mappings].empty? + yield :default_vocabulary, profile_mappings[:default_vocabulary] if profile_mappings[:default_vocabulary] end - + end + + # Extract the XMLNS mappings from an element + def extract_mappings(element, uri_mappings, term_mappings) # look for xmlns # (note, this may be dependent on @host_language) # Regardless of how the mapping is declared, the value to be mapped must be converted to lower case, # and the URI is not processed in any way; in particular if it is a relative path it is # not resolved against the current base. - element.namespaces.each do |attr_name, attr_value| - begin - abbr, prefix = attr_name.split(":") - uri_mappings[prefix.to_s.downcase] = attr_value.to_s if abbr.downcase == "xmlns" && prefix - rescue ReaderError => e - add_debug(element, "extract_mappings raised #{e.class}: #{e.message}") - raise if @strict + element.namespace_definitions.each do |ns| + # A Conforming RDFa Processor must ignore any definition of a mapping for the '_' prefix. + next if ns.prefix == "_" + + # Downcase prefix for RDFa 1.1 + pfx_lc = (@version == :rdfa_1_0 || ns.prefix.nil?) ? ns.prefix : ns.prefix.to_s.downcase + if ns.prefix + uri_mappings[pfx_lc] = ns.href + add_debug(element, "extract_mappings: xmlns:#{ns.prefix} => <#{ns.href}>") end end # Set mappings from @prefix # prefix is a whitespace separated list of prefix-name URI pairs of the form @@ -333,15 +409,16 @@ prefix, uri = mappings.shift.downcase, mappings.shift #puts "uri_mappings prefix #{prefix} <#{uri}>" next unless prefix.match(/:$/) prefix.chop! + # A Conforming RDFa Processor must ignore any definition of a mapping for the '_' prefix. + next if prefix == "_" + uri_mappings[prefix] = uri - end - - add_debug(element, "uri_mappings: #{uri_mappings.map{|k,v|"#{k}='#{v}'"}.join(", ")}") - add_debug(element, "term_mappings: #{term_mappings.map{|k,v|"#{k}='#{v}'"}.join(", ")}") + add_debug(element, "extract_mappings: prefix #{prefix} => <#{uri}>") + end unless @version == :rdfa_1_0 end # The recursive helper function def traverse(element, evaluation_context) if element.nil? @@ -350,11 +427,11 @@ return nil end add_debug(element, "traverse, ec: #{evaluation_context.inspect}") - # local variables [5.5 Step 1] + # local variables [7.5 Step 1] recurse = true skip = false new_subject = nil current_object_resource = nil uri_mappings = evaluation_context.uri_mappings.clone @@ -373,34 +450,57 @@ resource = attrs['resource'] href = attrs['href'] vocab = attrs['vocab'] # Pull out the attributes needed for the skip test. - property = attrs['property'].to_s if attrs['property'] - typeof = attrs['typeof'].to_s if attrs['typeof'] + property = attrs['property'].to_s.strip if attrs['property'] + typeof = attrs['typeof'].to_s.strip if attrs['typeof'] datatype = attrs['datatype'].to_s if attrs['datatype'] content = attrs['content'].to_s if attrs['content'] - rel = attrs['rel'].to_s if attrs['rel'] - rev = attrs['rev'].to_s if attrs['rev'] + rel = attrs['rel'].to_s.strip if attrs['rel'] + rev = attrs['rev'].to_s.strip if attrs['rev'] - # Default vocabulary [7.5 Step 2] - # First the current element is examined for any change to the default vocabulary via @vocab. + # Local term mappings [7.5 Steps 2] + # Next the current element is parsed for any updates to the local term mappings and local list of URI mappings via @profile. + # If @profile is present, its value is processed as defined in RDFa Profiles. + unless @version == :rdfa_1_0 + begin + process_profile(element) do |which, value| + add_debug(element, "[Step 2] traverse, #{which}: #{value.inspect}") + case which + when :uri_mappings then uri_mappings.merge!(value) + when :term_mappings then term_mappings.merge!(value) + when :default_vocabulary then default_vocabulary = value + end + end + rescue + # Skip this element and all sub-elements + # If any referenced RDFa Profile is not available, then the current element and its children must not place any + # triples in the default graph . + raise if @strict + return + end + end + + # Default vocabulary [7.5 Step 3] + # Next the current element is examined for any change to the default vocabulary via @vocab. # If @vocab is present and contains a value, its value updates the local default vocabulary. # If the value is empty, then the local default vocabulary must be reset to the Host Language defined default. unless vocab.nil? default_vocabulary = if vocab.to_s.empty? # Set default_vocabulary to host language default - @host_defaults.fetch(:voabulary, nil) + add_debug(element, "[Step 2] traverse, reset default_vocaulary to #{@host_defaults.fetch(:vocabulary, nil).inspect}") + @host_defaults.fetch(:vocabulary, nil) else - vocab.to_s + RDF::URI.intern(vocab) end add_debug(element, "[Step 2] traverse, default_vocaulary: #{default_vocabulary.inspect}") end - # Local term mappings [7.5 Steps 3 & 4] - # Next the current element is parsed for any updates to the local term mappings and local list of URI mappings via @profile. - # If @profile is present, its value is processed as defined in RDFa Profiles. + # Local term mappings [7.5 Steps 4] + # Next, the current element is then examined for URI mapping s and these are added to the local list of URI mappings. + # Note that a URI mapping will simply overwrite any current mapping in the list that has the same name extract_mappings(element, uri_mappings, term_mappings) # Language information [7.5 Step 5] # From HTML5 [3.2.3.3] # If both the lang attribute in no namespace and the lang attribute in the XML namespace are set @@ -417,94 +517,110 @@ end language = nil if language.to_s.empty? add_debug(element, "HTML5 [3.2.3.3] traverse, lang: #{language || 'nil'}") if attrs['lang'] # rels and revs - rels = process_uris(element, rel, evaluation_context, :uri_mappings => uri_mappings, :term_mappings => term_mappings, :vocab => default_vocabulary) - revs = process_uris(element, rev, evaluation_context, :uri_mappings => uri_mappings, :term_mappings => term_mappings, :vocab => default_vocabulary) + rels = process_uris(element, rel, evaluation_context, + :uri_mappings => uri_mappings, + :term_mappings => term_mappings, + :vocab => default_vocabulary, + :restrictions => TERMorCURIEorAbsURI[@version]) + revs = process_uris(element, rev, evaluation_context, + :uri_mappings => uri_mappings, + :term_mappings => term_mappings, + :vocab => default_vocabulary, + :restrictions => TERMorCURIEorAbsURI[@version]) add_debug(element, "traverse, about: #{about.nil? ? 'nil' : about}, src: #{src.nil? ? 'nil' : src}, resource: #{resource.nil? ? 'nil' : resource}, href: #{href.nil? ? 'nil' : href}") add_debug(element, "traverse, property: #{property.nil? ? 'nil' : property}, typeof: #{typeof.nil? ? 'nil' : typeof}, datatype: #{datatype.nil? ? 'nil' : datatype}, content: #{content.nil? ? 'nil' : content}") add_debug(element, "traverse, rels: #{rels.join(" ")}, revs: #{revs.join(" ")}") if !(rel || rev) # Establishing a new subject if no rel/rev [7.5 Step 6] # May not be valid, but can exist - if about - new_subject = process_uri(element, about, evaluation_context, :uri_mappings => uri_mappings) + new_subject = if about + process_uri(element, about, evaluation_context, + :uri_mappings => uri_mappings, + :restrictions => SafeCURIEorCURIEorURI[@version]) elsif src - new_subject = process_uri(element, src, evaluation_context) + process_uri(element, src, evaluation_context, :restrictions => [:uri]) elsif resource - new_subject = process_uri(element, resource, evaluation_context, :uri_mappings => uri_mappings) + process_uri(element, resource, evaluation_context, + :uri_mappings => uri_mappings, + :restrictions => SafeCURIEorCURIEorURI[@version]) elsif href - new_subject = process_uri(element, href, evaluation_context) + process_uri(element, href, evaluation_context, :restrictions => [:uri]) end # If no URI is provided by a resource attribute, then the first match from the following rules # will apply: # if @typeof is present, then new subject is set to be a newly created bnode. # otherwise, # if parent object is present, new subject is set to the value of parent object. # Additionally, if @property is not present then the skip element flag is set to 'true'; - if new_subject.nil? - if @host_language == :xhtml && element.name =~ /^(head|body)$/ && evaluation_context.base - # From XHTML+RDFa 1.1: - # if no URI is provided, then first check to see if the element is the head or body element. - # If it is, then act as if there is an empty @about present, and process it according to the rule for @about. - new_subject = RDF::URI.intern(evaluation_context.base) - elsif element.attributes['typeof'] - new_subject = RDF::Node.new - else - # if it's null, it's null and nothing changes - new_subject = evaluation_context.parent_object - skip = true unless property - end + new_subject ||= if @host_language == :xhtml && element.name =~ /^(head|body)$/ && evaluation_context.base + # From XHTML+RDFa 1.1: + # if no URI is provided, then first check to see if the element is the head or body element. + # If it is, then act as if there is an empty @about present, and process it according to the rule for @about. + evaluation_context.base + elsif element.attributes['typeof'] + RDF::Node.new + else + # if it's null, it's null and nothing changes + skip = true unless property + evaluation_context.parent_object end add_debug(element, "[Step 6] new_subject: #{new_subject}, skip = #{skip}") else # [7.5 Step 7] # If the current element does contain a @rel or @rev attribute, then the next step is to # establish both a value for new subject and a value for current object resource: - if about - new_subject = process_uri(element, about, evaluation_context, :uri_mappings => uri_mappings) - elsif src - new_subject = process_uri(element, src, evaluation_context, :uri_mappings => uri_mappings) - end + new_subject = process_uri(element, about, evaluation_context, + :uri_mappings => uri_mappings, + :restrictions => SafeCURIEorCURIEorURI[@version]) || + process_uri(element, src, evaluation_context, + :uri_mappings => uri_mappings, + :restrictions => [:uri]) # If no URI is provided then the first match from the following rules will apply - if new_subject.nil? - if @host_language == :xhtml && element.name =~ /^(head|body)$/ - # From XHTML+RDFa 1.1: - # if no URI is provided, then first check to see if the element is the head or body element. - # If it is, then act as if there is an empty @about present, and process it according to the rule for @about. - new_subject = RDF::URI.intern(evaluation_context.base) - elsif element.attributes['typeof'] - new_subject = RDF::Node.new - else - # if it's null, it's null and nothing changes - new_subject = evaluation_context.parent_object - # no skip flag set this time - end + new_subject ||= if @host_language == :xhtml && element.name =~ /^(head|body)$/ + # From XHTML+RDFa 1.1: + # if no URI is provided, then first check to see if the element is the head or body element. + # If it is, then act as if there is an empty @about present, and process it according to the rule for @about. + evaluation_context.base + elsif element.attributes['typeof'] + RDF::Node.new + else + # if it's null, it's null and nothing changes + evaluation_context.parent_object + # no skip flag set this time end # Then the current object resource is set to the URI obtained from the first match from the following rules: - if resource - current_object_resource = process_uri(element, resource, evaluation_context, :uri_mappings => uri_mappings) + current_object_resource = if resource + process_uri(element, resource, evaluation_context, + :uri_mappings => uri_mappings, + :restrictions => SafeCURIEorCURIEorURI[@version]) elsif href - current_object_resource = process_uri(element, href, evaluation_context) + process_uri(element, href, evaluation_context, + :restrictions => [:uri]) end add_debug(element, "[Step 7] new_subject: #{new_subject}, current_object_resource = #{current_object_resource.nil? ? 'nil' : current_object_resource}") end # Process @typeof if there is a subject [Step 8] if new_subject and typeof - # Typeof is TERMorCURIEorURIs - types = process_uris(element, typeof, evaluation_context, :uri_mappings => uri_mappings, :term_mappings => term_mappings, :vocab => default_vocabulary) + # Typeof is TERMorCURIEorAbsURIs + types = process_uris(element, typeof, evaluation_context, + :uri_mappings => uri_mappings, + :term_mappings => term_mappings, + :vocab => default_vocabulary, + :restrictions => TERMorCURIEorAbsURI[@version]) add_debug(element, "typeof: #{typeof}") types.each do |one_type| - add_triple(element, new_subject, RDF.type, one_type) + add_triple(element, new_subject, RDF["type"], one_type) end end # Generate triples with given object [Step 9] if current_object_resource @@ -529,34 +645,64 @@ end end # Establish current object literal [Step 11] if property - properties = process_uris(element, property, evaluation_context, :uri_mappings => uri_mappings, :term_mappings => term_mappings, :vocab => default_vocabulary) + properties = process_uris(element, property, evaluation_context, + :uri_mappings => uri_mappings, + :term_mappings => term_mappings, + :vocab => default_vocabulary, + :restrictions => TERMorCURIEorAbsURIprop[@version]) + properties.reject! do |p| + if p.is_a?(RDF::URI) + false + else + add_debug(element, "Illegal predicate: #{p.inspect}") + raise RDF::ReaderError, "predicate #{p.inspect} must be a URI" if @strict + true + end + end + # get the literal datatype - type = datatype children_node_types = element.children.collect{|c| c.class}.uniq # the following 3 IF clauses should be mutually exclusive. Written as is to prevent extensive indentation. - type_resource = process_uri(element, type, evaluation_context, :uri_mappings => uri_mappings, :term_mappings => term_mappings, :vocab => default_vocabulary) if type - if type and !type.empty? and (type_resource.to_s != RDF.XMLLiteral.to_s) + datatype = process_uri(element, datatype, evaluation_context, + :uri_mappings => uri_mappings, + :term_mappings => term_mappings, + :vocab => default_vocabulary, + :restrictions => TERMorCURIEorAbsURI[@version]) unless datatype.to_s.empty? + current_object_literal = if !datatype.to_s.empty? && datatype.to_s != RDF.XMLLiteral.to_s # typed literal - add_debug(element, "[Step 11] typed literal") - current_object_literal = RDF::Literal.new(content || element.inner_text.to_s, :datatype => type_resource, :language => language) - elsif content or (children_node_types == [Nokogiri::XML::Text]) or (element.children.length == 0) or (type == '') - # plain literal - add_debug(element, "[Step 11] plain literal") - current_object_literal = RDF::Literal.new(content || element.inner_text.to_s, :language => language) - elsif children_node_types != [Nokogiri::XML::Text] and (type == nil or type_resource.to_s == RDF.XMLLiteral.to_s) - # XML Literal - add_debug(element, "[Step 11] XML Literal: #{element.inner_html}") - current_object_literal = RDF::Literal.new(element.inner_html, :datatype => RDF.XMLLiteral, :language => language, :namespaces => uri_mappings.merge("" => "http://www.w3.org/1999/xhtml")) - recurse = false + add_debug(element, "[Step 11] typed literal (#{datatype})") + RDF::Literal.new(content || element.inner_text.to_s, :datatype => datatype, :language => language) + elsif @version == :rdfa_1_1 + if datatype.to_s == RDF.XMLLiteral.to_s + # XML Literal + add_debug(element, "[Step 11(1.1)] XML Literal: #{element.inner_html}") + recurse = false + RDF::Literal.new(element.inner_html, :datatype => RDF.XMLLiteral, :language => language, :namespaces => uri_mappings.merge("" => "http://www.w3.org/1999/xhtml")) + else + # plain literal + add_debug(element, "[Step 11(1.1)] plain literal") + RDF::Literal.new(content || element.inner_text.to_s, :language => language) + end + else + if content || (children_node_types == [Nokogiri::XML::Text]) || (element.children.length == 0) || datatype == "" + # plain literal + add_debug(element, "[Step 11 (1.0)] plain literal") + RDF::Literal.new(content || element.inner_text.to_s, :language => language) + elsif children_node_types != [Nokogiri::XML::Text] and (datatype == nil or datatype.to_s == RDF.XMLLiteral.to_s) + # XML Literal + add_debug(element, "[Step 11 (1.0)] XML Literal: #{element.inner_html}") + recurse = false + RDF::Literal.new(element.inner_html, :datatype => RDF.XMLLiteral, :language => language, :namespaces => uri_mappings.merge("" => "http://www.w3.org/1999/xhtml")) + end end - - # add each property + + # add each property properties.each do |p| add_triple(element, new_subject, p, current_object_literal) end # SPEC CONFUSION: "the triple has been created" ==> there may be more than one # set the recurse flag above in the IF about xmlliteral, as it is the only place that can happen @@ -609,46 +755,68 @@ traverse(child, new_ec) if child.class == Nokogiri::XML::Element end end end - # space-separated TERMorCURIEorURI + # space-separated TERMorCURIEorAbsURI or SafeCURIEorCURIEorURI def process_uris(element, value, evaluation_context, options) return [] if value.to_s.empty? add_debug(element, "process_uris: #{value}") value.to_s.split(/\s+/).map {|v| process_uri(element, v, evaluation_context, options)}.compact end def process_uri(element, value, evaluation_context, options = {}) - #return if value.to_s.empty? - #add_debug(element, "process_uri: #{value}") + return if value.nil? + restrictions = options[:restrictions] + add_debug(element, "process_uri: #{value}, restrictions = #{restrictions.inspect}") options = {:uri_mappings => {}}.merge(options) - if !options[:term_mappings] && options[:uri_mappings] && value.to_s.match(/^\[(.*)\]$/) + if !options[:term_mappings] && options[:uri_mappings] && value.to_s.match(/^\[(.*)\]$/) && restrictions.include?(:safe_curie) # SafeCURIEorCURIEorURI # When the value is surrounded by square brackets, then the content within the brackets is # evaluated as a CURIE according to the CURIE Syntax definition. If it is not a valid CURIE, the # value must be ignored. - uri = curie_to_resource_or_bnode(element, $1, options[:uri_mappings], evaluation_context.parent_subject) + uri = curie_to_resource_or_bnode(element, $1, options[:uri_mappings], evaluation_context.parent_subject, restrictions) add_debug(element, "process_uri: #{value} => safeCURIE => <#{uri}>") uri - elsif options[:term_mappings] && NC_REGEXP.match(value.to_s) - # TERMorCURIEorURI + elsif options[:term_mappings] && NC_REGEXP.match(value.to_s) && restrictions.include?(:term) + # TERMorCURIEorAbsURI # If the value is an NCName, then it is evaluated as a term according to General Use of Terms in # Attributes. Note that this step may mean that the value is to be ignored. - uri = process_term(value.to_s, options) + uri = process_term(element, value.to_s, options) add_debug(element, "process_uri: #{value} => term => <#{uri}>") uri else - # SafeCURIEorCURIEorURI or TERMorCURIEorURI + # SafeCURIEorCURIEorURI or TERMorCURIEorAbsURI # Otherwise, the value is evaluated as a CURIE. # If it is a valid CURIE, the resulting URI is used; otherwise, the value will be processed as a URI. - uri = curie_to_resource_or_bnode(element, value, options[:uri_mappings], evaluation_context.parent_subject) + uri = curie_to_resource_or_bnode(element, value, options[:uri_mappings], evaluation_context.parent_subject, restrictions) if uri add_debug(element, "process_uri: #{value} => CURIE => <#{uri}>") - else - ## FIXME: throw exception if there is no base uri set? - uri = RDF::URI.intern(RDF::URI.intern(evaluation_context.base).join(value)) + elsif @version == :rdfa_1_0 && value.to_s.match(/^xml/i) + # Special case to not allow anything starting with XML to be treated as a URI + elsif restrictions.include?(:absuri) || restrictions.include?(:uri) + begin + # AbsURI does not use xml:base + if restrictions.include?(:absuri) + uri = RDF::URI.intern(value) + unless uri.absolute? + uri = nil + raise RDF::ReaderError, "Relative URI #{value}" + end + else + uri = evaluation_context.base.join(Addressable::URI.parse(value)) + end + rescue Addressable::URI::InvalidURIError => e + add_warning(element, "Malformed prefix #{value}", RDF::RDFA.UndefinedPrefixError) + rescue RDF::ReaderError => e + add_debug(element, e.message) + if value.to_s =~ /^\(^\w\):/ + add_warning(element, "Undefined prefix #{$1}", RDF::RDFA.UndefinedPrefixError) + else + add_warning(element, "Relative URI #{value}") + end + end add_debug(element, "process_uri: #{value} => URI => <#{uri}>") end uri end end @@ -657,49 +825,57 @@ # # @param [String] term:: term # @param [Hash] options:: Parser options, one of # <em>options[:term_mappings]</em>:: Term mappings # <em>options[:vocab]</em>:: Default vocabulary - def process_term(value, options) + def process_term(element, value, options) case when options[:term_mappings].is_a?(Hash) && options[:term_mappings].has_key?(value.to_s.downcase) # If the term is in the local term mappings, use the associated URI. # XXX Spec Confusion: are terms always downcased? Or only for XHTML Vocab? options[:term_mappings][value.to_s.downcase] when options[:vocab] # Otherwise, if there is a local default vocabulary the URI is obtained by concatenating that value and the term. RDF::URI.intern(options[:vocab] + value) else # Finally, if there is no local default vocabulary, the term has no associated URI and must be ignored. + add_warning(element, "Term #{value} is not defined", RDF::RDFA.UndefinedTermError) nil end end # From section 6. CURIE Syntax Definition - def curie_to_resource_or_bnode(element, curie, uri_mappings, subject) + def curie_to_resource_or_bnode(element, curie, uri_mappings, subject, restrictions) # URI mappings for CURIEs default to XHV, rather than the default doc namespace prefix, reference = curie.to_s.split(":") # consider the bnode situation - if prefix == "_" + if prefix == "_" && restrictions.include?(:bnode) + # we force a non-nil name, otherwise it generates a new name + # As a special case, _: is also a valid reference for one specific bnode. bnode(reference) elsif curie.to_s.match(/^:/) + add_debug(element, "curie_to_resource_or_bnode: default prefix: defined? #{!!uri_mappings[""]}, defaults: #{@host_defaults[:prefix]}") # Default prefix if uri_mappings[""] RDF::URI.intern(uri_mappings[""] + reference.to_s) elsif @host_defaults[:prefix] RDF::URI.intern(uri_mappings[@host_defaults[:prefix]] + reference.to_s) + else + #add_warning(element, "Default namespace prefix is not defined", RDF::RDFA.UndefinedPrefixError) + nil end elsif !curie.to_s.match(/:/) # No prefix, undefined (in this context, it is evaluated as a term elsewhere) nil else # Prefixes always downcased - ns = uri_mappings[prefix.to_s.downcase] + prefix = prefix.to_s.downcase unless @version == :rdfa_1_0 + ns = uri_mappings[prefix.to_s] if ns RDF::URI.intern(ns + reference.to_s) else - add_debug(element, "curie_to_resource_or_bnode No namespace mapping for #{prefix.downcase}") + #add_debug(element, "curie_to_resource_or_bnode No namespace mapping for #{prefix}") nil end end end end \ No newline at end of file