lib/rdf/rdfa/reader.rb in rdf-rdfa-0.3.1.2 vs lib/rdf/rdfa/reader.rb in rdf-rdfa-0.3.3

- old
+ new

@@ -4,27 +4,28 @@ ## # An RDFa parser in Ruby # # Based on processing rules described here: # @see http://www.w3.org/TR/rdfa-syntax/#s_model RDFa 1.0 - # @see http://www.w3.org/2010/02/rdfa/drafts/2010/WD-rdfa-core-20101026/ RDFa 1.1 + # @see http://www.w3.org/TR/2010/WD-rdfa-core-20100422/ RDFa Core 1.1 + # @see http://www.w3.org/TR/2010/WD-xhtml-rdfa-20100422/ XHTML+RDFa 1.1 # # @author [Gregg Kellogg](http://kellogg-assoc.com/) class Reader < RDF::Reader format Format SafeCURIEorCURIEorURI = { - :rdfa_1_0 => [:term, :safe_curie, :uri, :bnode], - :rdfa_1_1 => [:safe_curie, :curie, :term, :uri, :bnode], + :"rdfa1.0" => [:term, :safe_curie, :uri, :bnode], + :"rdfa1.1" => [:safe_curie, :curie, :term, :uri, :bnode], } TERMorCURIEorAbsURI = { - :rdfa_1_0 => [:term, :curie], - :rdfa_1_1 => [:term, :curie, :absuri], + :"rdfa1.0" => [:term, :curie], + :"rdfa1.1" => [:term, :curie, :absuri], } TERMorCURIEorAbsURIprop = { - :rdfa_1_0 => [:curie], - :rdfa_1_1 => [:term, :curie, :absuri], + :"rdfa1.0" => [:curie], + :"rdfa1.1" => [:term, :curie, :absuri], } NC_REGEXP = Regexp.new( %{^ (?!\\\\u0301) # &#x301; is a non-spacing acute accent. @@ -37,15 +38,15 @@ )* $}, Regexp::EXTENDED) # Host language - # @return [:xhtml, :svg] + # @return [:xml1, :xhtml1, :xhtml5, :html4, :html5, :svg] attr_reader :host_language # Version - # @return [:rdfa_1_0, :rdfa_1_1] + # @return [:"rdfa1.0", :"rdfa1.1"] attr_reader :version # The Recursive Baggage # @private class EvaluationContext # :nodoc: @@ -145,11 +146,11 @@ def inspect v = %w(base parent_subject parent_object language default_vocabulary).map {|a| "#{a}='#{self.send(a).nil? ? '<nil>' : self.send(a)}'"} v << "uri_mappings[#{uri_mappings.keys.length}]" v << "incomplete_triples[#{incomplete_triples.length}]" v << "term_mappings[#{term_mappings.keys.length}]" - v.join(",") + v.join(", ") end end ## # Initializes the RDFa reader instance. @@ -168,13 +169,13 @@ # whether to intern all parsed URIs # @option options [Hash] :prefixes (Hash.new) # the prefix mappings to use (not supported by all readers) # @option options [#to_s] :base_uri (nil) # the base URI to use when resolving relative URIs - # @option options [:xhtml] :host_language (:xhtml) + # @option options [:xml1, :xhtml1, :xhtml5, :html4, :html5, :svg] :host_language (:xhtml1) # Host Language - # @option options [:rdfa_1_0, :rdfa_1_1] :version (:rdfa_1_1) + # @option options [:"rdfa1.0", :"rdfa1.1"] :version (:"rdfa1.1") # Parser version information # @option options [Graph] :processor_graph (nil) # Graph to record information, warnings and errors. # @option options [Repository] :profile_repository (nil) # Repository to save profile graphs. @@ -188,29 +189,98 @@ def initialize(input = $stdin, options = {}, &block) super do @debug = options[:debug] @base_uri = uri(options[:base_uri]) - @version = options[:version] ? options[:version].to_sym : :rdfa_1_1 + @host_language = options[:host_language] @processor_graph = options[:processor_graph] @doc = case input - when Nokogiri::HTML::Document then input - when Nokogiri::XML::Document then input - else Nokogiri::XML.parse(input, @base_uri.to_s) + when Nokogiri::HTML::Document + @host_language ||= :xhtml1 + when Nokogiri::XML::Document + input + else + # Intuit from content type + @host_language ||= case input.respond_to?(:content_type) && input.content_type + when "text/xml", "application/xml" + :xml1 + when "text/html", "application/xhtml+xml" + :xhtml1 + when "image/svg+xml" + :svg + end + + # Intuit from file extension + @host_language ||= case input.respond_to?(:path) && File.extname(input.path.to_s) + when ".html" then :html5 + when ".xhtml" then :xhtml1 + when ".svg" then :svg + end + + Nokogiri::XML.parse(input, @base_uri.to_s) end - @host_language = options[:host_language] || case @doc.root.name.downcase.to_sym - when :html then :xhtml + if (@doc.nil? || @doc.root.nil?) + add_error(nil, "Empty document", RDF::RDFA.DocumentError) + raise RDF::ReaderError, "Empty Document" + end + add_warning(nil, "Synax errors:\n#{@doc.errors}", RDF::RDFA.DocumentError) if !@doc.errors.empty? && validate? + + @version = options[:version] ? options[:version].to_sym : nil + + # Check for version of the processor to use: + # * Check document type for "XHTML+RDFa 1.0" + # * Check @version attribute on the html element for the value "XHTML+RDFa 1.0" + @version ||= :"rdfa1.0" if @doc.doctype.to_s =~ /RDFa 1\.0/ + @version ||= :"rdfa1.0" if @doc.root && @doc.root.attribute("version").to_s =~ /RDFa 1\.0/ + @version ||= :"rdfa1.1" if @doc.root && @doc.root.attribute("version").to_s =~ /RDFa 1\.1/ + @version ||= :"rdfa1.1" + + # Intuit host_language from doctype + @host_language ||= case @doc.doctype.to_s + when /html 4/i then :html4 + when /xhtml\+rdfa/i then :xhtml1 + when /html/ then :html5 + end + + # Determine host language from element name + @host_language ||= case @doc.root.name.downcase.to_sym + when :html then :xhtml1 when :svg then :svg - else :xhtml + else :xml end - add_error(nil, "Empty document", RDF::RDFA.DocumentError) if (@doc.nil? || @doc.root.nil?) - add_warning(nil, "Synax errors:\n#{@doc.errors}", RDF::RDFA.DocumentError) if !@doc.errors.empty? && validate? - add_error("Empty document") if (@doc.nil? || @doc.root.nil?) && validate? + # Otherwise, treat it as XML + @host_language ||= :xml1 + # Section 4.2 RDFa Host Language Conformance + # + # The Host Language may require the automatic inclusion of one or more default RDFa Profiles. + @host_defaults = { + :vocabulary => nil, + :uri_mappings => {}, + :profiles => [], + } + + if @version == :"rdfa1.0" + # Add default term mappings + @host_defaults[:term_mappings] = %w( + alternate appendix bookmark cite chapter contents copyright first glossary help icon index + last license meta next p3pv1 prev role section stylesheet subsection start top up + ).inject({}) { |hash, term| hash[term] = RDF::XHV[term]; hash } + end + + case @host_language + when :xml1, :svg + @host_defaults[:profiles] = [XML_RDFA_PROFILE] + when :xhtml1, :xhtml5, :html4, :html5 + @host_defaults[:profiles] = [XML_RDFA_PROFILE, XHTML_RDFA_PROFILE] + end + + add_info(@doc, "version = #{@version}, host_language = #{@host_language}") + block.call(self) if block_given? end self.profile_repository = options[:profile_repository] if options[:profile_repository] end @@ -232,39 +302,15 @@ # @yieldparam [RDF::Statement] statement # @return [void] def each_statement(&block) @callback = block - # Section 4.2 RDFa Host Language Conformance - # - # The Host Language may define a default RDFa Profile. If it does, the RDFa Profile triples that establish term or - # URI mappings associated with that profile must not change without changing the profile URI. RDFa Processors may - # embed, cache, or retrieve the RDFa Profile triples associated with that profile. - @host_defaults = case @host_language - when :xhtml - { - :vocabulary => nil, - :prefix => "xhv", - :uri_mappings => {"xhv" => RDF::XHV.to_s}, # RDF::XHTML is wrong - :term_mappings => %w( - alternate appendix bookmark cite chapter contents copyright first glossary help icon index - last license meta next p3pv1 prev role section stylesheet subsection start top up - ).inject({}) { |hash, term| hash[term.to_sym] = RDF::XHV[term].to_s; hash }, - } - else - { - :uri_mappings => {}, - } - end - # Add prefix definitions from host defaults @host_defaults[:uri_mappings].each_pair do |prefix, value| prefix(prefix, value) end - add_info(@doc, "version = #{@version}, host_language = #{@host_language}") - # parse parse_whole_document(@doc, @base_uri) end ## @@ -317,23 +363,21 @@ add_processor_message(node, message, process_class) raise RDF::ReaderError, message if validate? end def add_processor_message(node, message, process_class) - puts "#{node_path(node)}: #{message}" if ::RDF::RDFa::debug? + puts "#{node_path(node)}: #{message}" if ::RDF::RDFa.debug? @debug << "#{node_path(node)}: #{message}" if @debug.is_a?(Array) if @processor_graph - @processor_sequence ||= 0 n = RDF::Node.new @processor_graph << RDF::Statement.new(n, RDF["type"], process_class) @processor_graph << RDF::Statement.new(n, RDF::DC.description, message) @processor_graph << RDF::Statement.new(n, RDF::DC.date, RDF::Literal::Date.new(DateTime.now)) - @processor_graph << RDF::Statement.new(n, RDF::RDFA.sequence, RDF::Literal::Integer.new(@processor_sequence += 1)) @processor_graph << RDF::Statement.new(n, RDF::RDFA.context, @base_uri) nc = RDF::Node.new @processor_graph << RDF::Statement.new(nc, RDF["type"], RDF::PTR.XPathPointer) - @processor_graph << RDF::Statement.new(nc, RDF::PTR.expression, node.path) + @processor_graph << RDF::Statement.new(nc, RDF::PTR.expression, node.path) if node.respond_to?(:path) @processor_graph << RDF::Statement.new(n, RDF::RDFA.context, nc) end end # add a statement, object can be literal or URI or bnode @@ -353,13 +397,16 @@ # Parsing an RDFa document (this is *not* the recursive method) def parse_whole_document(doc, base) # find if the document has a base element case @host_language - when :xhtml + when :xhtml1, :xhtml5, :html4, :html5 base_el = doc.at_css("html>head>base") base = base_el.attribute("href").to_s.split("#").first if base_el + else + xml_base = doc.root.attribute_with_ns("base", RDF::XML.to_s) + base = xml_base if xml_base end if (base) # Strip any fragment from base base = base.to_s.split("#").first @@ -368,44 +415,60 @@ end # initialize the evaluation context with the appropriate base evaluation_context = EvaluationContext.new(base, @host_defaults) + if @version != :"rdfa1.0" + # Process default vocabularies + process_profile(doc.root, @host_defaults[:profiles]) do |which, value| + add_debug(doc.root, "parse_whole_document, #{which}: #{value.inspect}") + case which + when :uri_mappings then evaluation_context.uri_mappings.merge!(value) + when :term_mappings then evaluation_context.term_mappings.merge!(value) + when :default_vocabulary then evaluation_context.default_vocabulary = value + end + end + end + traverse(doc.root, evaluation_context) add_debug("", "parse_whole_doc: traversal complete'") end # Parse and process URI mappings, Term mappings and a default vocabulary from @profile # # Yields each mapping def process_profile(element, profiles) profiles. - reverse. map {|uri| uri(uri).normalize}. each do |uri| # Don't try to open ourselves! if @base_uri == uri add_debug(element, "process_profile: skip recursive profile <#{uri}>") next end old_debug = RDF::RDFa.debug? - #RDF::RDFa.debug = false - add_info(element, "process_profile: load <#{uri}>") - next unless profile = Profile.find(uri) - RDF::RDFa.debug = old_debug + begin + add_info(element, "process_profile: load <#{uri}>") + RDF::RDFa.debug = false + profile = Profile.find(uri) + rescue Exception => e + RDF::RDFa.debug = old_debug + add_error(element, e.message, RDF::RDFA.ProfileReferenceError) + raise # In case we're not in strict mode, we need to be sure processing stops + ensure + RDF::RDFa.debug = old_debug + end + # Add URI Mappings to prefixes profile.prefixes.each_pair do |prefix, value| prefix(prefix, value) end yield :uri_mappings, profile.prefixes unless profile.prefixes.empty? yield :term_mappings, profile.terms unless profile.terms.empty? yield :default_vocabulary, profile.vocabulary if profile.vocabulary end - rescue Exception => e - add_error(element, e.message, RDF::RDFA.ProfileReferenceError) - raise # In case we're not in strict mode, we need to be sure processing stops end # Extract the XMLNS mappings from an element def extract_mappings(element, uri_mappings, namespaces) # look for xmlns @@ -416,11 +479,11 @@ element.namespace_definitions.each do |ns| # A Conforming RDFa Processor must ignore any definition of a mapping for the '_' prefix. next if ns.prefix == "_" # Downcase prefix for RDFa 1.1 - pfx_lc = (@version == :rdfa_1_0 || ns.prefix.nil?) ? ns.prefix : ns.prefix.to_s.downcase + pfx_lc = (@version == :"rdfa1.0" || ns.prefix.nil?) ? ns.prefix : ns.prefix.to_s.downcase if ns.prefix uri_mappings[pfx_lc.to_sym] = ns.href namespaces[pfx_lc] ||= ns.href prefix(pfx_lc, ns.href) add_info(element, "extract_mappings: xmlns:#{ns.prefix} => <#{ns.href}>") @@ -430,24 +493,29 @@ end # Set mappings from @prefix # prefix is a whitespace separated list of prefix-name URI pairs of the form # NCName ':' ' '+ xs:anyURI - mappings = element.attributes["prefix"].to_s.split(/\s+/) + mappings = element.attribute("prefix").to_s.strip.split(/\s+/) while mappings.length > 0 do prefix, uri = mappings.shift.downcase, mappings.shift #puts "uri_mappings prefix #{prefix} <#{uri}>" next unless prefix.match(/:$/) prefix.chop! + unless prefix.match(NC_REGEXP) + add_error(element, "extract_mappings: Prefix #{prefix.inspect} does not match NCName production") + next + end + # A Conforming RDFa Processor must ignore any definition of a mapping for the '_' prefix. next if prefix == "_" uri_mappings[prefix.to_s.empty? ? nil : prefix.to_s.to_sym] = uri prefix(prefix, uri) add_info(element, "extract_mappings: prefix #{prefix} => <#{uri}>") - end unless @version == :rdfa_1_0 + end unless @version == :"rdfa1.0" end # The recursive helper function def traverse(element, evaluation_context) if element.nil? @@ -478,11 +546,11 @@ src = attrs['src'] resource = attrs['resource'] href = attrs['href'] vocab = attrs['vocab'] xml_base = element.attribute_with_ns("base", RDF::XML.to_s) - base = xml_base.to_s if xml_base && @host_language != :xhtml + base = xml_base.to_s if xml_base && ![:xhtml1, :xhtml5, :html4, :html5].include?(@host_language) base ||= evaluation_context.base # Pull out the attributes needed for the skip test. property = attrs['property'].to_s.strip if attrs['property'] typeof = attrs['typeof'].to_s.strip if attrs['typeof'] @@ -510,18 +578,33 @@ add_debug(element, "traverse " + attrs.map{|a| "#{a.first}: #{a.last}"}.join(", ")) unless attrs.empty? # Local term mappings [7.5 Steps 2] # Next the current element is parsed for any updates to the local term mappings and local list of URI mappings via @profile. # If @profile is present, its value is processed as defined in RDFa Profiles. - unless @version == :rdfa_1_0 + unless @version == :"rdfa1.0" begin process_profile(element, profiles) do |which, value| add_debug(element, "[Step 2] traverse, #{which}: #{value.inspect}") case which - when :uri_mappings then uri_mappings.merge!(value) - when :term_mappings then term_mappings.merge!(value) - when :default_vocabulary then default_vocabulary = value + when :uri_mappings + value.each do |k, v| + if k.to_s.match(NC_REGEXP) + uri_mappings[k] = v + else + add_error(element, "[Step 2] traverse: Prefix #{k.to_s.inspect} does not match NCName production") + end + end + when :term_mappings + value.each do |k, v| + if k.to_s.match(NC_REGEXP) + term_mappings[k] = v + else + add_error(element, "[Step 2] traverse: Term #{k.to_s.inspect} does not match NCName production") + end + end + when :default_vocabulary + default_vocabulary = value end end rescue # Skip this element and all sub-elements # If any referenced RDFa Profile is not available, then the current element and its children must not place any @@ -536,16 +619,16 @@ # If @vocab is present and contains a value, its value updates the local default vocabulary. # If the value is empty, then the local default vocabulary must be reset to the Host Language defined default. unless vocab.nil? default_vocabulary = if vocab.to_s.empty? # Set default_vocabulary to host language default - add_debug(element, "[Step 2] traverse, reset default_vocaulary to #{@host_defaults.fetch(:vocabulary, nil).inspect}") + add_debug(element, "[Step 3] traverse, reset default_vocaulary to #{@host_defaults.fetch(:vocabulary, nil).inspect}") @host_defaults.fetch(:vocabulary, nil) else uri(vocab) end - add_debug(element, "[Step 2] traverse, default_vocaulary: #{default_vocabulary.inspect}") + add_debug(element, "[Step 3] traverse, default_vocaulary: #{default_vocabulary.inspect}") end # Local term mappings [7.5 Steps 4] # Next, the current element is then examined for URI mapping s and these are added to the local list of URI mappings. # Note that a URI mapping will simply overwrite any current mapping in the list that has the same name @@ -558,12 +641,12 @@ # attribute in no namespace must be ignored for the purposes of determining the element's # language. language = case when element.at_xpath("@xml:lang", "xml" => RDF::XML["uri"].to_s) element.at_xpath("@xml:lang", "xml" => RDF::XML["uri"].to_s).to_s - when element.at_xpath("lang") - element.at_xpath("lang").to_s + when element.at_xpath("@lang") + element.at_xpath("@lang").to_s else language end language = nil if language.to_s.empty? add_debug(element, "HTML5 [3.2.3.3] traverse, lang: #{language || 'nil'}") if language @@ -603,19 +686,18 @@ # will apply: # if @typeof is present, then new subject is set to be a newly created bnode. # otherwise, # if parent object is present, new subject is set to the value of parent object. # Additionally, if @property is not present then the skip element flag is set to 'true'; - new_subject ||= if @host_language == :xhtml && element.name =~ /^(head|body)$/ && base + new_subject ||= if element == @doc.root && base + uri(base) + elsif [:xhtml1, :xhtml5, :html4, :html5].include?(@host_language) && element.name =~ /^(head|body)$/ # From XHTML+RDFa 1.1: # if no URI is provided, then first check to see if the element is the head or body element. # If it is, then act as if there is an empty @about present, and process it according to the rule for @about. uri(base) - elsif @host_language != :xhtml && base - # XXX Spec confusion, assume that this is true - uri(base) - elsif element.attributes['typeof'] + elsif typeof RDF::Node.new else # if it's null, it's null and nothing changes skip = true unless property evaluation_context.parent_object @@ -631,11 +713,13 @@ process_uri(element, src, evaluation_context, base, :uri_mappings => uri_mappings, :restrictions => [:uri]) # If no URI is provided then the first match from the following rules will apply - new_subject ||= if @host_language == :xhtml && element.name =~ /^(head|body)$/ + new_subject ||= if element == @doc.root && base + uri(base) + elsif [:xhtml1, :xhtml5, :html4, :html5].include?(@host_language) && element.name =~ /^(head|body)$/ # From XHTML+RDFa 1.1: # if no URI is provided, then first check to see if the element is the head or body element. # If it is, then act as if there is an empty @about present, and process it according to the rule for @about. uri(base) elsif element.attributes['typeof'] @@ -672,11 +756,11 @@ add_triple(element, new_subject, RDF["type"], one_type) end end # Generate triples with given object [Step 9] - if current_object_resource + if new_subject and current_object_resource rels.each do |r| add_triple(element, new_subject, r, current_object_resource) end revs.each do |r| @@ -725,11 +809,11 @@ begin current_object_literal = if !datatype.to_s.empty? && datatype.to_s != RDF.XMLLiteral.to_s # typed literal add_debug(element, "[Step 11] typed literal (#{datatype})") RDF::Literal.new(content || element.inner_text.to_s, :datatype => datatype, :language => language, :validate => validate?, :canonicalize => canonicalize?) - elsif @version == :rdfa_1_1 + elsif @version == :"rdfa1.1" if datatype.to_s == RDF.XMLLiteral.to_s # XML Literal add_debug(element, "[Step 11(1.1)] XML Literal: #{element.inner_html}") # In order to maintain maximum portability of this literal, any children of the current node that are @@ -772,13 +856,13 @@ end rescue ArgumentError => e add_error(element, e.message) end - # add each property + # add each property properties.each do |p| - add_triple(element, new_subject, p, current_object_literal) + add_triple(element, new_subject, p, current_object_literal) if new_subject end end if not skip and new_subject && !evaluation_context.incomplete_triples.empty? # Complete the incomplete triples from the evaluation context [Step 12] @@ -865,11 +949,11 @@ # Otherwise, the value is evaluated as a CURIE. # If it is a valid CURIE, the resulting URI is used; otherwise, the value will be processed as a URI. uri = curie_to_resource_or_bnode(element, value, options[:uri_mappings], evaluation_context.parent_subject, restrictions) if uri add_debug(element, "process_uri: #{value} => CURIE => <#{uri}>") - elsif @version == :rdfa_1_0 && value.to_s.match(/^xml/i) + elsif @version == :"rdfa1.0" && value.to_s.match(/^xml/i) # Special case to not allow anything starting with XML to be treated as a URI elsif restrictions.include?(:absuri) || restrictions.include?(:uri) begin # AbsURI does not use xml:base if restrictions.include?(:absuri) @@ -933,25 +1017,17 @@ if prefix == "_" && restrictions.include?(:bnode) # we force a non-nil name, otherwise it generates a new name # As a special case, _: is also a valid reference for one specific bnode. bnode(reference) elsif curie.to_s.match(/^:/) - add_debug(element, "curie_to_resource_or_bnode: default prefix: defined? #{!!uri_mappings[""]}, defaults: #{@host_defaults[:prefix]}") # Default prefix - if uri_mappings[nil] - uri(uri_mappings[nil] + reference.to_s) - elsif @host_defaults[:prefix] - uri(uri_mappings[@host_defaults[:prefix]] + reference.to_s) - else - #add_warning(element, "Default namespace prefix is not defined", RDF::RDFA.UnresolvedCURIE) - nil - end + RDF::XHV[reference.to_s] elsif !curie.to_s.match(/:/) # No prefix, undefined (in this context, it is evaluated as a term elsewhere) nil else # Prefixes always downcased - prefix = prefix.to_s.downcase unless @version == :rdfa_1_0 + prefix = prefix.to_s.downcase unless @version == :"rdfa1.0" add_debug(element, "curie_to_resource_or_bnode check for #{prefix.to_s.to_sym.inspect} in #{uri_mappings.inspect}") ns = uri_mappings[prefix.to_s.to_sym] if ns uri(ns + reference.to_s) else \ No newline at end of file