begin raise LoadError, "not with java" if RUBY_PLATFORM == "java" require 'nokogiri' rescue LoadError => e :rexml end require 'rdf/ntriples' require 'rdf/xsd' module RDF::RDFa ## # An RDFa parser in Ruby # # This class supports [Nokogiri][] for HTML # processing, and will automatically select the most performant # implementation (Nokogiri or LibXML) that is available. If need be, you # can explicitly override the used implementation by passing in a # `:library` option to `Reader.new` or `Reader.open`. # # [Nokogiri]: http://nokogiri.org/ # # Based on processing rules described here: # @see http://www.w3.org/TR/rdfa-syntax/#s_model RDFa 1.0 # @see http://www.w3.org/TR/2012/CR-rdfa-core-20120313/ # @see http://www.w3.org/TR/2012/CR-xhtml-rdfa-20120313/ # @see http://dev.w3.org/html5/rdfa/ # # @author [Gregg Kellogg](http://kellogg-assoc.com/) class Reader < RDF::Reader format Format include Expansion XHTML = "http://www.w3.org/1999/xhtml" # Content model for @about and @resource. In RDFa 1.0, this was URIorSafeCURIE SafeCURIEorCURIEorIRI = { :"rdfa1.0" => [:safe_curie, :uri, :bnode], :"rdfa1.1" => [:safe_curie, :curie, :uri, :bnode], } # Content model for @datatype. In RDFa 1.0, this was CURIE # Also plural TERMorCURIEorAbsIRIs, content model for @rel, @rev, @property and @typeof TERMorCURIEorAbsIRI = { :"rdfa1.0" => [:term, :curie], :"rdfa1.1" => [:term, :curie, :absuri], } # This expression matches an NCName as defined in # [XML-NAMES](http://www.w3.org/TR/2009/REC-xml-names-20091208/#NT-NCName) # # @see http://www.w3.org/TR/2009/REC-xml-names-20091208/#NT-NCName NC_REGEXP = Regexp.new( %{^ ( [a-zA-Z_] | \\\\u[0-9a-fA-F]{4} ) ( [0-9a-zA-Z_\.-/] | \\\\u([0-9a-fA-F]{4}) )* $}, Regexp::EXTENDED) # This expression matches an term as defined in # [RDFA-CORE](# @see http://www.w3.org/TR/2012/CR-rdfa-core-20120313/#s_terms) # # For the avoidance of doubt, this definition means a 'term' # in RDFa is an XML NCName that also permits slash as a non-leading character. 
# @see http://www.w3.org/TR/2012/CR-rdfa-core-20120313/#s_terms
#
# The hyphen is listed last in the trailing character class so that it is a
# literal `-`; written as `.-/` it formed a range covering only `.` and `/`,
# which rejected hyphenated terms.
TERM_REGEXP = Regexp.new(
  %{^
    (?!\\\\u0301)            # ́ is a non-spacing acute accent.
                             # It is legal within an XML Name, but not as the first character.
    (  [a-zA-Z_]
     | \\\\u[0-9a-fA-F]{4}
    )
    (  [0-9a-zA-Z_\./-]
     | \\\\u([0-9a-fA-F]{4})
    )*
  $},
  Regexp::EXTENDED)

# Host language
# @attr [:xml, :xhtml1, :xhtml5, :html4, :html5, :svg]
attr_reader :host_language

# Version
# @attr [:"rdfa1.0", :"rdfa1.1"]
attr_reader :version

# The Recursive Baggage
#
# Holds the per-element processing state: base, parent subject/object,
# prefix and term mappings, incomplete triples, language, default vocabulary
# and list mappings.
# @private
class EvaluationContext # :nodoc:
  ##
  # The base.
  #
  # This will usually be the URL of the document being processed,
  # but it could be some other URL, set by some other mechanism,
  # such as the (X)HTML base element. The important thing is that it establishes
  # a URL against which relative paths can be resolved.
  #
  # @attr [RDF::URI]
  attr_accessor :base

  ##
  # The parent subject.
  #
  # The initial value will be the same as the initial value of base,
  # but it will usually change during the course of processing.
  #
  # @attr [RDF::URI]
  attr_accessor :parent_subject

  ##
  # The parent object.
  #
  # In some situations the object of a statement becomes the subject of any nested statements,
  # and this property is used to convey this value.
  # Note that this value may be a bnode, since in some situations a number of nested statements
  # are grouped together on one bnode.
  # This means that the bnode must be set in the containing statement and passed down,
  # and this property is used to convey this value.
  #
  # @attr [RDF::URI]
  attr_accessor :parent_object

  ##
  # A list of current, in-scope URI mappings.
  #
  # @attr [Hash{Symbol => String}]
  attr_accessor :uri_mappings

  ##
  # A list of current, in-scope Namespaces. This is the subset of uri_mappings
  # which are defined using xmlns.
  #
  # @attr [Hash{String => Namespace}]
  attr_accessor :namespaces

  ##
  # A list of incomplete triples.
  #
  # A triple can be incomplete when no object resource
  # is provided alongside a predicate that requires a resource (i.e., @rel or @rev).
  # The triples can be completed when a resource becomes available,
  # which will be when the next subject is specified (part of the process called chaining).
  #
  # @attr [Array]
  attr_accessor :incomplete_triples

  ##
  # The language. Note that there is no default language.
  #
  # @attr [Symbol]
  attr_accessor :language

  ##
  # The term mappings, a list of terms and their associated URIs.
  #
  # This specification does not define an initial list.
  # Host Languages may define an initial list.
  # If a Host Language provides an initial list, it should do so via an RDFa Context document.
  #
  # @attr [Hash{Symbol => RDF::URI}]
  attr_accessor :term_mappings

  ##
  # The default vocabulary
  #
  # A value to use as the prefix URI when a term is used.
  # This specification does not define an initial setting for the default vocabulary.
  # Host Languages may define an initial setting.
  #
  # @attr [RDF::URI]
  attr_accessor :default_vocabulary

  ##
  # lists
  #
  # A hash associating lists with properties.
  # @attr [Hash{RDF::URI => Array}]
  attr_accessor :list_mapping

  ##
  # Builds the initial evaluation context, [5.1].
  #
  # @param [RDF::URI] base
  #   also used as the initial parent subject
  # @param [Hash] host_defaults
  # @option host_defaults [Hash] :uri_mappings ({}) initial prefix mappings
  # @option host_defaults [Hash{String => RDF::URI}] :term_mappings ({}) Hash of NCName => URI
  # @option host_defaults [RDF::URI] :vocabulary (nil) initial default vocabulary
  def initialize(base, host_defaults)
    @base = base
    @parent_subject = @base
    @parent_object = nil
    @namespaces = {}
    @incomplete_triples = []
    @language = nil
    # The mapping hashes are taken from host_defaults as-is, not copied.
    @uri_mappings = host_defaults.fetch(:uri_mappings, {})
    @term_mappings = host_defaults.fetch(:term_mappings, {})
    @default_vocabulary = host_defaults.fetch(:vocabulary, nil)
    # No list mapping exists until one is assigned through the accessor.
    @list_mapping = nil
  end

  # Copy this Evaluation Context
  #
  # Gives the copy its own uri_mappings, incomplete_triples and namespaces
  # collections (shallow clones); list_mapping is deliberately the same
  # object as in +from+.
  #
  # @param [EvaluationContext] from
  def initialize_copy(from)
    @uri_mappings = from.uri_mappings.clone
    @incomplete_triples = from.incomplete_triples.clone
    @namespaces = from.namespaces.clone
    @list_mapping = from.list_mapping # Don't clone
  end

  # Summarises the context: scalar attributes are shown in full, the
  # collections only by size.
  #
  # @return [String]
  def inspect
    v = %w(base parent_subject parent_object language default_vocabulary).map do |a|
      "#{a}=#{public_send(a).inspect}"
    end
    v << "uri_mappings[#{uri_mappings.keys.length}]"
    v << "incomplete_triples[#{incomplete_triples.length}]"
    v << "term_mappings[#{term_mappings.keys.length}]"
    v << "lists[#{list_mapping.keys.length}]" if list_mapping
    v.join(", ")
  end
end

# Returns the XML implementation module for this reader instance.
#
# @attr_reader [Module]
attr_reader :implementation

##
# Initializes the RDFa reader instance.
#
# @param  [IO, File, String] input
#   the input stream to read
# @param  [Hash{Symbol => Object}] options
#   any additional options (see `RDF::Reader#initialize`)
# @option options [Symbol] :library
#   One of :nokogiri or :rexml. If nil/unspecified uses :nokogiri if available, :rexml otherwise.
# @option options [Boolean]  :vocab_expansion (false)
#   whether to perform RDFS expansion on the resulting graph
# @option options [:xml, :xhtml1, :xhtml5, :html4, :html5, :svg] :host_language (:html5)
#   Host Language
# @option options [:"rdfa1.0", :"rdfa1.1"] :version (:"rdfa1.1")
#   Parser version information
# @option options [Proc] :processor_callback (nil)
#   Callback used to provide processor graph triples.
# @option options [Array] :rdfagraph ([:output])
#   Used to indicate if either or both of the :output or :processor graphs are output.
#   Value is an array containing one or both of :output or :processor.
# @option options [Repository] :vocab_repository (nil)
#   Repository to save loaded vocabularies.
# @option options [Array] :debug
#   Array to place debug messages
# @return [reader]
# @yield  [reader] `self`
# @yieldparam  [RDF::Reader] reader
# @yieldreturn [void] ignored
# @raise [ArgumentError] if :library is neither nil, :nokogiri nor :rexml
# @raise [Error]:: Raises RDF::ReaderError if _validate_
def initialize(input = $stdin, options = {}, &block)
  super do
    @debug = options[:debug]

    # Normalise :rdfagraph to an array of symbols, keeping only the two
    # recognised graph names and defaulting to the output graph.
    requested_graphs =
      case @options[:rdfagraph]
      when String, Symbol
        @options[:rdfagraph].to_s.split(',').map { |name| name.strip.to_sym }
      when Array
        @options[:rdfagraph].map { |name| name.to_s.to_sym }
      else
        []
      end
    @options[:rdfagraph] = requested_graphs.select { |name| [:output, :processor].include?(name) }
    @options[:rdfagraph] << :output if @options[:rdfagraph].empty?

    # Pick the XML library: an explicit choice wins, otherwise
    # use Nokogiri when available, and REXML otherwise.
    requested_library = options[:library]
    @library =
      if requested_library.nil?
        (defined?(::Nokogiri) && RUBY_PLATFORM != 'java') ? :nokogiri : :rexml
      elsif [:nokogiri, :rexml].include?(requested_library)
        requested_library
      else
        raise ArgumentError, "expected :rexml or :nokogiri, but got #{options[:library].inspect}"
      end

    # Load the matching implementation module and mix it into this instance.
    require "rdf/rdfa/reader/#{@library}"
    @implementation =
      if @library == :nokogiri
        Nokogiri
      elsif @library == :rexml
        REXML
      end
    extend(@implementation)

    detect_host_language_version(input, options)

    add_info(@doc, "version = #{@version}, host_language = #{@host_language}, library = #{@library}, rdfagraph = #{@options[:rdfagraph].inspect}, expand = #{@options[:vocab_expansion]}")

    # Parse failures are reported through add_error rather than propagated here.
    begin
      initialize_xml(input, options)
    rescue => error
      add_error(nil, "Malformed document: #{error.message}")
    end
    add_error(nil, "Empty document") if root.nil?
    add_error(nil, "Syntax errors:\n#{doc_errors}") unless doc_errors.empty?

    # Section 4.2 RDFa Host Language Conformance
    #
    # The Host Language may require the automatic inclusion of one or more Initial Contexts
    @host_defaults = {
      :vocabulary       => nil,
      :uri_mappings     => {},
      :initial_contexts => [],
    }

    if @version == :"rdfa1.0"
      # Add default term mappings: each listed term maps to RDF::XHV[term]
      default_terms = %w(
        alternate appendix bookmark cite chapter contents copyright first glossary help icon index
        last license meta next p3pv1 prev role section stylesheet subsection start top up
      )
      @host_defaults[:term_mappings] = default_terms.each_with_object({}) do |term, mappings|
        mappings[term] = RDF::XHV[term]
      end
    end

    # Initial contexts depend on the host language; an unrecognised host
    # language keeps the empty list set above.
    if [:xml, :svg].include?(@host_language)
      @host_defaults[:initial_contexts] = [XML_RDFA_CONTEXT]
    elsif @host_language == :xhtml1
      @host_defaults[:initial_contexts] = [XML_RDFA_CONTEXT, XHTML_RDFA_CONTEXT]
    elsif [:xhtml5, :html4, :html5].include?(@host_language)
      @host_defaults[:initial_contexts] = [XML_RDFA_CONTEXT, HTML_RDFA_CONTEXT]
    end

    block.call(self) if block_given?
  end
end

##
# Iterates the given block for each RDF statement in the input.
#
# Reads to graph and performs expansion if required.
# # @yield [statement] # @yieldparam [RDF::Statement] statement # @return [void] def each_statement(&block) if @options[:vocab_expansion] @options[:vocab_expansion] = false expand.each_statement(&block) @options[:vocab_expansion] = true else @callback = block # Process any saved callbacks (processor graph issues) @saved_callbacks.each {|s| @callback.call(s) } if @saved_callbacks # Add prefix definitions from host defaults @host_defaults[:uri_mappings].each_pair do |prefix, value| prefix(prefix, value) end # parse return unless @root parse_whole_document(@doc, RDF::URI(base_uri)) def extract_script(el, input, type, options, &block) add_debug(el, "script element of type #{type}") begin # Formats don't exist unless they've been required case type when 'application/rdf+xml' then require 'rdf/rdfxml' when 'text/ntriples' then require 'rdf/ntriples' when 'text/turtle' then require 'text/turtle' end rescue end if reader = RDF::Reader.for(:content_type => type) add_debug(el, "=> reader #{reader.to_sym}") reader.new(input, options).each(&block) end end # Look for Embedded Turtle and RDF/XML unless @root.xpath("//rdf:RDF", "xmlns:rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#").empty? extract_script(@root, @doc, "application/rdf+xml", @options) do |statement| block.call(statement) end end # # Look for Embedded RDF/XML # unless @root.xpath("//rdf:RDF", "xmlns:rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#").empty? # extract_script(@root, @doc, "application/rdf+xml", @options) do |statement| # block.call(statement) # end # end # Look for Embedded scripts @root.css("script[type]") do |el| ctx = RDF::URI(el.attribute("id")) if el.attribute("id") type = el.attribute("type") extract_script(el, el.inner_text, type, @options) do |statement| statement.context = ctx if ctx block.call(statement) end end # Just incase root is a