lib/rdf/rdfa/reader.rb in rdf-rdfa-0.0.2 vs lib/rdf/rdfa/reader.rb in rdf-rdfa-0.0.3

- old
+ new

@@ -1,18 +1,15 @@ require 'nokogiri' # FIXME: Implement using different modules as in RDF::TriX -require 'rdf' -require 'rdf/rdfa/vocab' module RDF::RDFa ## # An RDFa parser in Ruby # # @author [Gregg Kellogg](http://kellogg-assoc.com/) class Reader < RDF::Reader format Format - autoload :VERSION, 'rdf/rdfa/version' - + NC_REGEXP = Regexp.new( %{^ (?!\\\\u0301) # &#x301; is a non-spacing acute accent. # It is legal within an XML Name, but not as the first character. ( [a-zA-Z_] @@ -24,16 +21,10 @@ $}, Regexp::EXTENDED) XML_LITERAL = RDF['XMLLiteral'] - attr_reader :debug - - ## - # @return [RDF::Graph] - attr_reader :graph - # Host language, One of: # :xhtml_rdfa_1_0 # :xhtml_rdfa_1_1 attr_reader :host_language @@ -87,13 +78,13 @@ @default_voabulary = host_defaults.fetch(:voabulary, nil) end # Copy this Evaluation Context def initialize_copy(from) - # clone the evaluation context correctly - @uri_mappings = from.uri_mappings.clone - @incomplete_triples = from.incomplete_triples.clone + # clone the evaluation context correctly + @uri_mappings = from.uri_mappings.clone + @incomplete_triples = from.incomplete_triples.clone end def inspect v = %w(base parent_subject parent_object language default_vocabulary).map {|a| "#{a}='#{self.send(a).nil? ? '<nil>' : self.send(a)}'"} v << "uri_mappings[#{uri_mappings.keys.length}]" @@ -101,81 +92,38 @@ v << "term_mappings[#{term_mappings.keys.length}]" v.join(",") end end - # Parse XHTML+RDFa document from a string or input stream to closure or graph. - # - # If the parser is called with a block, triples are passed to the block rather - # than added to the graph. - # - # Optionally, the stream may be a Nokogiri::HTML::Document or Nokogiri::XML::Document - # With a block, yeilds each statement with URI, BNode or Literal elements - # - # @param [IO] stream:: the HTML+RDFa IO stream, string, Nokogiri::HTML::Document or Nokogiri::XML::Document - # @param [String] uri:: the URI of the document - # @param [Hash] options:: Parser options, one of - # <em>options[:debug]</em>:: Array to place debug messages - # <em>options[:strict]</em>:: Raise Error if true, continue with lax parsing, otherwise - # @return [Graph]:: Returns the graph containing parsed triples - # @raise [Error]:: Raises RdfError if _strict_ - ## # Initializes the RDFa reader instance. # - # @param [IO, File, String] input - # @param [Hash{Symbol => Object}] options + # @param [IO, File, String]:: input + # @param [Hash{Symbol => Object}]:: options + # <em>options[:debug]</em>:: Array to place debug messages + # <em>options[:strict]</em>:: Raise Error if true, continue with lax parsing, otherwise + # <em>options[:base_uri]</em>:: Base URI to use for relative URIs. # @yield [reader] # @yieldparam [Reader] reader + # @raise [RDF::ReaderError]:: Raises RDF::ReaderError if _strict_ def initialize(input = $stdin, options = {}, &block) - super - - @graph = RDF::Graph.new + super do @debug = options[:debug] @strict = options[:strict] - @base_uri = options[:base_uri] - @base_uri = RDF::URI.parse(@base_uri) if @base_uri.is_a?(String) - @named_bnodes = {} + @base_uri = RDF::URI.new(options[:base_uri]) @@vocabulary_cache ||= {} @doc = case input when Nokogiri::HTML::Document then input when Nokogiri::XML::Document then input else Nokogiri::XML.parse(input, @base_uri.to_s) end - raise ParserException, "Empty document" if @doc.nil? && @strict - @callback = block - - # Determine host language - # XXX - right now only XHTML defined - @host_language = case @doc.root.attributes["version"].to_s - when /XHTML+RDFa/ then :xhtml - end - - # If none found, assume xhtml - @host_language ||= :xhtml - - @host_defaults = {} - @host_defaults = case @host_language - when :xhtml - { - :vocabulary => RDF::XHV["uri"], - :prefix => "xhv", - :term_mappings => %w( - alternate appendix bookmark cite chapter contents copyright first glossary help icon index - last license meta next p3pv1 prev role section stylesheet subsection start top up - ).inject({}) { |hash, term| hash[term] = RDF::XHV[term]; hash }, - } - else - {} - end - - # parse - parse_whole_document(@doc, @base_uri) - + raise RDF::ReaderError, "Synax errors:\n#{@doc.errors}" if !@doc.errors.empty? && @strict + raise RDF::ReaderError, "Empty document" if (@doc.nil? || @doc.root.nil?) && @strict block.call(self) if block_given? + end end # XXX Invoke the parser, and allow add_triple to make the callback? ## @@ -183,11 +131,38 @@ # # @yield [statement] # @yieldparam [RDF::Statement] statement # @return [void] def each_statement(&block) - @graph.each_statement(&block) + @callback = block + + # Determine host language + # XXX - right now only XHTML defined + @host_language = case @doc.root.attributes["version"].to_s + when /XHTML+RDFa/ then :xhtml + end + + # If none found, assume xhtml + @host_language ||= :xhtml + + @host_defaults = {} + @host_defaults = case @host_language + when :xhtml + { + :vocabulary => RDF::XHV.to_s, + :prefix => "xhv", + :term_mappings => %w( + alternate appendix bookmark cite chapter contents copyright first glossary help icon index + last license meta next p3pv1 prev role section stylesheet subsection start top up + ).inject({}) { |hash, term| hash[term] = RDF::XHV[term]; hash }, + } + else + {} + end + + # parse + parse_whole_document(@doc, @base_uri) end ## # Iterates the given block for each RDF triple in the input. # @@ -195,11 +170,13 @@ # @yieldparam [RDF::Resource] subject # @yieldparam [RDF::URI] predicate # @yieldparam [RDF::Value] object # @return [void] def each_triple(&block) - @graph.each_triple(&block) + each_statement do |statement| + block.call(*statement.to_triple) + end end private # Figure out the document path, if it is a Nokogiri::XML::Element or Attribute @@ -225,21 +202,15 @@ # @param [Nokogiri::XML::Node, any] node:: XML Node or string for showing context # @param [URI, BNode] subject:: the subject of the statement # @param [URI] predicate:: the predicate of the statement # @param [URI, BNode, Literal] object:: the object of the statement # @return [Statement]:: Added statement - # @raise [Exception]:: Checks parameter types and raises if they are incorrect if parsing mode is _strict_. + # @raise [ReaderError]:: Checks parameter types and raises if they are incorrect if parsing mode is _strict_. def add_triple(node, subject, predicate, object) statement = RDF::Statement.new(subject, predicate, object) add_debug(node, "statement: #{statement}") - @graph << statement - statement - # FIXME: rescue RdfException => e - rescue Exception => e - add_debug(node, "add_triple raised #{e.class}: #{e.message}") - puts e.backtrace if $DEBUG - raise if @strict + @callback.call(statement) end # Parsing an RDFa document (this is *not* the recursive method) def parse_whole_document(doc, base) @@ -281,61 +252,44 @@ :term_mappings => {} } um = @@vocabulary_cache[profile][:uri_mappings] tm = @@vocabulary_cache[profile][:term_mappings] add_debug(element, "extract_mappings: profile open <#{profile}>") - require 'patron' unless defined?(Patron) - sess = Patron::Session.new - sess.timeout = 10 - resp = sess.get(profile) - raise RuntimeError, "HTTP returned status #{resp.status} when reading #{profile}" if resp.status >= 400 - - # Parse profile, and extract mappings from graph + old_debug, old_verbose, = $DEBUG, $verbose $DEBUG, $verbose = false, false - p_graph = Parser.parse(resp.body, profile) - ttl = p_graph.serialize(:format => :ttl) if @debug || $DEBUG + # FIXME: format shouldn't need to be specified here + p_graph = RDF::Graph.load(profile, :base_uri => profile, :format => :rdfa) $DEBUG, $verbose = old_debug, old_verbose - add_debug(element, ttl) if ttl - p_graph.subjects.each do |subject| - props = p_graph.properties(subject) - #puts props.inspect - - # If one of the objects is not a Literal or if there are additional rdfa:uri or rdfa:term - # predicates sharing the same subject, no mapping is created. - uri = props[RDF::RDFA["uri"].to_s] - term = props[RDF::RDFA["term"].to_s] - prefix = props[RDF::RDFA["prefix"].to_s] + p_graph.each_subject do |subject| + # If one of the objects is not a Literal no mapping is created. + uri = p_graph.first_object([subject, RDF::RDFA['uri'], nil]) + term = p_graph.first_object([subject, RDF::RDFA['term'], nil]) + prefix = p_graph.first_object([subject, RDF::RDFA['prefix'], nil]) add_debug(element, "extract_mappings: uri=#{uri.inspect}, term=#{term.inspect}, prefix=#{prefix.inspect}") next if !uri || (!term && !prefix) - raise ParserException, "multi-valued rdf:uri" if uri.length != 1 - raise ParserException, "multi-valued rdf:term." if term && term.length != 1 - raise ParserException, "multi-valued rdf:prefix" if prefix && prefix.length != 1 + raise RDF::ReaderError, "rdf:uri must be a Literal" unless uri.is_a?(RDF::Literal) + raise RDF::ReaderError, "rdf:term must be a Literal" unless term.nil? || term.is_a?(RDF::Literal) + raise RDF::ReaderError, "rdf:prefix must be a Literal" unless prefix.nil? || prefix.is_a?(RDF::Literal) - uri = uri.first - term = term.first if term - prefix = prefix.first if prefix - raise ParserException, "rdf:uri must be a Literal" unless uri.is_a?(Literal) - raise ParserException, "rdf:term must be a Literal" unless term.nil? || term.is_a?(Literal) - raise ParserException, "rdf:prefix must be a Literal" unless prefix.nil? || prefix.is_a?(Literal) - # For every extracted triple that is the common subject of an rdfa:prefix and an rdfa:uri # predicate, create a mapping from the object literal of the rdfa:prefix predicate to the # object literal of the rdfa:uri predicate. Add or update this mapping in the local list of # URI mappings after transforming the 'prefix' component to lower-case. # For every extracted - um[prefix.to_s.downcase] = RDF::URI.new(uri) if prefix + um[prefix.value.downcase] = uri.value if prefix # triple that is the common subject of an rdfa:term and an rdfa:uri predicate, create a # mapping from the object literal of the rdfa:term predicate to the object literal of the # rdfa:uri predicate. Add or update this mapping in the local term mappings. - tm[term.to_s] = RDF::URI.new(uri) if term + tm[term.value] = RDF::URI.new(uri.value) if term end - rescue ParserException - add_debug(element, "extract_mappings: profile subject #{subject.to_s}: #{e.message}") - raise if @strict + # FIXME: subject isn't in scope here + #rescue RDF::ReaderError + # add_debug(element, "extract_mappings: profile subject #{subject.to_s}: #{e.message}") + # raise if @strict rescue RuntimeError => e add_debug(element, "extract_mappings: profile: #{e.message}") raise if @strict end end @@ -351,13 +305,12 @@ # and the URI is not processed in any way; in particular if it is a relative path it is # not resolved against the current base. element.namespaces.each do |attr_name, attr_value| begin abbr, prefix = attr_name.split(":") - uri_mappings[prefix.to_s.downcase] = RDF::URI.new(attr_value) if abbr.downcase == "xmlns" && prefix - # FIXME: rescue RdfException => e - rescue Exception => e + uri_mappings[prefix.to_s.downcase] = attr_value.to_s if abbr.downcase == "xmlns" && prefix + rescue ReaderError => e add_debug(element, "extract_mappings raised #{e.class}: #{e.message}") raise if @strict end end @@ -370,22 +323,22 @@ prefix, uri = mappings.shift.downcase, mappings.shift #puts "uri_mappings prefix #{prefix} <#{uri}>" next unless prefix.match(/:$/) prefix.chop! - uri_mappings[prefix] = RDF::URI.new(uri) + uri_mappings[prefix] = uri end - add_debug(element, "uri_mappings: #{uri_mappings.values.map{|ns|ns.to_s}.join(", ")}") - add_debug(element, "term_mappings: #{term_mappings.keys.join(", ")}") + add_debug(element, "uri_mappings: #{uri_mappings.map{|k,v|"#{k}='#{v}'"}.join(", ")}") + add_debug(element, "term_mappings: #{term_mappings.map{|k,v|"#{k}='#{v}'"}.join(", ")}") end # The recursive helper function def traverse(element, evaluation_context) if element.nil? add_debug(element, "traverse nil element") - raise ParserException, "Can't parse nil element" if @strict + raise RDF::ReaderError, "Can't parse nil element" if @strict return nil end add_debug(element, "traverse, ec: #{evaluation_context.inspect}") @@ -426,11 +379,11 @@ unless vocab.nil? default_vocabulary = if vocab.to_s.empty? # Set default_vocabulary to host language default @host_defaults.fetch(:voabulary, nil) else - RDF::URI.new(vocab) + vocab.to_s end add_debug(element, "[Step 2] traverse, default_vocaulary: #{default_vocabulary.inspect}") end # Local term mappings [7.5 Steps 3 & 4] @@ -536,11 +489,11 @@ if new_subject and typeof # Typeof is TERMorCURIEorURIs types = process_uris(element, typeof, evaluation_context, :uri_mappings => uri_mappings, :term_mappings => term_mappings, :vocab => default_vocabulary) add_debug(element, "typeof: #{typeof}") types.each do |one_type| - add_triple(element, new_subject, RDF_TYPE, one_type) + add_triple(element, new_subject, RDF.type, one_type) end end # Generate triples with given object [Step 9] if current_object_resource @@ -679,11 +632,11 @@ # If it is a valid CURIE, the resulting URI is used; otherwise, the value will be processed as a URI. uri = curie_to_resource_or_bnode(element, value, options[:uri_mappings], evaluation_context.parent_subject) if uri add_debug(element, "process_uri: #{value} => CURIE => <#{uri}>") else - #FIXME: uri = URIRef.new(value, evaluation_context.base) + ## FIXME: throw exception if there is no base uri set? uri = RDF::URI.new(evaluation_context.base + value) add_debug(element, "process_uri: #{value} => URI => <#{uri}>") end uri end @@ -701,11 +654,11 @@ # If the term is in the local term mappings, use the associated URI. # XXX Spec Confusion: are terms always downcased? Or only for XHTML Vocab? options[:term_mappings][value.to_s.downcase] when options[:vocab] # Otherwise, if there is a local default vocabulary the URI is obtained by concatenating that value and the term. - options[:vocab].join(value) + RDF::URI.new(options[:vocab] + value) else # Finally, if there is no local default vocabulary, the term has no associated URI and must be ignored. nil end end @@ -715,27 +668,25 @@ # URI mappings for CURIEs default to XHV, rather than the default doc namespace prefix, reference = curie.to_s.split(":") # consider the bnode situation if prefix == "_" - # we force a non-nil name, otherwise it generates a new name - # FIXME: BNode.new(reference || "", @named_bnodes) - RDF::Node.new(reference || nil) + RDF::Node.new(reference) elsif curie.to_s.match(/^:/) # Default prefix if uri_mappings[""] - uri_mappings[""].join(reference) + RDF::URI.new(uri_mappings[""] + reference) elsif @host_defaults[:prefix] - @host_defaults[:prefix].join(reference) + RDF::URI.new(@host_defaults[:prefix] + reference) end elsif !curie.to_s.match(/:/) # No prefix, undefined (in this context, it is evaluated as a term elsewhere) nil else # Prefixes always downcased ns = uri_mappings[prefix.to_s.downcase] if ns - ns.join(reference) + RDF::URI.new(ns +reference) else add_debug(element, "curie_to_resource_or_bnode No namespace mapping for #{prefix.downcase}") nil end end \ No newline at end of file