lib/rdf/microdata/reader.rb in rdf-microdata-0.2.2 vs lib/rdf/microdata/reader.rb in rdf-microdata-0.2.3

- old
+ new

@@ -1,26 +1,35 @@ -require 'nokogiri' # FIXME: Implement using different modules as in RDF::TriX +begin + raise LoadError, "not with java" if RUBY_PLATFORM == "java" + require 'nokogiri' +rescue LoadError => e + :rexml +end +require 'rdf/xsd' +require 'json' module RDF::Microdata ## # An Microdata parser in Ruby # # Based on processing rules, amended with the following: - # * property generation from tokens now uses the associated @itemtype as the basis for generation - # * implicit triples are not generated, only those with @item* - # * @datetime values are scanned lexically to find appropriate datatype # - # @see http://dev.w3.org/html5/md/ + # @see https://dvcs.w3.org/hg/htmldata/raw-file/0d6b89f5befb/microdata-rdf/index.html # @author [Gregg Kellogg](http://kellogg-assoc.com/) class Reader < RDF::Reader format Format - XHTML = "http://www.w3.org/1999/xhtml" URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video) + DEFAULT_REGISTRY = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "registry.json")) class CrawlFailure < StandardError #:nodoc: end + # Returns the HTML implementation module for this reader instance. + # + # @attr_reader [Module] + attr_reader :implementation + ## # Returns the base URI determined by this reader. # # @example # reader.prefixes[:dc] #=> RDF::URI('http://purl.org/dc/terms/') @@ -29,27 +38,148 @@ # @since 0.3.0 def base_uri @options[:base_uri] end + # Interface to registry + class Registry + ## + # Initialize the registry from a URI or file path + # + # @param [Hash] json + def self.load_registry(json) + @prefixes = {} + json.each do |prefix, elements| + propertyURI = elements.fetch("propertyURI", "vocabulary").to_sym + multipleValues = elements.fetch("multipleValues", "unordered").to_sym + properties = elements.fetch("properties", {}) + @prefixes[prefix] = Registry.new(prefix, propertyURI, multipleValues, properties) + end + end + + ## + # True if registry has already been loaded + def self.loaded? + @prefixes.is_a?(Hash) + end + + ## + # Initialize registry for a particular prefix URI + # + # @param [RDF::URI] prefixURI + # @param [#to_sym] propertyURI (:vocabulary) + # @param [#to_sym] multipleValues (:unordered) + # @param [Hash] properties ({}) + def initialize(prefixURI, propertyURI = :vocabulary, multipleValues = :unordered, properties = {}) + @scheme = propertyURI.to_sym + @multipleValues = multipleValues.to_sym + @properties = properties + if @scheme == :vocabulary + @property_base = prefixURI.to_s + @property_base += '#' unless %w(/ #).include?(@property_base[-1]) # Append a '#' for fragment if necessary + else + @property_base = 'http://www.w3.org/ns/md?type=' + end + end + + ## + # Find a registry entry given a type URI + # + # @param [RDF::URI] type + # @return [Registry] + def self.find(type) + @prefixes.select do |key, value| + type.to_s.index(key) == 0 + end.values.first + end + + ## + # Generate a predicateURI given a `name` + # + # @param [#to_s] name + # @param [Hash{}] ec Evaluation Context + # @return [RDF::URI] + def predicateURI(name, ec) + u = RDF::URI(name) + return u if u.absolute? + + n = frag_escape(name) + if ec[:current_type].nil? + u = RDF::URI(ec[:document_base].to_s) + u.fragment = frag_escape(name) + u + elsif @scheme == :vocabulary + # If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name + # to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends + # with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/). + RDF::URI(@property_base + n) + else # @scheme == :contextual + if ec[:current_type].to_s.index(@property_base) == 0 + # return the concatenation of s, a U+002E FULL STOP character (.) and the fragment-escaped value of name. + RDF::URI(@property_base + '.' + n) + else + # return the concatenation of http://www.w3.org/ns/md?type=, the fragment-escaped value of s, + # the string &prop=, and the fragment-escaped value of name + RDF::URI(@property_base + frag_escape(ec[:current_type]) + '?prop=' + n) + end + end + end + + + ## + # Turn a predicateURI into a simple token + # @param [RDF::URI] predicateURI + # @return [String] + def tokenize(predicateURI) + case @scheme + when :vocabulary + predicateURI.to_s.sub(@property_base, '') + when :contextual + predicateURI.to_s.split('?prop=').last.split('.').last + end + end + + ## + # Determine if property should be serialized as a list or not + # @param [RDF::URI] predicateURI + # @return [Boolean] + def as_list(predicateURI) + tok = tokenize(predicateURI) + if @properties[tok].is_a?(Hash) + @properties[tok]["multipleValues"].to_sym == :list + else + @multipleValues == :list + end + end + + ## + # Fragment escape a name + def frag_escape(name) + name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase} + end + end + ## # Initializes the Microdata reader instance. # # @param [Nokogiri::HTML::Document, Nokogiri::XML::Document, IO, File, String] input # the input stream to read # @param [Hash{Symbol => Object}] options # any additional options + # @option options [Symbol] :library (:nokogiri) + # One of :nokogiri or :rexml. If nil/unspecified uses :nokogiri if available, :rexml otherwise. # @option options [Encoding] :encoding (Encoding::UTF_8) # the encoding of the input stream (Ruby 1.9+) # @option options [Boolean] :validate (false) # whether to validate the parsed statements and values # @option options [Boolean] :canonicalize (false) # whether to canonicalize parsed literals # @option options [Boolean] :intern (true) # whether to intern all parsed URIs # @option options [#to_s] :base_uri (nil) # the base URI to use when resolving relative URIs + # @option options [#to_s] :registry_uri (DEFAULT_REGISTRY) # @option options [Array] :debug # Array to place debug messages # @return [reader] # @yield [reader] `self` # @yieldparam [RDF::Reader] reader @@ -57,28 +187,47 @@ # @raise [Error]:: Raises RDF::ReaderError if _validate_ def initialize(input = $stdin, options = {}, &block) super do @debug = options[:debug] - @doc = case input - when Nokogiri::HTML::Document, Nokogiri::XML::Document - input - else - # Try to detect charset from input - options[:encoding] ||= input.charset if input.respond_to?(:charset) - - # Otherwise, default is utf-8 - options[:encoding] ||= 'utf-8' + @library = case options[:library] + when nil + (defined?(::Nokogiri) && RUBY_PLATFORM != 'java') ? :nokogiri : :rexml + when :nokogiri, :rexml + options[:library] + else + raise ArgumentError.new("expected :rexml or :nokogiri, but got #{options[:library].inspect}") + end - add_debug(nil, "base_uri: #{base_uri}") - Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding]) + require "rdf/microdata/reader/#{@library}" + @implementation = case @library + when :nokogiri then Nokogiri + when :rexml then REXML end - - errors = @doc.errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/} + self.extend(@implementation) + + initialize_html(input, options) rescue raise RDF::ReaderError.new($!.message) + + if (root.nil? && validate?) + raise RDF::ReaderError, "Empty Document" + end + errors = doc_errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/} raise RDF::ReaderError, "Syntax errors:\n#{errors}" if !errors.empty? && validate? - raise RDF::ReaderError, "Empty document" if (@doc.nil? || @doc.root.nil?) && validate? + add_debug(@doc, "library = #{@library}") + + # Load registry + unless Registry.loaded? + registry = options[:registry_uri] || DEFAULT_REGISTRY + begin + json = RDF::Util::File.open_file(registry) { |f| JSON.load(f) } + rescue JSON::ParserError => e + raise RDF::ReaderError, "Failed to parse registry: #{e.message}" + end + Registry.load_registry(json) + end + if block_given? case block.arity when 0 then instance_eval(&block) else block.call(self) end @@ -119,23 +268,23 @@ def bnode(value = nil) @bnode_cache ||= {} @bnode_cache[value.to_s] ||= RDF::Node.new(value) end - # Figure out the document path, if it is a Nokogiri::XML::Element or Attribute + # Figure out the document path, if it is an Element or Attribute def node_path(node) - "<#{base_uri}>" + case node - when Nokogiri::XML::Node then node.display_path - else node.to_s - end + "<#{base_uri}>#{node.respond_to?(:display_path) ? node.display_path : node}" end # Add debug event to debug array, if specified # - # @param [XML Node, any] node:: XML Node or string for showing context + # @param [Nokogiri::XML::Node, #to_s] node:: XML Node or string for showing context # @param [String] message:: - def add_debug(node, message) + # @yieldreturn [String] appended to message, to allow for lazy-evaulation of message + def add_debug(node, message = "") + return unless ::RDF::Microdata.debug? || @debug + message = message + yield if block_given? puts "#{node_path(node)}: #{message}" if ::RDF::Microdata::debug? @debug << "#{node_path(node)}: #{message}" if @debug.is_a?(Array) end def add_error(node, message) @@ -151,207 +300,175 @@ # @param [URI, BNode, Literal] object:: the object of the statement # @return [Statement]:: Added statement # @raise [ReaderError]:: Checks parameter types and raises if they are incorrect if parsing mode is _validate_. def add_triple(node, subject, predicate, object) statement = RDF::Statement.new(subject, predicate, object) - add_debug(node, "statement: #{RDF::NTriples.serialize(statement)}") + add_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}"} @callback.call(statement) end # Parsing a Microdata document (this is *not* the recursive method) def parse_whole_document(doc, base) - base_el = doc.at_css('html>head>base') - base = base_el.attribute('href').to_s.split('#').first if base_el - - add_debug(doc, "parse_whole_doc: options=#{@options.inspect}") - - if (base) + base = doc_base(base) + options[:base_uri] = if (base) # Strip any fragment from base base = base.to_s.split('#').first - base = options[:base_uri] = uri(base) - add_debug(base_el, "parse_whole_doc: base='#{base}'") + base = uri(base) else base = RDF::URI("") end - # 2. For each a, area, and link element in the Document, run these substeps: - # - # * If the element does not have a rel attribute, then skip this element. - # * If the element does not have an href attribute, then skip this element. - # * If resolving the element's href attribute relative to the element is not successful, - # then skip this element. - doc.css('a, area, link').each do |el| - rel, href = el.attribute('rel'), el.attribute('href') - next unless rel && href - href = uri(href, el.base || base) - add_debug(el, "a: rel=#{rel.inspect}, href=#{href}") + add_debug(nil) {"parse_whole_doc: base='#{base}'"} - # Otherwise, split the value of the element's rel attribute on spaces, obtaining list of tokens. - # Coalesce duplicate tokens in list of tokens. - tokens = rel.to_s.split(/\s+/).map do |tok| - # Convert each token in list of tokens that does not contain a U+003A COLON characters (:) - # to ASCII lowercase. - tok =~ /:/ ? tok : tok.downcase - end.uniq - - # If list of tokens contains both the tokens alternate and stylesheet, - # then remove them both and replace them with the single (uppercase) token - # ALTERNATE-STYLESHEET. - if tokens.include?('alternate') && tokens.include?('stylesheet') - tokens = tokens - %w(alternate stylesheet) - tokens << 'ALTERNATE-STYLESHEET' - end - - tokens.each do |tok| - tok_uri = RDF::URI(tok) - if tok !~ /:/ - # For each token token in list of tokens that contains no U+003A COLON characters (:), - # generate the following triple: - add_triple(el, base, RDF::XHV[tok.gsub('#', '%23')], href) - elsif tok_uri.absolute? - # For each token token in list of tokens that is an absolute URL, generate the following triple: - add_triple(el, base, tok_uri, href) - end - end + ec = { + :memory => {}, + :current_name => nil, + :current_type => nil, + :current_vocabulary => nil, + :document_base => base, + } + items = [] + # 1) For each element that is also a top-level item run the following algorithm: + # + # 1) Generate the triples for an item item, using the evaluation context. + # Let result be the (URI reference or blank node) subject returned. + # 2) Append result to item list. + getItems.each do |el| + result = generate_triples(el, ec) + items << result end + + # 2) Generate an RDF Collection list from + # the ordered list of values. Set value to the value returned from generate an RDF Collection. + value = generateRDFCollection(root, items) - # 3. For each meta element in the Document that has a name attribute and a content attribute, - doc.css('meta[name][content]').each do |el| - name, content = el.attribute('name'), el.attribute('content') - name = name.to_s - name_uri = uri(name, el.base || base) - add_debug(el, "meta: name=#{name.inspect}") - if name !~ /:/ - # If the value of the name attribute contains no U+003A COLON characters (:), - # generate the following triple: - add_triple(el, base, RDF::XHV[name.downcase.gsub('#', '%23')], RDF::Literal(content, :language => el.language)) - elsif name_uri.absolute? - # If the value of the name attribute contains no U+003A COLON characters (:), - # generate the following triple: - add_triple(el, base, name_uri, RDF::Literal(content, :language => el.language)) - end - end + # 3) Generate the following triple: + # subject Document base + # predicate http://www.w3.org/1999/xhtml/microdata#item + # object value + add_triple(doc, base, RDF::MD.item, value) if value - # 4. For each blockquote and q element in the Document that has a cite attribute that resolves - # successfully relative to the element, generate the following triple: - doc.css('blockquote[cite], q[cite]').each do |el| - object = uri(el.attribute('cite'), el.base || base) - add_debug(el, "blockquote: cite=#{object}") - add_triple(el, base, RDF::DC.source, object) - end - - # 5. Let memory be a mapping of items to subjects, initially empty. - # 6. For each element that is also a top-level microdata item, run the following steps: - # * Generate the triples for the item. Pass a reference to memory as the item/subject list. - # Let result be the subject returned. - # * Generate the following triple: - # subject the document's current address - # predicate http://www.w3.org/1999/xhtml/microdata#item - # object result - memory = {} - doc.css('[itemscope]'). - select {|el| !el.has_attribute?('itemprop')}. - each do |el| - object = generate_triples(el, memory) - add_triple(el, base, RDF::MD.item, object) - end - add_debug(doc, "parse_whole_doc: traversal complete") end ## # Generate triples for an item # @param [RDF::Resource] item - # @param [Hash{Nokogiri::XML::Element} => RDF::Resource] memory - # @param [Hash{Symbol => Object}] options - # @option options [RDF::Resource] :fallback_type - # @option options [RDF::Resource] :fallback_name + # @param [Hash{Symbol => Object}] ec + # @option ec [Hash{Nokogiri::XML::Element} => RDF::Resource] memory + # @option ec [RDF::Resource] :current_type # @return [RDF::Resource] - def generate_triples(item, memory, options = {}) - fallback_type = options[:fallback_type] - fallback_name = options[:fallback_name] - - # 1. If there is an entry for item in memory, then let subject be the subject of that entry. + def generate_triples(item, ec = {}) + memory = ec[:memory] + # 1) If there is an entry for item in memory, then let subject be the subject of that entry. # Otherwise, if item has a global identifier and that global identifier is an absolute URL, # let subject be that global identifier. Otherwise, let subject be a new blank node. - subject = if memory.include?(item) - memory[item][:subject] + subject = if memory.include?(item.node) + memory[item.node][:subject] elsif item.has_attribute?('itemid') - u = uri(item.attribute('itemid'), item.base || base_uri) + uri(item.attribute('itemid'), item.base || base_uri) end || RDF::Node.new - memory[item] ||= {} + memory[item.node] ||= {} - add_debug(item, "gentrips(2): subject=#{subject.inspect}") + add_debug(item) {"gentrips(2): subject=#{subject.inspect}, current_type: #{ec[:current_type]}"} - # 2. Add a mapping from item to subject in memory, if there isn't one already. - memory[item][:subject] ||= subject + # 2) Add a mapping from item to subject in memory, if there isn't one already. + memory[item.node][:subject] ||= subject - # 3. If item has an item type and that item type is an absolute URL, let type be that item type. - # Otherwise, let type be the empty string. - rdf_type = type = uri(item.attribute('itemtype')) - type = '' unless type.absolute? + # 3) For each type returned from element.itemType of the element defining the item. + type = nil + item.attribute('itemtype').to_s.split(' ').map{|n| uri(n)}.select(&:absolute?).each do |t| + # 3.1. If type is an absolute URL, generate the following triple: + type ||= t + add_triple(item, subject, RDF.type, t) + end - if type != '' - add_triple(item, subject, RDF.type, type) - # 4.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type. - type += '#' unless type.to_s.include?('#') - # 4.3. If type does not have a : after its #, append a : to type. - type += ':' unless type.to_s.match(/\#:/) - elsif fallback_type - add_debug(item, "gentrips(5.2): fallback_type=#{fallback_type}, fallback_name=#{fallback_name}") - rdf_type = type = fallback_type - # 5.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type. - type += '#' unless type.to_s.include?('#') - # 5.3. If type does not have a : after its #, append a : to type. - type += ':' unless type.to_s.match(/\#:/) - # 5.4. If the last character of type is not a :, %20 to type. - type += '%20' unless type.to_s[-1,1] == ':' - # 5.5. Append the fragment-escaped value of fallback name to type. - type += fallback_name.to_s.gsub('#', '%23') + # 5) If type is not an absolute URL, set it to current type from the Evaluation Context if not empty. + type ||= ec[:current_type] + add_debug(item) {"gentrips(5): type=#{type.inspect}"} + + # 6) If the registry contains a URI prefix that is a character for character match of type up to the length of the + # URI prefix, set vocab as that URI prefix + vocab = Registry.find(type) + + # 7) Otherwise, if type is not empty, construct vocab by removing everything following the last + # SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from type. + vocab ||= begin + type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') + add_debug(item) {"gentrips(7): typtype_vocab=#{type_vocab.inspect}"} + Registry.new(type_vocab) # if type end - add_debug(item, "gentrips(6): type=#{type.inspect}") - - # 6. For each element _element_ that has one or more property names and is one of the + # 8) Update evaluation context setting current vocabulary to vocab. + ec[:current_vocabulary] = vocab + + # 9) Set property list to an empty mapping between properties and one or more ordered values as established below. + property_list = {} + + # 10. For each element _element_ that has one or more property names and is one of the # properties of the item _item_, in the order those elements are given by the algorithm # that returns the properties of an item, run the following substep: props = item_properties(item) - - # 6.1. For each name name in element's property names, run the following substeps: + # 10.1. For each name name in element's property names, run the following substeps: props.each do |element| - element.attribute('itemprop').to_s.split(' ').each do |name| - add_debug(element, "gentrips(6.1): name=#{name.inspect}") - # If type is the empty string and name is not an absolute URL, then abort these substeps. - name_uri = RDF::URI(name) - next if type == '' && !name_uri.absolute? - + element.attribute('itemprop').to_s.split(' ').compact.each do |name| + add_debug(element) {"gentrips(10.1): name=#{name.inspect}, type=#{type}"} + # Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab. + ec_new = ec.merge({:current_type => type, :current_vocabulary => vocab}) + + predicate = vocab.predicateURI(name, ec_new) + ec_new[:current_name] = predicate + add_debug(element) {"gentrips(10.1.2): predicate=#{predicate}"} + + # 10.1.3) Let value be the property value of element. value = property_value(element) - add_debug(element, "gentrips(6.1.2) value=#{value.inspect}") + add_debug(element) {"gentrips(10.1.3) value=#{value.inspect}"} + # 10.1.4) If value is an item, then generate the triples for value using a copy of evaluation context with + # current type set to type. Replace value by the subject returned from those steps. if value.is_a?(Hash) - value = generate_triples(element, memory, :fallback_type => type, :fallback_name => name) + value = generate_triples(element, ec_new) + add_debug(element) {"gentrips(10.1.4): value=#{value.inspect}"} end - - add_debug(element, "gentrips(6.1.3): value=#{value.inspect}") - predicate = if name_uri.absolute? - name_uri - else - # Use the URI of the type to create URIs for @itemprop terms - add_debug(element, "gentrips: rdf_type=#{rdf_type}") - predicate = RDF::URI(rdf_type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1' + name)) - end - add_debug(element, "gentrips(6.1.5): predicate=#{predicate}") - - add_triple(element, subject, predicate, value) if predicate + property_list[predicate] ||= [] + property_list[predicate] << value end end + # 11) For each predicate in property list + property_list.each do |predicate, values| + generatePropertyValues(item, subject, predicate, values, ec) + end + subject end + def generatePropertyValues(element, subject, predicate, values, ec) + registry = ec[:current_vocabulary] + if registry.as_list(predicate) + value = generateRDFCollection(element, values) + add_triple(element, subject, predicate, value) + else + values.each {|v| add_triple(element, subject, predicate, v)} + end + end + ## + # Called when values has more than one entry + # @param [Nokogiri::HTML::Element] element + # @param [Array<RDF::Value>] values + # @return [RDF::Node] + def generateRDFCollection(element, values) + list = RDF::List.new(nil, nil, values) + list.each_statement do |st| + add_triple(element, st.subject, st.predicate, st.object) unless st.object == RDF.List + end + list.subject + end + + ## # To find the properties of an item defined by the element root, the user agent must try # to crawl the properties of the element root, with an empty list as the value of memory: # if this fails, then the properties of the item defined by the element root is an empty # list; otherwise, it is the returned list. # @@ -376,17 +493,18 @@ # @param [Nokogiri::XML::Element] root # @param [Array<Nokokogiri::XML::Element>] memory # @return [Array<Array<Nokogiri::XML::Element>, Integer>] # Resultant elements and error count def crawl_properties(root, memory) + # 1. If root is in memory, then the algorithm fails; abort these steps. raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root) # 2. Collect all the elements in the item root; let results be the resulting # list of elements, and errors be the resulting count of errors. results, errors = elements_in_item(root) - add_debug(root, "crawl_properties results=#{results.inspect}, errors=#{errors}") + add_debug(root) {"crawl_properties results=#{results.map {|e| node_path(e)}.inspect}, errors=#{errors}"} # 3. Remove any elements from results that do not have an itemprop attribute specified. results = results.select {|e| e.has_attribute?('itemprop')} # 4. Let new memory be a new list consisting of the old list memory with the addition of root. @@ -425,17 +543,17 @@ pending = root.elements # If root has an itemref attribute, split the value of that itemref attribute on spaces. # For each resulting token ID, root.attribute('itemref').to_s.split(' ').each do |id| - add_debug(root, "elements_in_item itemref id #{id}") + add_debug(root) {"elements_in_item itemref id #{id}"} # if there is an element in the home subtree of root with the ID ID, # then add the first such element to pending. - id_elem = @doc.at_css("##{id}") + id_elem = find_element_by_id(id) pending << id_elem if id_elem end - add_debug(root, "elements_in_item pending #{pending.inspect}") + add_debug(root) {"elements_in_item pending #{pending.inspect}"} # Loop: Remove an element from pending and let current be that element. while current = pending.shift if results.include?(current) # If current is already in results, increment errors. @@ -455,40 +573,45 @@ end ## # def property_value(element) - add_debug(element, "property_value(#{element.inspect}): base #{element.base.inspect}, base_uri: #{base_uri.inspect}") - case + base = element.base || base_uri + add_debug(element) {"property_value(#{element.name}): base #{base.inspect}"} + value = case when element.has_attribute?('itemscope') {} when element.name == 'meta' - element.attribute('content').to_s + RDF::Literal.new(element.attribute('content').to_s, :language => element.language) + when element.name == 'data' + RDF::Literal.new(element.attribute('value').to_s, :language => element.language) when %w(audio embed iframe img source track video).include?(element.name) - uri(element.attribute('src'), element.base || base_uri) + uri(element.attribute('src'), base) when %w(a area link).include?(element.name) - uri(element.attribute('href'), element.base || base_uri) + uri(element.attribute('href'), base) when %w(object).include?(element.name) - uri(element.attribute('data'), element.base || base_uri) - when %w(time).include?(element.name) && element.has_attribute?('datetime') + uri(element.attribute('data'), base) + when %w(time).include?(element.name) # Lexically scan value and assign appropriate type, otherwise, leave untyped - v = element.attribute('datetime').to_s - datatype = %w(Date Time DateTime).map {|t| RDF::Literal.const_get(t)}.detect do |dt| + v = (element.attribute('datetime') || element.text).to_s + datatype = %w(Date Time DateTime Duration).map {|t| RDF::Literal.const_get(t)}.detect do |dt| v.match(dt::GRAMMAR) end || RDF::Literal - datatype.new(v) + datatype.new(v, :language => element.language) else - RDF::Literal.new(element.text, :language => element.language) + RDF::Literal.new(element.inner_text, :language => element.language) end + add_debug(element) {" #{value.inspect}"} + value end # Fixme, what about xml:base relative to element? def uri(value, base = nil) value = if base base = uri(base) unless base.is_a?(RDF::URI) - base.join(value) + base.join(value.to_s) else - RDF::URI(value) + RDF::URI(value.to_s) end value.validate! if validate? value.canonicalize! if canonicalize? value = RDF::URI.intern(value) if intern? value \ No newline at end of file