reader.rb in rdf-microdata-0.2.3

- old
+ new

@@ -1,26 +1,35 @@
-require 'nokogiri'  # FIXME: Implement using different modules as in RDF::TriX
+begin
+  raise LoadError, "not with java" if RUBY_PLATFORM == "java"
+  require 'nokogiri'
+rescue LoadError => e
+  :rexml
+end
+require 'rdf/xsd'
+require 'json'
 
 module RDF::Microdata
   ##
   # A Microdata parser in Ruby
   #
   # Based on processing rules, amended with the following:
-  # * property generation from tokens now uses the associated @itemtype as the basis for generation
-  # * implicit triples are not generated, only those with @item*
-  # * @datetime values are scanned lexically to find appropriate datatype
   #
-  # @see http://dev.w3.org/html5/md/
+  # @see https://dvcs.w3.org/hg/htmldata/raw-file/0d6b89f5befb/microdata-rdf/index.html
   # @author [Gregg Kellogg](http://kellogg-assoc.com/)
   class Reader < RDF::Reader
     format Format
-    XHTML = "http://www.w3.org/1999/xhtml"
     URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video)
+    DEFAULT_REGISTRY = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "registry.json"))
     
     class CrawlFailure < StandardError  #:nodoc:
     end
 
+    # Returns the HTML implementation module for this reader instance.
+    #
+    # @attr_reader [Module]
+    attr_reader :implementation
+
     ##
     # Returns the base URI determined by this reader.
     #
     # @example
     #   reader.prefixes[:dc]  #=> RDF::URI('http://purl.org/dc/terms/')
@@ -29,27 +38,148 @@
     # @since  0.3.0
     def base_uri
       @options[:base_uri]
     end
 
+    # Interface to the vocabulary registry: maps URI prefixes to the rules used to build property URIs
+    class Registry
+      ##
+      # Populate the class-level prefix table from an already-parsed registry Hash
+      #
+      # @param [Hash] json prefix URI => settings Hash ("propertyURI", "multipleValues", "properties")
+      def self.load_registry(json)
+        @prefixes = {}
+        json.each do |prefix, elements|
+          propertyURI = elements.fetch("propertyURI", "vocabulary").to_sym
+          multipleValues = elements.fetch("multipleValues", "unordered").to_sym
+          properties = elements.fetch("properties", {})
+          @prefixes[prefix] = Registry.new(prefix, propertyURI, multipleValues, properties)
+        end
+      end
+      
+      ##
+      # True if registry has already been loaded (i.e. load_registry has set the prefix table)
+      def self.loaded?
+        @prefixes.is_a?(Hash)
+      end
+
+      ##
+      # Initialize registry for a particular prefix URI
+      #
+      # @param [RDF::URI] prefixURI
+      # @param [#to_sym] propertyURI (:vocabulary)
+      # @param [#to_sym] multipleValues (:unordered)
+      # @param [Hash] properties ({})
+      def initialize(prefixURI, propertyURI = :vocabulary, multipleValues = :unordered, properties = {})
+        @scheme = propertyURI.to_sym
+        @multipleValues = multipleValues.to_sym
+        @properties = properties
+        if @scheme == :vocabulary
+          @property_base = prefixURI.to_s
+          @property_base += '#' unless %w(/ #).include?(@property_base[-1]) # Append a '#' for fragment if necessary
+        else
+          @property_base = 'http://www.w3.org/ns/md?type='
+        end
+      end
+
+      ##
+      # Find a registry entry given a type URI
+      #
+      # @param [RDF::URI] type
+      # @return [Registry, nil] first entry whose prefix is a leading substring of type, or nil if none matches
+      def self.find(type)
+        @prefixes.select do |key, value|
+          type.to_s.index(key) == 0
+        end.values.first
+      end
+      
+      ##
+      # Generate a predicateURI given a `name`; an absolute `name` is returned unchanged
+      #
+      # @param [#to_s] name
+      # @param [Hash{}] ec Evaluation Context (reads :current_type and :document_base)
+      # @return [RDF::URI]
+      def predicateURI(name, ec)
+        u = RDF::URI(name)
+        return u if u.absolute?
+        
+        n = frag_escape(name)
+        if ec[:current_type].nil?
+          u = RDF::URI(ec[:document_base].to_s)
+          u.fragment = frag_escape(name)
+          u
+        elsif @scheme == :vocabulary
+          # If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name
+          # to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends
+          # with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/).
+          RDF::URI(@property_base + n)
+        else  # @scheme == :contextual
+          if ec[:current_type].to_s.index(@property_base) == 0
+            # Spec: concatenation of s, a FULL STOP (.) and the fragment-escaped name. NOTE(review): code uses @property_base, not s - confirm.
+            RDF::URI(@property_base + '.' + n)
+          else
+            # return the concatenation of http://www.w3.org/ns/md?type=, the fragment-escaped value of s,
+            # the string &prop=, and the fragment-escaped value of name. NOTE(review): code joins with '?prop=', not '&prop=' - confirm.
+            RDF::URI(@property_base + frag_escape(ec[:current_type]) + '?prop=' + n)
+          end
+        end
+      end
+      
+      
+      ##
+      # Turn a predicateURI into a simple token
+      # @param [RDF::URI] predicateURI
+      # @return [String]
+      def tokenize(predicateURI)
+        case @scheme
+        when :vocabulary
+          predicateURI.to_s.sub(@property_base, '')
+        when :contextual
+          predicateURI.to_s.split('?prop=').last.split('.').last
+        end
+      end
+
+      ##
+      # Determine if property should be serialized as a list or not
+      # @param [RDF::URI] predicateURI
+      # @return [Boolean]
+      def as_list(predicateURI)
+        tok = tokenize(predicateURI)
+        if @properties[tok].is_a?(Hash)
+          @properties[tok]["multipleValues"].to_sym == :list
+        else
+          @multipleValues == :list
+        end
+      end
+
+      ##
+      # Fragment escape a name: each of " # % < > [ \ ] ^ { | } is replaced by its bytes as uppercase %XX
+      def frag_escape(name)
+        name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase}
+      end
+    end
+
     ##
     # Initializes the Microdata reader instance.
     #
     # @param  [Nokogiri::HTML::Document, Nokogiri::XML::Document, IO, File, String] input
     #   the input stream to read
     # @param  [Hash{Symbol => Object}] options
     #   any additional options
+    # @option options [Symbol] :library (:nokogiri)
+    #   One of :nokogiri or :rexml. If nil/unspecified uses :nokogiri if available, :rexml otherwise.
     # @option options [Encoding] :encoding     (Encoding::UTF_8)
     #   the encoding of the input stream (Ruby 1.9+)
     # @option options [Boolean]  :validate     (false)
     #   whether to validate the parsed statements and values
     # @option options [Boolean]  :canonicalize (false)
     #   whether to canonicalize parsed literals
     # @option options [Boolean]  :intern       (true)
     #   whether to intern all parsed URIs
     # @option options [#to_s]    :base_uri     (nil)
     #   the base URI to use when resolving relative URIs
+    # @option options [#to_s]    :registry_uri (DEFAULT_REGISTRY)
     # @option options [Array] :debug
     #   Array to place debug messages
     # @return [reader]
     # @yield  [reader] `self`
     # @yieldparam  [RDF::Reader] reader
@@ -57,28 +187,47 @@
     # @raise [Error]:: Raises RDF::ReaderError if _validate_
     def initialize(input = $stdin, options = {}, &block)
       super do
         @debug = options[:debug]
 
-        @doc = case input
-        when Nokogiri::HTML::Document, Nokogiri::XML::Document
-          input
-        else
-          # Try to detect charset from input
-          options[:encoding] ||= input.charset if input.respond_to?(:charset)
-          
-          # Otherwise, default is utf-8
-          options[:encoding] ||= 'utf-8'
+        @library = case options[:library]
+          when nil
+            (defined?(::Nokogiri) && RUBY_PLATFORM != 'java') ? :nokogiri : :rexml
+          when :nokogiri, :rexml
+            options[:library]
+          else
+            raise ArgumentError.new("expected :rexml or :nokogiri, but got #{options[:library].inspect}")
+        end
 
-          add_debug(nil, "base_uri: #{base_uri}")
-          Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
+        require "rdf/microdata/reader/#{@library}"
+        @implementation = case @library
+          when :nokogiri then Nokogiri
+          when :rexml    then REXML
         end
-        
-        errors = @doc.errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/}
+        self.extend(@implementation)
+
+        initialize_html(input, options) rescue raise RDF::ReaderError.new($!.message)
+
+        if (root.nil? && validate?)
+          raise RDF::ReaderError, "Empty Document"
+        end
+        errors = doc_errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/}
         raise RDF::ReaderError, "Syntax errors:\n#{errors}" if !errors.empty? && validate?
-        raise RDF::ReaderError, "Empty document" if (@doc.nil? || @doc.root.nil?) && validate?
 
+        add_debug(@doc, "library = #{@library}")
+
+        # Load registry
+        unless Registry.loaded?
+          registry = options[:registry_uri] || DEFAULT_REGISTRY
+          begin
+            json = RDF::Util::File.open_file(registry) { |f| JSON.load(f) }
+          rescue JSON::ParserError => e
+            raise RDF::ReaderError, "Failed to parse registry: #{e.message}"
+          end
+          Registry.load_registry(json)
+        end
+        
         if block_given?
           case block.arity
             when 0 then instance_eval(&block)
             else block.call(self)
           end
@@ -119,23 +268,23 @@
     def bnode(value = nil)
       @bnode_cache ||= {}
       @bnode_cache[value.to_s] ||= RDF::Node.new(value)
     end
     
-    # Figure out the document path, if it is a Nokogiri::XML::Element or Attribute
+    # Figure out the document path, if it is an Element or Attribute
     def node_path(node)
-      "<#{base_uri}>" + case node
-      when Nokogiri::XML::Node then node.display_path
-      else node.to_s
-      end
+      "<#{base_uri}>#{node.respond_to?(:display_path) ? node.display_path : node}"
     end
     
     # Add debug event to the debug array (if one was given) and echo it to stdout when module debugging is on
     #
-    # @param [XML Node, any] node:: XML Node or string for showing context
+    # @param [Nokogiri::XML::Node, #to_s] node:: XML Node or string for showing context
     # @param [String] message::
-    def add_debug(node, message)
+    # @yieldreturn [String] appended to message, to allow for lazy evaluation of message
+    def add_debug(node, message = "")
+      return unless ::RDF::Microdata.debug? || @debug
+      message = message + yield if block_given?
       puts "#{node_path(node)}: #{message}" if ::RDF::Microdata::debug?
       @debug << "#{node_path(node)}: #{message}" if @debug.is_a?(Array)
     end
 
     def add_error(node, message)
@@ -151,207 +300,175 @@
     # @param [URI, BNode, Literal] object:: the object of the statement
     # @return [Statement]:: Added statement
     # @raise [ReaderError]:: Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
     def add_triple(node, subject, predicate, object)
       statement = RDF::Statement.new(subject, predicate, object)
-      add_debug(node, "statement: #{RDF::NTriples.serialize(statement)}")
+      add_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}"}
       @callback.call(statement)
     end
 
     # Parsing a Microdata document (this is *not* the recursive method)
     def parse_whole_document(doc, base)
-      base_el = doc.at_css('html>head>base')
-      base = base_el.attribute('href').to_s.split('#').first if base_el
-      
-      add_debug(doc, "parse_whole_doc: options=#{@options.inspect}")
-
-      if (base)
+      base = doc_base(base)
+      options[:base_uri] = if (base)
         # Strip any fragment from base
         base = base.to_s.split('#').first
-        base = options[:base_uri] = uri(base)
-        add_debug(base_el, "parse_whole_doc: base='#{base}'")
+        base = uri(base)
       else
         base = RDF::URI("")
       end
       
-      # 2. For each a, area, and link element in the Document, run these substeps:
-      #
-      # * If the element does not have a rel attribute, then skip this element.
-      # * If the element does not have an href attribute, then skip this element.
-      # * If resolving the element's href attribute relative to the element is not successful,
-      #   then skip this element.
-      doc.css('a, area, link').each do |el|
-        rel, href = el.attribute('rel'), el.attribute('href')
-        next unless rel && href
-        href = uri(href, el.base || base)
-        add_debug(el, "a: rel=#{rel.inspect}, href=#{href}")
+      add_debug(nil) {"parse_whole_doc: base='#{base}'"}
 
-        # Otherwise, split the value of the element's rel attribute on spaces, obtaining list of tokens.
-        # Coalesce duplicate tokens in list of tokens.
-        tokens = rel.to_s.split(/\s+/).map do |tok|
-          # Convert each token in list of tokens that does not contain a U+003A COLON characters (:)
-          # to ASCII lowercase.
-          tok =~ /:/ ? tok : tok.downcase
-        end.uniq
-
-        # If list of tokens contains both the tokens alternate and stylesheet,
-        # then remove them both and replace them with the single (uppercase) token
-        # ALTERNATE-STYLESHEET.
-        if tokens.include?('alternate') && tokens.include?('stylesheet')
-          tokens = tokens - %w(alternate stylesheet)
-          tokens << 'ALTERNATE-STYLESHEET'
-        end
-        
-        tokens.each do |tok|
-          tok_uri = RDF::URI(tok)
-          if tok !~ /:/
-            # For each token token in list of tokens that contains no U+003A COLON characters (:),
-            # generate the following triple:
-            add_triple(el, base, RDF::XHV[tok.gsub('#', '%23')], href)
-          elsif tok_uri.absolute?
-            # For each token token in list of tokens that is an absolute URL, generate the following triple:
-            add_triple(el, base, tok_uri, href)
-          end
-        end
+      ec = {
+        :memory             => {},
+        :current_name       => nil,
+        :current_type       => nil,
+        :current_vocabulary => nil,
+        :document_base      => base,
+      }
+      items = []
+      # 1) For each element that is also a top-level item run the following algorithm:
+      #
+      #   1) Generate the triples for an item item, using the evaluation context.
+      #      Let result be the (URI reference or blank node) subject returned.
+      #   2) Append result to item list.
+      getItems.each do |el|
+        result = generate_triples(el, ec)
+        items << result
       end
+      
+      # 2) Generate an RDF Collection list from
+      #    the ordered list of values. Set value to the value returned from generate an RDF Collection.
+      value = generateRDFCollection(root, items)
 
-      # 3. For each meta element in the Document that has a name attribute and a content attribute,
-      doc.css('meta[name][content]').each do |el|
-        name, content = el.attribute('name'), el.attribute('content')
-        name = name.to_s
-        name_uri = uri(name, el.base || base)
-        add_debug(el, "meta: name=#{name.inspect}")
-        if name !~ /:/
-          # If the value of the name attribute contains no U+003A COLON characters (:),
-          # generate the following triple:
-          add_triple(el, base, RDF::XHV[name.downcase.gsub('#', '%23')], RDF::Literal(content, :language => el.language))
-        elsif name_uri.absolute?
-          # If the value of the name attribute contains no U+003A COLON characters (:),
-          # generate the following triple:
-          add_triple(el, base, name_uri, RDF::Literal(content, :language => el.language))
-        end
-      end
+      # 3) Generate the following triple:
+      #     subject Document base
+      #     predicate http://www.w3.org/1999/xhtml/microdata#item
+      #     object value
+      add_triple(doc, base, RDF::MD.item, value) if value
 
-      # 4. For each blockquote and q element in the Document that has a cite attribute that resolves
-      #    successfully relative to the element, generate the following triple:
-      doc.css('blockquote[cite], q[cite]').each do |el|
-        object = uri(el.attribute('cite'), el.base || base)
-        add_debug(el, "blockquote: cite=#{object}")
-        add_triple(el, base, RDF::DC.source, object)
-      end
-
-      # 5. Let memory be a mapping of items to subjects, initially empty.
-      # 6. For each element that is also a top-level microdata item, run the following steps:
-      #    * Generate the triples for the item. Pass a reference to memory as the item/subject list.
-      #      Let result be the subject returned.
-      #    * Generate the following triple:
-      #      subject    the document's current address
-      #      predicate  http://www.w3.org/1999/xhtml/microdata#item
-      #      object     result 
-      memory = {}
-      doc.css('[itemscope]').
-        select {|el| !el.has_attribute?('itemprop')}.
-        each do |el|
-          object = generate_triples(el, memory)
-          add_triple(el, base, RDF::MD.item, object)
-      end
-
       add_debug(doc, "parse_whole_doc: traversal complete")
     end
 
     ##
     # Generate triples for an item
     # @param [RDF::Resource] item
-    # @param [Hash{Nokogiri::XML::Element} => RDF::Resource] memory
-    # @param [Hash{Symbol => Object}] options
-    # @option options [RDF::Resource] :fallback_type
-    # @option options [RDF::Resource] :fallback_name
+    # @param [Hash{Symbol => Object}] ec
+    # @option ec [Hash{Nokogiri::XML::Element => Hash{Symbol => RDF::Resource}}] :memory
+    # @option ec [RDF::Resource] :current_type
     # @return [RDF::Resource]
-    def generate_triples(item, memory, options = {})
-      fallback_type = options[:fallback_type]
-      fallback_name = options[:fallback_name]
-
-      # 1. If there is an entry for item in memory, then let subject be the subject of that entry.
+    def generate_triples(item, ec = {})
+      memory = ec[:memory]
+      # 1) If there is an entry for item in memory, then let subject be the subject of that entry.
       #    Otherwise, if item has a global identifier and that global identifier is an absolute URL,
       #    let subject be that global identifier. Otherwise, let subject be a new blank node.
-      subject = if memory.include?(item)
-        memory[item][:subject]
+      subject = if memory.include?(item.node)
+        memory[item.node][:subject]
       elsif item.has_attribute?('itemid')
-        u = uri(item.attribute('itemid'), item.base || base_uri)
+        uri(item.attribute('itemid'), item.base || base_uri)
       end || RDF::Node.new
-      memory[item] ||= {}
+      memory[item.node] ||= {}
 
-      add_debug(item, "gentrips(2): subject=#{subject.inspect}")
+      add_debug(item) {"gentrips(2): subject=#{subject.inspect}, current_type: #{ec[:current_type]}"}
 
-      # 2. Add a mapping from item to subject in memory, if there isn't one already.
-      memory[item][:subject] ||= subject
+      # 2) Add a mapping from item to subject in memory, if there isn't one already.
+      memory[item.node][:subject] ||= subject
       
-      # 3. If item has an item type and that item type is an absolute URL, let type be that item type.
-      #    Otherwise, let type be the empty string.
-      rdf_type = type = uri(item.attribute('itemtype'))
-      type = '' unless type.absolute?
+      # 3) For each type returned from element.itemType of the element defining the item.
+      type = nil
+      item.attribute('itemtype').to_s.split(' ').map{|n| uri(n)}.select(&:absolute?).each do |t|
+        #   3.1. If type is an absolute URL, generate the following triple:
+        type ||= t
+        add_triple(item, subject, RDF.type, t)
+      end
       
-      if type != ''
-        add_triple(item, subject, RDF.type, type)
-        # 4.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type.
-        type += '#' unless type.to_s.include?('#')
-        # 4.3. If type does not have a : after its #, append a : to type.
-        type += ':' unless type.to_s.match(/\#:/)
-      elsif fallback_type
-        add_debug(item, "gentrips(5.2): fallback_type=#{fallback_type}, fallback_name=#{fallback_name}")
-        rdf_type = type = fallback_type
-        # 5.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type.
-        type += '#' unless type.to_s.include?('#')
-        # 5.3. If type does not have a : after its #, append a : to type.
-        type += ':' unless type.to_s.match(/\#:/)
-        # 5.4. If the last character of type is not a :, %20 to type.
-        type += '%20' unless type.to_s[-1,1] == ':'
-        # 5.5. Append the fragment-escaped value of fallback name to type.
-        type += fallback_name.to_s.gsub('#', '%23')
+      # 5) If type is not an absolute URL, set it to current type from the Evaluation Context if not empty.
+      type ||= ec[:current_type]
+      add_debug(item)  {"gentrips(5): type=#{type.inspect}"}
+
+      # 6) If the registry contains a URI prefix that is a character for character match of type up to the length of the
+      #    URI prefix, set vocab as that URI prefix
+      vocab = Registry.find(type)
+
+      # 7) Otherwise, if type is not empty, construct vocab by removing everything following the last
+      #    SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from type.
+      vocab ||= begin
+        type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1')
+        add_debug(item)  {"gentrips(7): typtype_vocab=#{type_vocab.inspect}"}
+        Registry.new(type_vocab) # if type
       end
 
-      add_debug(item, "gentrips(6): type=#{type.inspect}")
-      
-      # 6. For each element _element_ that has one or more property names and is one of the
+      # 8) Update evaluation context setting current vocabulary to vocab.
+      ec[:current_vocabulary] = vocab
+
+      # 9) Set property list to an empty mapping between properties and one or more ordered values as established below.
+      property_list = {}
+
+      # 10. For each element _element_ that has one or more property names and is one of the
       #    properties of the item _item_, in the order those elements are given by the algorithm
       #    that returns the properties of an item, run the following substep:
       props = item_properties(item)
-
-      # 6.1. For each name name in element's property names, run the following substeps:
+      # 10.1. For each name name in element's property names, run the following substeps:
       props.each do |element|
-        element.attribute('itemprop').to_s.split(' ').each do |name|
-          add_debug(element, "gentrips(6.1): name=#{name.inspect}")
-          # If type is the empty string and name is not an absolute URL, then abort these substeps.
-          name_uri = RDF::URI(name)
-          next if type == '' && !name_uri.absolute?
-
+        element.attribute('itemprop').to_s.split(' ').compact.each do |name|
+          add_debug(element) {"gentrips(10.1): name=#{name.inspect}, type=#{type}"}
+          # Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab.
+          ec_new = ec.merge({:current_type => type, :current_vocabulary => vocab})
+          
+          predicate = vocab.predicateURI(name, ec_new)
+          ec_new[:current_name] = predicate
+          add_debug(element) {"gentrips(10.1.2): predicate=#{predicate}"}
+          
+          # 10.1.3) Let value be the property value of element.
           value = property_value(element)
-          add_debug(element, "gentrips(6.1.2) value=#{value.inspect}")
+          add_debug(element) {"gentrips(10.1.3) value=#{value.inspect}"}
           
+          # 10.1.4) If value is an item, then generate the triples for value using a copy of evaluation context with
+          #       current type set to type. Replace value by the subject returned from those steps.
           if value.is_a?(Hash)
-            value = generate_triples(element, memory, :fallback_type => type, :fallback_name => name) 
+            value = generate_triples(element, ec_new) 
+            add_debug(element) {"gentrips(10.1.4): value=#{value.inspect}"}
           end
-          
-          add_debug(element, "gentrips(6.1.3): value=#{value.inspect}")
 
-          predicate = if name_uri.absolute?
-            name_uri
-          else
-            # Use the URI of the type to create URIs for @itemprop terms
-            add_debug(element, "gentrips: rdf_type=#{rdf_type}")
-            predicate = RDF::URI(rdf_type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1' + name))
-          end
-          add_debug(element, "gentrips(6.1.5): predicate=#{predicate}")
-          
-          add_triple(element, subject, predicate, value) if predicate
+          property_list[predicate] ||= []
+          property_list[predicate] << value
         end
       end
       
+      # 11) For each predicate in property list
+      property_list.each do |predicate, values|
+        generatePropertyValues(item, subject, predicate, values, ec)
+      end
+      
       subject
     end
 
+    def generatePropertyValues(element, subject, predicate, values, ec)
+      registry = ec[:current_vocabulary]
+      if registry.as_list(predicate)
+        value = generateRDFCollection(element, values)
+        add_triple(element, subject, predicate, value)
+      else
+        values.each {|v| add_triple(element, subject, predicate, v)}
+      end
+    end
+
     ##
+    # Build an RDF::List from values, emit its statements (except those whose object is RDF.List), return its subject
+    # @param [Nokogiri::HTML::Element] element node passed to add_triple as debug context
+    # @param [Array<RDF::Value>] values
+    # @return [RDF::Node]
+    def generateRDFCollection(element, values)
+      list = RDF::List.new(nil, nil, values)
+      list.each_statement do |st|
+        add_triple(element, st.subject, st.predicate, st.object) unless st.object == RDF.List
+      end
+      list.subject
+    end
+
+    ##
     # To find the properties of an item defined by the element root, the user agent must try
     # to crawl the properties of the element root, with an empty list as the value of memory:
     # if this fails, then the properties of the item defined by the element root is an empty
     # list; otherwise, it is the returned list.
     #
@@ -376,17 +493,18 @@
     # @param [Nokogiri::XML::Element] root
     # @param [Array<Nokogiri::XML::Element>] memory
     # @return [Array<Array<Nokogiri::XML::Element>, Integer>]
     #   Resultant elements and error count
     def crawl_properties(root, memory)
+      
       # 1. If root is in memory, then the algorithm fails; abort these steps.
       raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root)
       
       # 2. Collect all the elements in the item root; let results be the resulting
       #    list of elements, and errors be the resulting count of errors.
       results, errors = elements_in_item(root)
-      add_debug(root, "crawl_properties results=#{results.inspect}, errors=#{errors}")
+      add_debug(root) {"crawl_properties results=#{results.map {|e| node_path(e)}.inspect}, errors=#{errors}"}
 
       # 3. Remove any elements from results that do not have an itemprop attribute specified.
       results = results.select {|e| e.has_attribute?('itemprop')}
       
       # 4. Let new memory be a new list consisting of the old list memory with the addition of root.
@@ -425,17 +543,17 @@
       pending = root.elements
       
       # If root has an itemref attribute, split the value of that itemref attribute on spaces.
       # For each resulting token ID, 
       root.attribute('itemref').to_s.split(' ').each do |id|
-        add_debug(root, "elements_in_item itemref id #{id}")
+        add_debug(root) {"elements_in_item itemref id #{id}"}
         # if there is an element in the home subtree of root with the ID ID,
         # then add the first such element to pending.
-        id_elem = @doc.at_css("##{id}")
+        id_elem = find_element_by_id(id)
         pending << id_elem if id_elem
       end
-      add_debug(root, "elements_in_item pending #{pending.inspect}")
+      add_debug(root) {"elements_in_item pending #{pending.inspect}"}
 
       # Loop: Remove an element from pending and let current be that element.
       while current = pending.shift
         if results.include?(current)
           # If current is already in results, increment errors.
@@ -455,40 +573,45 @@
     end
 
     ##
     #
     def property_value(element)
-      add_debug(element, "property_value(#{element.inspect}): base #{element.base.inspect}, base_uri: #{base_uri.inspect}")
-      case
+      base = element.base || base_uri
+      add_debug(element) {"property_value(#{element.name}): base #{base.inspect}"}
+      value = case
       when element.has_attribute?('itemscope')
         {}
       when element.name == 'meta'
-        element.attribute('content').to_s
+        RDF::Literal.new(element.attribute('content').to_s, :language => element.language)
+      when element.name == 'data'
+        RDF::Literal.new(element.attribute('value').to_s, :language => element.language)
       when %w(audio embed iframe img source track video).include?(element.name)
-        uri(element.attribute('src'), element.base || base_uri)
+        uri(element.attribute('src'), base)
       when %w(a area link).include?(element.name)
-        uri(element.attribute('href'), element.base || base_uri)
+        uri(element.attribute('href'), base)
       when %w(object).include?(element.name)
-        uri(element.attribute('data'), element.base || base_uri)
-      when %w(time).include?(element.name) && element.has_attribute?('datetime')
+        uri(element.attribute('data'), base)
+      when %w(time).include?(element.name)
         # Lexically scan value and assign appropriate type, otherwise, leave untyped
-        v = element.attribute('datetime').to_s
-        datatype = %w(Date Time DateTime).map {|t| RDF::Literal.const_get(t)}.detect do |dt|
+        v = (element.attribute('datetime') || element.text).to_s
+        datatype = %w(Date Time DateTime Duration).map {|t| RDF::Literal.const_get(t)}.detect do |dt|
           v.match(dt::GRAMMAR)
         end || RDF::Literal
-        datatype.new(v)
+        datatype.new(v, :language => element.language)
       else
-        RDF::Literal.new(element.text, :language => element.language)
+        RDF::Literal.new(element.inner_text, :language => element.language)
       end
+      add_debug(element) {"  #{value.inspect}"}
+      value
     end
 
     # Fixme, what about xml:base relative to element?
     def uri(value, base = nil)
       value = if base
         base = uri(base) unless base.is_a?(RDF::URI)
-        base.join(value)
+        base.join(value.to_s)
       else
-        RDF::URI(value)
+        RDF::URI(value.to_s)
       end
       value.validate! if validate?
       value.canonicalize! if canonicalize?
       value = RDF::URI.intern(value) if intern?
       value
\ No newline at end of file