lib/rdf/microdata/reader.rb in rdf-microdata-0.2.2 vs lib/rdf/microdata/reader.rb in rdf-microdata-0.2.3
- old
+ new
@@ -1,26 +1,35 @@
-require 'nokogiri' # FIXME: Implement using different modules as in RDF::TriX
+begin
+ raise LoadError, "not with java" if RUBY_PLATFORM == "java"
+ require 'nokogiri'
+rescue LoadError => e
+ :rexml
+end
+require 'rdf/xsd'
+require 'json'
module RDF::Microdata
##
# A Microdata parser in Ruby
#
# Based on processing rules, amended with the following:
- # * property generation from tokens now uses the associated @itemtype as the basis for generation
- # * implicit triples are not generated, only those with @item*
- # * @datetime values are scanned lexically to find appropriate datatype
#
- # @see http://dev.w3.org/html5/md/
+ # @see https://dvcs.w3.org/hg/htmldata/raw-file/0d6b89f5befb/microdata-rdf/index.html
# @author [Gregg Kellogg](http://kellogg-assoc.com/)
class Reader < RDF::Reader
format Format
- XHTML = "http://www.w3.org/1999/xhtml"
URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video)
+ DEFAULT_REGISTRY = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "registry.json"))
class CrawlFailure < StandardError #:nodoc:
end
+ # Returns the HTML implementation module for this reader instance.
+ #
+ # @attr_reader [Module]
+ attr_reader :implementation
+
##
# Returns the base URI determined by this reader.
#
# @example
# reader.prefixes[:dc] #=> RDF::URI('http://purl.org/dc/terms/')
@@ -29,27 +38,148 @@
# @since 0.3.0
def base_uri # Base URI accessor for this reader
@options[:base_uri] # looked up in the reader's @options under :base_uri; no separate ivar is kept for it
end
+ # Interface to registry
+ class Registry
+ ##
+ # Build the class-level prefix table from an already-parsed registry document (a Hash, not a URI or path)
+ #
+ # @param [Hash] json maps each URI prefix to a Hash of optional "propertyURI", "multipleValues" and "properties" entries
+ def self.load_registry(json)
+ @prefixes = {} # class-level instance variable; replaces any table loaded earlier
+ json.each do |prefix, elements|
+ propertyURI = elements.fetch("propertyURI", "vocabulary").to_sym
+ multipleValues = elements.fetch("multipleValues", "unordered").to_sym
+ properties = elements.fetch("properties", {})
+ @prefixes[prefix] = Registry.new(prefix, propertyURI, multipleValues, properties)
+ end
+ end
+
+ ##
+ # True if registry has already been loaded, i.e. load_registry has populated the prefix table
+ def self.loaded?
+ @prefixes.is_a?(Hash)
+ end
+
+ ##
+ # Initialize registry for a particular prefix URI
+ #
+ # @param [#to_s] prefixURI used as the property base when the scheme is :vocabulary
+ # @param [#to_sym] propertyURI (:vocabulary)
+ # @param [#to_sym] multipleValues (:unordered)
+ # @param [Hash] properties ({}) per-property overrides, keyed by property token (see #as_list)
+ def initialize(prefixURI, propertyURI = :vocabulary, multipleValues = :unordered, properties = {})
+ @scheme = propertyURI.to_sym
+ @multipleValues = multipleValues.to_sym
+ @properties = properties
+ if @scheme == :vocabulary
+ @property_base = prefixURI.to_s
+ @property_base += '#' unless %w(/ #).include?(@property_base[-1]) # Append a '#' for fragment if necessary
+ else
+ @property_base = 'http://www.w3.org/ns/md?type=' # every scheme other than :vocabulary gets this fixed base
+ end
+ end
+
+ ##
+ # Find a registry entry given a type URI; a key matches when type.to_s starts with it
+ #
+ # @param [RDF::URI] type
+ # @return [Registry] the first matching entry (not necessarily the longest prefix), or nil when nothing matches
+ def self.find(type)
+ @prefixes.select do |key, value|
+ type.to_s.index(key) == 0
+ end.values.first
+ end
+
+ ##
+ # Generate a predicateURI given a `name`; an absolute `name` is returned as-is
+ #
+ # @param [#to_s] name
+ # @param [Hash{}] ec Evaluation Context; :current_type and :document_base are read here
+ # @return [RDF::URI]
+ def predicateURI(name, ec)
+ u = RDF::URI(name)
+ return u if u.absolute?
+
+ n = frag_escape(name)
+ if ec[:current_type].nil?
+ u = RDF::URI(ec[:document_base].to_s)
+ u.fragment = frag_escape(name)
+ u
+ elsif @scheme == :vocabulary
+ # If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name
+ # to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends
+ # with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/).
+ RDF::URI(@property_base + n)
+ else # @scheme == :contextual
+ if ec[:current_type].to_s.index(@property_base) == 0
+ # return the concatenation of s, a U+002E FULL STOP character (.) and the fragment-escaped value of name.
+ RDF::URI(@property_base + '.' + n)
+ else
+ # return the concatenation of http://www.w3.org/ns/md?type=, the fragment-escaped value of s,
+ # the string &prop=, and the fragment-escaped value of name. NOTE(review): the code below joins with '?prop=', not '&prop=' — confirm which is intended (tokenize splits on '?prop=')
+ RDF::URI(@property_base + frag_escape(ec[:current_type]) + '?prop=' + n)
+ end
+ end
+ end
+
+
+ ##
+ # Turn a predicateURI into a simple token
+ # @param [RDF::URI] predicateURI
+ # @return [String] nil when @scheme is neither :vocabulary nor :contextual (the case has no else branch)
+ def tokenize(predicateURI)
+ case @scheme
+ when :vocabulary
+ predicateURI.to_s.sub(@property_base, '')
+ when :contextual
+ predicateURI.to_s.split('?prop=').last.split('.').last
+ end
+ end
+
+ ##
+ # Determine if property should be serialized as a list or not; a per-property Hash entry overrides the registry-wide setting
+ # @param [RDF::URI] predicateURI
+ # @return [Boolean]
+ def as_list(predicateURI)
+ tok = tokenize(predicateURI)
+ if @properties[tok].is_a?(Hash)
+ @properties[tok]["multipleValues"].to_sym == :list # NOTE(review): raises NoMethodError (nil.to_sym) if the per-property Hash lacks "multipleValues"
+ else
+ @multipleValues == :list
+ end
+ end
+
+ ##
+ # Fragment escape a name: each of " # % < > [ \ ] ^ { | } is replaced by its bytes as upper-case %XX
+ def frag_escape(name)
+ name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase}
+ end
+ end
+
##
# Initializes the Microdata reader instance.
#
# @param [Nokogiri::HTML::Document, Nokogiri::XML::Document, IO, File, String] input
# the input stream to read
# @param [Hash{Symbol => Object}] options
# any additional options
+ # @option options [Symbol] :library (:nokogiri)
+ # One of :nokogiri or :rexml. If nil/unspecified uses :nokogiri if available, :rexml otherwise.
# @option options [Encoding] :encoding (Encoding::UTF_8)
# the encoding of the input stream (Ruby 1.9+)
# @option options [Boolean] :validate (false)
# whether to validate the parsed statements and values
# @option options [Boolean] :canonicalize (false)
# whether to canonicalize parsed literals
# @option options [Boolean] :intern (true)
# whether to intern all parsed URIs
# @option options [#to_s] :base_uri (nil)
# the base URI to use when resolving relative URIs
+ # @option options [#to_s] :registry_uri (DEFAULT_REGISTRY)
+ #   location of the JSON vocabulary registry; only read when no registry has been loaded yet
# @option options [Array] :debug
# Array to place debug messages
# @return [reader]
# @yield [reader] `self`
# @yieldparam [RDF::Reader] reader
@@ -57,28 +187,47 @@
# @raise [Error]:: Raises RDF::ReaderError if _validate_
def initialize(input = $stdin, options = {}, &block)
super do
@debug = options[:debug]
- @doc = case input
- when Nokogiri::HTML::Document, Nokogiri::XML::Document
- input
- else
- # Try to detect charset from input
- options[:encoding] ||= input.charset if input.respond_to?(:charset)
-
- # Otherwise, default is utf-8
- options[:encoding] ||= 'utf-8'
+ @library = case options[:library]
+ when nil
+ (defined?(::Nokogiri) && RUBY_PLATFORM != 'java') ? :nokogiri : :rexml
+ when :nokogiri, :rexml
+ options[:library]
+ else
+ raise ArgumentError.new("expected :rexml or :nokogiri, but got #{options[:library].inspect}")
+ end
- add_debug(nil, "base_uri: #{base_uri}")
- Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
+ require "rdf/microdata/reader/#{@library}"
+ @implementation = case @library
+ when :nokogiri then Nokogiri
+ when :rexml then REXML
end
-
- errors = @doc.errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/}
+ self.extend(@implementation)
+
+ initialize_html(input, options) rescue raise RDF::ReaderError.new($!.message)
+
+ if (root.nil? && validate?)
+ raise RDF::ReaderError, "Empty Document"
+ end
+ errors = doc_errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/}
raise RDF::ReaderError, "Syntax errors:\n#{errors}" if !errors.empty? && validate?
- raise RDF::ReaderError, "Empty document" if (@doc.nil? || @doc.root.nil?) && validate?
+ add_debug(@doc, "library = #{@library}")
+
+ # Load registry
+ unless Registry.loaded?
+ registry = options[:registry_uri] || DEFAULT_REGISTRY
+ begin
+ json = RDF::Util::File.open_file(registry) { |f| JSON.load(f) }
+ rescue JSON::ParserError => e
+ raise RDF::ReaderError, "Failed to parse registry: #{e.message}"
+ end
+ Registry.load_registry(json)
+ end
+
if block_given?
case block.arity
when 0 then instance_eval(&block)
else block.call(self)
end
@@ -119,23 +268,23 @@
def bnode(value = nil) # Return the blank node for value, creating and caching it on first use
@bnode_cache ||= {}
@bnode_cache[value.to_s] ||= RDF::Node.new(value) # keyed by value.to_s, so nil and "" share one cache entry
end
- # Figure out the document path, if it is a Nokogiri::XML::Element or Attribute
+ # Figure out the document path, if it is an Element or Attribute
def node_path(node) # Label used in debug output: the base URI in angle brackets, then node.display_path if the node offers it, else the node's string form
- "<#{base_uri}>" + case node
- when Nokogiri::XML::Node then node.display_path
- else node.to_s
- end
+ "<#{base_uri}>#{node.respond_to?(:display_path) ? node.display_path : node}"
end
# Add debug event to debug array, if specified
#
- # @param [XML Node, any] node:: XML Node or string for showing context
+ # @param [Nokogiri::XML::Node, #to_s] node:: XML Node or string for showing context
# @param [String] message::
- def add_debug(node, message)
+ # @yieldreturn [String] appended to message, to allow for lazy evaluation of message
+ def add_debug(node, message = "")
+ return unless ::RDF::Microdata.debug? || @debug # nothing to report to: return before the block is ever called
+ message = message + yield if block_given?
puts "#{node_path(node)}: #{message}" if ::RDF::Microdata::debug?
@debug << "#{node_path(node)}: #{message}" if @debug.is_a?(Array)
end
def add_error(node, message)
@@ -151,207 +300,175 @@
# @param [URI, BNode, Literal] object:: the object of the statement
# @return [Statement]:: Added statement
# @raise [ReaderError]:: Checks parameter types and raises if they are incorrect if parsing mode is _validate_.
def add_triple(node, subject, predicate, object) # Build a statement from the three terms, log it, and pass it to @callback
statement = RDF::Statement.new(subject, predicate, object)
- add_debug(node, "statement: #{RDF::NTriples.serialize(statement)}")
+ add_debug(node) {"statement: #{RDF::NTriples.serialize(statement)}"}
@callback.call(statement) # NOTE(review): the method's value is whatever the callback returns — confirm callers do not rely on getting the statement back
end
# Parsing a Microdata document (this is *not* the recursive method)
def parse_whole_document(doc, base) # Entry point: settle the document base, generate triples for each item from getItems, then attach the resulting list to the base
- base_el = doc.at_css('html>head>base')
- base = base_el.attribute('href').to_s.split('#').first if base_el
-
- add_debug(doc, "parse_whole_doc: options=#{@options.inspect}")
-
- if (base)
+ base = doc_base(base) # doc_base is not defined in this view — presumably provided by the extended implementation module; verify
+ options[:base_uri] = if (base)
# Strip any fragment from base
base = base.to_s.split('#').first
- base = options[:base_uri] = uri(base)
- add_debug(base_el, "parse_whole_doc: base='#{base}'")
+ base = uri(base)
else
base = RDF::URI("")
end
- # 2. For each a, area, and link element in the Document, run these substeps:
- #
- # * If the element does not have a rel attribute, then skip this element.
- # * If the element does not have an href attribute, then skip this element.
- # * If resolving the element's href attribute relative to the element is not successful,
- # then skip this element.
- doc.css('a, area, link').each do |el|
- rel, href = el.attribute('rel'), el.attribute('href')
- next unless rel && href
- href = uri(href, el.base || base)
- add_debug(el, "a: rel=#{rel.inspect}, href=#{href}")
+ add_debug(nil) {"parse_whole_doc: base='#{base}'"}
- # Otherwise, split the value of the element's rel attribute on spaces, obtaining list of tokens.
- # Coalesce duplicate tokens in list of tokens.
- tokens = rel.to_s.split(/\s+/).map do |tok|
- # Convert each token in list of tokens that does not contain a U+003A COLON characters (:)
- # to ASCII lowercase.
- tok =~ /:/ ? tok : tok.downcase
- end.uniq
-
- # If list of tokens contains both the tokens alternate and stylesheet,
- # then remove them both and replace them with the single (uppercase) token
- # ALTERNATE-STYLESHEET.
- if tokens.include?('alternate') && tokens.include?('stylesheet')
- tokens = tokens - %w(alternate stylesheet)
- tokens << 'ALTERNATE-STYLESHEET'
- end
-
- tokens.each do |tok|
- tok_uri = RDF::URI(tok)
- if tok !~ /:/
- # For each token token in list of tokens that contains no U+003A COLON characters (:),
- # generate the following triple:
- add_triple(el, base, RDF::XHV[tok.gsub('#', '%23')], href)
- elsif tok_uri.absolute?
- # For each token token in list of tokens that is an absolute URL, generate the following triple:
- add_triple(el, base, tok_uri, href)
- end
- end
+ ec = {
+ :memory => {},
+ :current_name => nil,
+ :current_type => nil,
+ :current_vocabulary => nil,
+ :document_base => base,
+ }
+ items = []
+ # 1) For each element that is also a top-level item run the following algorithm:
+ #
+ # 1) Generate the triples for an item item, using the evaluation context.
+ # Let result be the (URI reference or blank node) subject returned.
+ # 2) Append result to item list.
+ getItems.each do |el|
+ result = generate_triples(el, ec)
+ items << result
end
+
+ # 2) Generate an RDF Collection list from
+ # the ordered list of values. Set value to the value returned from generate an RDF Collection.
+ value = generateRDFCollection(root, items)
- # 3. For each meta element in the Document that has a name attribute and a content attribute,
- doc.css('meta[name][content]').each do |el|
- name, content = el.attribute('name'), el.attribute('content')
- name = name.to_s
- name_uri = uri(name, el.base || base)
- add_debug(el, "meta: name=#{name.inspect}")
- if name !~ /:/
- # If the value of the name attribute contains no U+003A COLON characters (:),
- # generate the following triple:
- add_triple(el, base, RDF::XHV[name.downcase.gsub('#', '%23')], RDF::Literal(content, :language => el.language))
- elsif name_uri.absolute?
- # If the value of the name attribute contains no U+003A COLON characters (:),
- # generate the following triple:
- add_triple(el, base, name_uri, RDF::Literal(content, :language => el.language))
- end
- end
+ # 3) Generate the following triple:
+ # subject Document base
+ # predicate http://www.w3.org/1999/xhtml/microdata#item
+ # object value
+ add_triple(doc, base, RDF::MD.item, value) if value
- # 4. For each blockquote and q element in the Document that has a cite attribute that resolves
- # successfully relative to the element, generate the following triple:
- doc.css('blockquote[cite], q[cite]').each do |el|
- object = uri(el.attribute('cite'), el.base || base)
- add_debug(el, "blockquote: cite=#{object}")
- add_triple(el, base, RDF::DC.source, object)
- end
-
- # 5. Let memory be a mapping of items to subjects, initially empty.
- # 6. For each element that is also a top-level microdata item, run the following steps:
- # * Generate the triples for the item. Pass a reference to memory as the item/subject list.
- # Let result be the subject returned.
- # * Generate the following triple:
- # subject the document's current address
- # predicate http://www.w3.org/1999/xhtml/microdata#item
- # object result
- memory = {}
- doc.css('[itemscope]').
- select {|el| !el.has_attribute?('itemprop')}.
- each do |el|
- object = generate_triples(el, memory)
- add_triple(el, base, RDF::MD.item, object)
- end
-
add_debug(doc, "parse_whole_doc: traversal complete")
end
##
# Generate triples for an item
# @param [RDF::Resource] item
- # @param [Hash{Nokogiri::XML::Element} => RDF::Resource] memory
- # @param [Hash{Symbol => Object}] options
- # @option options [RDF::Resource] :fallback_type
- # @option options [RDF::Resource] :fallback_name
+ # @param [Hash{Symbol => Object}] ec
+ # @option ec [Hash{Nokogiri::XML::Element} => RDF::Resource] memory
+ # @option ec [RDF::Resource] :current_type
# @return [RDF::Resource]
- def generate_triples(item, memory, options = {})
- fallback_type = options[:fallback_type]
- fallback_name = options[:fallback_name]
-
- # 1. If there is an entry for item in memory, then let subject be the subject of that entry.
+ def generate_triples(item, ec = {}) # NOTE(review): with the default ec, ec[:memory] is nil and memory.include? below would raise; both calls visible here pass a context
+ memory = ec[:memory]
+ # 1) If there is an entry for item in memory, then let subject be the subject of that entry.
# Otherwise, if item has a global identifier and that global identifier is an absolute URL,
# let subject be that global identifier. Otherwise, let subject be a new blank node.
- subject = if memory.include?(item)
- memory[item][:subject]
+ subject = if memory.include?(item.node)
+ memory[item.node][:subject]
elsif item.has_attribute?('itemid')
- u = uri(item.attribute('itemid'), item.base || base_uri)
+ uri(item.attribute('itemid'), item.base || base_uri)
end || RDF::Node.new
- memory[item] ||= {}
+ memory[item.node] ||= {}
- add_debug(item, "gentrips(2): subject=#{subject.inspect}")
+ add_debug(item) {"gentrips(2): subject=#{subject.inspect}, current_type: #{ec[:current_type]}"}
- # 2. Add a mapping from item to subject in memory, if there isn't one already.
- memory[item][:subject] ||= subject
+ # 2) Add a mapping from item to subject in memory, if there isn't one already.
+ memory[item.node][:subject] ||= subject
- # 3. If item has an item type and that item type is an absolute URL, let type be that item type.
- # Otherwise, let type be the empty string.
- rdf_type = type = uri(item.attribute('itemtype'))
- type = '' unless type.absolute?
+ # 3) For each type returned from element.itemType of the element defining the item.
+ type = nil
+ item.attribute('itemtype').to_s.split(' ').map{|n| uri(n)}.select(&:absolute?).each do |t|
+ # 3.1. If type is an absolute URL, generate the following triple:
+ type ||= t
+ add_triple(item, subject, RDF.type, t)
+ end
- if type != ''
- add_triple(item, subject, RDF.type, type)
- # 4.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type.
- type += '#' unless type.to_s.include?('#')
- # 4.3. If type does not have a : after its #, append a : to type.
- type += ':' unless type.to_s.match(/\#:/)
- elsif fallback_type
- add_debug(item, "gentrips(5.2): fallback_type=#{fallback_type}, fallback_name=#{fallback_name}")
- rdf_type = type = fallback_type
- # 5.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type.
- type += '#' unless type.to_s.include?('#')
- # 5.3. If type does not have a : after its #, append a : to type.
- type += ':' unless type.to_s.match(/\#:/)
- # 5.4. If the last character of type is not a :, %20 to type.
- type += '%20' unless type.to_s[-1,1] == ':'
- # 5.5. Append the fragment-escaped value of fallback name to type.
- type += fallback_name.to_s.gsub('#', '%23')
+ # 5) If type is not an absolute URL, set it to current type from the Evaluation Context if not empty.
+ type ||= ec[:current_type]
+ add_debug(item) {"gentrips(5): type=#{type.inspect}"}
+
+ # 6) If the registry contains a URI prefix that is a character for character match of type up to the length of the
+ # URI prefix, set vocab as that URI prefix
+ vocab = Registry.find(type)
+
+ # 7) Otherwise, if type is not empty, construct vocab by removing everything following the last
+ # SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from type.
+ vocab ||= begin
+ type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1')
+ add_debug(item) {"gentrips(7): typtype_vocab=#{type_vocab.inspect}"}
+ Registry.new(type_vocab) # if type
end
- add_debug(item, "gentrips(6): type=#{type.inspect}")
-
- # 6. For each element _element_ that has one or more property names and is one of the
+ # 8) Update evaluation context setting current vocabulary to vocab.
+ ec[:current_vocabulary] = vocab
+
+ # 9) Set property list to an empty mapping between properties and one or more ordered values as established below.
+ property_list = {}
+
+ # 10. For each element _element_ that has one or more property names and is one of the
# properties of the item _item_, in the order those elements are given by the algorithm
# that returns the properties of an item, run the following substep:
props = item_properties(item)
-
- # 6.1. For each name name in element's property names, run the following substeps:
+ # 10.1. For each name name in element's property names, run the following substeps:
props.each do |element|
- element.attribute('itemprop').to_s.split(' ').each do |name|
- add_debug(element, "gentrips(6.1): name=#{name.inspect}")
- # If type is the empty string and name is not an absolute URL, then abort these substeps.
- name_uri = RDF::URI(name)
- next if type == '' && !name_uri.absolute?
-
+ element.attribute('itemprop').to_s.split(' ').compact.each do |name|
+ add_debug(element) {"gentrips(10.1): name=#{name.inspect}, type=#{type}"}
+ # Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab.
+ ec_new = ec.merge({:current_type => type, :current_vocabulary => vocab})
+
+ predicate = vocab.predicateURI(name, ec_new)
+ ec_new[:current_name] = predicate
+ add_debug(element) {"gentrips(10.1.2): predicate=#{predicate}"}
+
+ # 10.1.3) Let value be the property value of element.
value = property_value(element)
- add_debug(element, "gentrips(6.1.2) value=#{value.inspect}")
+ add_debug(element) {"gentrips(10.1.3) value=#{value.inspect}"}
+ # 10.1.4) If value is an item, then generate the triples for value using a copy of evaluation context with
+ # current type set to type. Replace value by the subject returned from those steps.
if value.is_a?(Hash)
- value = generate_triples(element, memory, :fallback_type => type, :fallback_name => name)
+ value = generate_triples(element, ec_new)
+ add_debug(element) {"gentrips(10.1.4): value=#{value.inspect}"}
end
-
- add_debug(element, "gentrips(6.1.3): value=#{value.inspect}")
- predicate = if name_uri.absolute?
- name_uri
- else
- # Use the URI of the type to create URIs for @itemprop terms
- add_debug(element, "gentrips: rdf_type=#{rdf_type}")
- predicate = RDF::URI(rdf_type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1' + name))
- end
- add_debug(element, "gentrips(6.1.5): predicate=#{predicate}")
-
- add_triple(element, subject, predicate, value) if predicate
+ property_list[predicate] ||= []
+ property_list[predicate] << value
end
end
+ # 11) For each predicate in property list
+ property_list.each do |predicate, values|
+ generatePropertyValues(item, subject, predicate, values, ec) # passes this call's ec; nested items got ec_new (a merge copy), so the vocabulary set in step 8 is still in place
+ end
+
subject
end
+ def generatePropertyValues(element, subject, predicate, values, ec) # Emit the triples for one predicate, either as a single RDF Collection or as one triple per value
+ registry = ec[:current_vocabulary] # the Registry stored in the evaluation context by generate_triples
+ if registry.as_list(predicate)
+ value = generateRDFCollection(element, values)
+ add_triple(element, subject, predicate, value) # one triple whose object is the collection's subject
+ else
+ values.each {|v| add_triple(element, subject, predicate, v)}
+ end
+ end
+
##
+ # Generate an RDF Collection from values, emitting its triples; called whenever a list is required, whatever the number of entries
+ # @param [Nokogiri::HTML::Element] element
+ # @param [Array<RDF::Value>] values
+ # @return [RDF::Node]
+ def generateRDFCollection(element, values) # Build an RDF::List from values, emit its statements through add_triple, and return the list's subject
+ list = RDF::List.new(nil, nil, values)
+ list.each_statement do |st|
+ add_triple(element, st.subject, st.predicate, st.object) unless st.object == RDF.List # statements whose object is RDF.List are skipped — presumably the list typing triples; verify
+ end
+ list.subject
+ end
+
+ ##
# To find the properties of an item defined by the element root, the user agent must try
# to crawl the properties of the element root, with an empty list as the value of memory:
# if this fails, then the properties of the item defined by the element root is an empty
# list; otherwise, it is the returned list.
#
@@ -376,17 +493,18 @@
# @param [Nokogiri::XML::Element] root
# @param [Array<Nokogiri::XML::Element>] memory
# @return [Array<Array<Nokogiri::XML::Element>, Integer>]
# Resultant elements and error count
def crawl_properties(root, memory)
+
# 1. If root is in memory, then the algorithm fails; abort these steps.
raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root)
# 2. Collect all the elements in the item root; let results be the resulting
# list of elements, and errors be the resulting count of errors.
results, errors = elements_in_item(root)
- add_debug(root, "crawl_properties results=#{results.inspect}, errors=#{errors}")
+ add_debug(root) {"crawl_properties results=#{results.map {|e| node_path(e)}.inspect}, errors=#{errors}"}
# 3. Remove any elements from results that do not have an itemprop attribute specified.
results = results.select {|e| e.has_attribute?('itemprop')}
# 4. Let new memory be a new list consisting of the old list memory with the addition of root.
@@ -425,17 +543,17 @@
pending = root.elements
# If root has an itemref attribute, split the value of that itemref attribute on spaces.
# For each resulting token ID,
root.attribute('itemref').to_s.split(' ').each do |id|
- add_debug(root, "elements_in_item itemref id #{id}")
+ add_debug(root) {"elements_in_item itemref id #{id}"}
# if there is an element in the home subtree of root with the ID ID,
# then add the first such element to pending.
- id_elem = @doc.at_css("##{id}")
+ id_elem = find_element_by_id(id)
pending << id_elem if id_elem
end
- add_debug(root, "elements_in_item pending #{pending.inspect}")
+ add_debug(root) {"elements_in_item pending #{pending.inspect}"}
# Loop: Remove an element from pending and let current be that element.
while current = pending.shift
if results.include?(current)
# If current is already in results, increment errors.
@@ -455,40 +573,45 @@
end
##
#
def property_value(element) # Map an itemprop element to its value: {} for a nested item, a URI for URL-valued elements, otherwise a literal
- add_debug(element, "property_value(#{element.inspect}): base #{element.base.inspect}, base_uri: #{base_uri.inspect}")
- case
+ base = element.base || base_uri
+ add_debug(element) {"property_value(#{element.name}): base #{base.inspect}"}
+ value = case
when element.has_attribute?('itemscope')
{}
when element.name == 'meta'
- element.attribute('content').to_s
+ RDF::Literal.new(element.attribute('content').to_s, :language => element.language)
+ when element.name == 'data'
+ RDF::Literal.new(element.attribute('value').to_s, :language => element.language)
when %w(audio embed iframe img source track video).include?(element.name)
- uri(element.attribute('src'), element.base || base_uri)
+ uri(element.attribute('src'), base)
when %w(a area link).include?(element.name)
- uri(element.attribute('href'), element.base || base_uri)
+ uri(element.attribute('href'), base)
when %w(object).include?(element.name)
- uri(element.attribute('data'), element.base || base_uri)
- when %w(time).include?(element.name) && element.has_attribute?('datetime')
+ uri(element.attribute('data'), base)
+ when %w(time).include?(element.name)
# Lexically scan value and assign appropriate type, otherwise, leave untyped
- v = element.attribute('datetime').to_s
- datatype = %w(Date Time DateTime).map {|t| RDF::Literal.const_get(t)}.detect do |dt|
+ v = (element.attribute('datetime') || element.text).to_s
+ datatype = %w(Date Time DateTime Duration).map {|t| RDF::Literal.const_get(t)}.detect do |dt|
v.match(dt::GRAMMAR)
end || RDF::Literal
- datatype.new(v)
+ datatype.new(v, :language => element.language)
else
- RDF::Literal.new(element.text, :language => element.language)
+ RDF::Literal.new(element.inner_text, :language => element.language)
end
+ add_debug(element) {" #{value.inspect}"}
+ value
end
# Fixme, what about xml:base relative to element?
def uri(value, base = nil)
value = if base
base = uri(base) unless base.is_a?(RDF::URI)
- base.join(value)
+ base.join(value.to_s)
else
- RDF::URI(value)
+ RDF::URI(value.to_s)
end
value.validate! if validate?
value.canonicalize! if canonicalize?
value = RDF::URI.intern(value) if intern?
value
\ No newline at end of file