lib/rdf/microdata/reader.rb in rdf-microdata-2.2.1 vs lib/rdf/microdata/reader.rb in rdf-microdata-2.2.2
- old
+ new
@@ -13,19 +13,20 @@
class Reader < RDF::Reader
format Format
include Expansion
include RDF::Util::Logger
URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video)
- DEFAULT_REGISTRY = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "registry.json"))
# @private
class CrawlFailure < StandardError; end
- # @!attribute [r] implementation
# @return [Module] Returns the HTML implementation module for this reader instance.
attr_reader :implementation
+ # @return [Hash{Object => RDF::Resource}] maps RDF elements (items) to resources
+ attr_reader :memory
+
##
# Returns the base URI determined by this reader.
#
# @example
# reader.prefixes[:dc] #=> RDF::URI('http://purl.org/dc/terms/')
@@ -34,113 +35,50 @@
# @since 0.3.0
def base_uri
@options[:base_uri]
end
- # Interface to registry
- class Registry
- # @return [RDF::URI] Prefix of vocabulary
- attr_reader :uri
+ ##
+ # Reader options
+ # @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Reader#options-class_method
+ def self.options
+ super + [
+ RDF::CLI::Option.new(
+ symbol: :rdfa,
+ datatype: TrueClass,
+ on: ["--rdfa"],
+ description: "Transform and parse as RDFa.") {true},
+ ]
+ end
- # @return [Hash] properties
- attr_reader :properties
-
- ##
- # Initialize the registry from a URI or file path
- #
- # @param [String] registry_uri
- def self.load_registry(registry_uri)
- return if @registry_uri == registry_uri
-
- json = RDF::Util::File.open_file(registry_uri) { |f| JSON.load(f) }
-
- @prefixes = {}
- json.each do |prefix, elements|
- next unless elements.is_a?(Hash)
- properties = elements.fetch("properties", {})
- @prefixes[prefix] = Registry.new(prefix, properties)
+ ##
+ # Redirect for RDFa Reader given `:rdfa` option
+ #
+ # @private
+ def self.new(input = nil, options = {}, &block)
+ klass = if options[:rdfa]
+ # Requires rdf-rdfa gem to be loaded
+ begin
+ require 'rdf/rdfa'
+ rescue LoadError
+ raise ReaderError, "Use of RDFa-based reader requires rdf-rdfa gem"
end
- @registry_uri = registry_uri
- end
-
- ##
- # Initialize registry for a particular prefix URI
- #
- # @param [RDF::URI] prefixURI
- # @param [Hash] properties ({})
- def initialize(prefixURI, properties = {})
- @uri = prefixURI
- @properties = properties
- @property_base = prefixURI.to_s
- # Append a '#' for fragment if necessary
- @property_base += '#' unless %w(/ #).include?(@property_base[-1,1])
- end
-
- ##
- # Find a registry entry given a type URI
- #
- # @param [RDF::URI] type
- # @return [Registry]
- def self.find(type)
- @prefixes ||= {}
- k = @prefixes.keys.detect {|key| type.to_s.index(key) == 0 }
- @prefixes[k] if k
- end
-
- ##
- # Generate a predicateURI given a `name`
- #
- # @param [#to_s] name
- # @param [Hash{}] ec Evaluation Context
- # @return [RDF::URI]
- def predicateURI(name, ec)
- u = RDF::URI(name)
- # 1) If _name_ is an _absolute URL_, return _name_ as a _URI reference_
- return u if u.absolute?
-
- n = frag_escape(name)
- if ec[:current_type].nil?
- # 2) If current type from context is null, there can be no current vocabulary.
- # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name
- u = RDF::URI(ec[:document_base].to_s)
- u.fragment = frag_escape(name)
- u
- else
- # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/).
- RDF::URI(@property_base + n)
+ RdfaReader
+ elsif options[:jsonld]
+ # Requires json-ld gem to be loaded
+ begin
+ require 'json/ld'
+ rescue LoadError
+ raise ReaderError, "Use of JSON-LD-based reader requires json-ld gem"
end
+ JsonLdReader
+ else
+ self
end
-
- ##
- # Yield a equivalentProperty or subPropertyOf if appropriate
- #
- # @param [RDF::URI] predicateURI
- # @yield equiv
- # @yieldparam [RDF::URI] equiv
- def expand(predicateURI)
- tok = tokenize(predicateURI)
- if @properties[tok].is_a?(Hash)
- value = @properties[tok].fetch("subPropertyOf", nil)
- value ||= @properties[tok].fetch("equivalentProperty", nil)
-
- Array(value).each {|equiv| yield RDF::URI(equiv)}
- end
- end
-
- ##
- # Turn a predicateURI into a simple token
- # @param [RDF::URI] predicateURI
- # @return [String]
- def tokenize(predicateURI)
- predicateURI.to_s.sub(@property_base, '')
- end
-
- ##
- # Fragment escape a name
- def frag_escape(name)
- name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase}
- end
+ reader = klass.allocate
+ reader.send(:initialize, input, options, &block)
+ reader
end
##
# Initializes the Microdata reader instance.
#
@@ -176,16 +114,16 @@
initialize_html(input, options) rescue log_fatal($!.message, exception: RDF::ReaderError)
log_error("Empty document") if root.nil?
log_error(doc_errors.map(&:message).uniq.join("\n")) if !doc_errors.empty?
- log_debug(@doc, "library = #{@library}")
+ log_debug('', "library = #{@library}")
# Load registry
begin
- registry_uri = options[:registry] || DEFAULT_REGISTRY
- log_debug(@doc, "registry = #{registry_uri.inspect}")
+ registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY
+ log_debug('', "registry = #{registry_uri.inspect}")
Registry.load_registry(registry_uri)
rescue JSON::ParserError => e
log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if (root.nil? && validate?)
end
@@ -268,103 +206,88 @@
end
# Parsing a Microdata document (this is *not* the recursive method)
def parse_whole_document(doc, base)
base = doc_base(base)
+ @memory = {}
options[:base_uri] = if (base)
# Strip any fragment from base
base = base.to_s.split('#').first
base = uri(base)
else
base = RDF::URI("")
end
log_info(nil) {"parse_whole_doc: base='#{base}'"}
- ec = {
- memory: {},
- current_type: nil,
- current_vocabulary: nil,
- document_base: base,
- }
# 1) For each element that is also a top-level item, Generate the triples for that item using the evaluation context.
getItems.each do |el|
- log_depth {generate_triples(el, ec)}
+ log_depth {generate_triples(el, Registry.new(nil))}
end
log_info(doc, "parse_whole_doc: traversal complete")
end
##
# Generate triples for an item
#
# @param [RDF::Resource] item
- # @param [Hash{Symbol => Object}] ec
+ # @param [Registry] vocab
# @option ec [Hash{Nokogiri::XML::Element} => RDF::Resource] memory
- # @option ec [RDF::Resource] :current_type
+ # @option ec [RDF::Resource] :current_vocabulary
# @return [RDF::Resource]
- def generate_triples(item, ec = {})
- memory = ec[:memory]
+ def generate_triples(item, vocab)
# 1) If there is an entry for item in memory, then let subject be the subject of that entry. Otherwise, if item has a global identifier and that global identifier is an absolute URL, let subject be that global identifier. Otherwise, let subject be a new blank node.
subject = if memory.include?(item.node)
memory[item.node][:subject]
elsif item.has_attribute?('itemid')
uri(item.attribute('itemid'), item.base || base_uri)
end || RDF::Node.new
memory[item.node] ||= {}
- log_debug(item) {"gentrips(2): subject=#{subject.inspect}, current_type: #{ec[:current_type]}"}
+ log_debug(item) {"gentrips(2): subject=#{subject.inspect}, vocab: #{vocab.inspect}"}
# 2) Add a mapping from item to subject in memory, if there isn't one already.
memory[item.node][:subject] ||= subject
# 3) For each type returned from element.itemType of the element defining the item.
+ # 4) Set type to the first value returned from element.itemType of the element defining the item.
type = nil
item.attribute('itemtype').to_s.split(' ').map{|n| uri(n)}.select(&:absolute?).each do |t|
# 3.1. If type is an absolute URL, generate the following triple:
type ||= t
add_triple(item, subject, RDF.type, t)
end
- # 4) Set type to the first value returned from element.itemType of the element defining the item.
-
- # 5) Otherwise, set type to current type from the Evaluation Context if not empty.
- type ||= ec[:current_type]
- log_debug(item) {"gentrips(5): type=#{type.inspect}"}
-
- # 6) If the registry contains a URI prefix that is a character for character match of type up to the length of the URI prefix, set vocab as that URI prefix.
- vocab = Registry.find(type)
-
- # 7) Otherwise, if type is not empty, construct vocab by removing everything following the last SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from the path component of type.
- vocab ||= begin
- type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1')
- log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"}
- Registry.new(type_vocab)
+ # 6) If the registry contains a URI prefix that is a character for character match of vocab up to the length of the URI prefix, set vocab as that URI prefix.
+ if type || vocab.nil?
+ vocab = Registry.find(type) || begin
+ type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless type.nil?
+ log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"}
+ Registry.new(type_vocab)
+ end
end
- # 8) Update evaluation context setting current vocabulary to vocab.
- ec[:current_vocabulary] = vocab
+ # Otherwise, use vocab from evaluation context
+ log_debug(item) {"gentrips(8): vocab: #{vocab.inspect}"}
# 9. For each element _element_ that has one or more property names and is one of the properties of the item _item_, run the following substep:
props = item_properties(item)
# 9.1. For each name name in element's property names, run the following substeps:
props.each do |element|
element.attribute('itemprop').to_s.split(' ').compact.each do |name|
- log_debug(item) {"gentrips(9.1): name=#{name.inspect}, type=#{type}"}
- # 9.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab.
- ec_new = ec.merge({current_type: type, current_vocabulary: vocab})
-
+ log_debug(item) {"gentrips(9.1): name=#{name.inspect}, vocab=#{vocab.inspect}"}
# 9.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate.
- predicate = vocab.predicateURI(name, ec_new)
+ predicate = vocab.predicateURI(name, base_uri)
# 9.1.3) Let value be the property value of element.
value = property_value(element)
log_debug(item) {"gentrips(9.1.3) value=#{value.inspect}"}
# 9.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps.
if value.is_a?(Hash)
- value = generate_triples(element, ec_new)
+ value = generate_triples(element, vocab)
log_debug(item) {"gentrips(9.1.4): value=#{value.inspect}"}
end
# 9.1.4) Generate the following triple:
add_triple(item, subject, predicate, value)
@@ -382,23 +305,21 @@
props = item_properties(item, true)
# 10.1. For each name name in element's reverse property names, run the following substeps:
props.each do |element|
element.attribute('itemprop-reverse').to_s.split(' ').compact.each do |name|
log_debug(item) {"gentrips(10.1): name=#{name.inspect}"}
- # 10.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab.
- ec_new = ec.merge({current_type: type, current_vocabulary: vocab})
# 10.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate.
- predicate = vocab.predicateURI(name, ec_new)
+ predicate = vocab.predicateURI(name, base_uri)
# 10.1.3) Let value be the property value of element.
value = property_value(element)
log_debug(item) {"gentrips(10.1.3) value=#{value.inspect}"}
# 10.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps.
if value.is_a?(Hash)
- value = generate_triples(element, ec_new)
+ value = generate_triples(element, vocab)
log_debug(item) {"gentrips(10.1.4): value=#{value.inspect}"}
elsif value.is_a?(RDF::Literal)
# 10.1.5) Otherwise, if value is a literal, ignore the value and continue to the next name; it is an error for the value of @itemprop-reverse to be a literal
log_error(element, "Value of @itemprop-reverse may not be a literal: #{value.inspect}")
next
@@ -430,32 +351,32 @@
##
# To crawl the properties of an element root with a list memory, the user agent must run the following steps. These steps either fail or return a list with a count of errors. The count of errors is used as part of the authoring conformance criteria below.
#
# @param [Nokogiri::XML::Element] root
- # @param [Array<Nokokogiri::XML::Element>] memory
+ # @param [Array<Nokogiri::XML::Element>] memo
# @param [Boolean] reverse crawl reverse properties
# @return [Array<Nokogiri::XML::Element>]
# Resultant elements
- def crawl_properties(root, memory, reverse)
- # 1. If root is in memory, then the algorithm fails; abort these steps.
- raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root)
+ def crawl_properties(root, memo, reverse)
+ # 1. If root is in memo, then the algorithm fails; abort these steps.
+ raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memo.include?(root)
# 2. Collect all the elements in the item root; let results be the resulting list of elements, and errors be the resulting count of errors.
results = elements_in_item(root)
log_debug(root) {"crawl_properties reverse=#{reverse.inspect} results=#{results.map {|e| node_path(e)}.inspect}"}
# 3. Remove any elements from results that do not have an @itemprop (@itemprop-reverse) attribute specified.
results = results.select {|e| e.has_attribute?(reverse ? 'itemprop-reverse' : 'itemprop')}
- # 4. Let new memory be a new list consisting of the old list memory with the addition of root.
- raise CrawlFailure, "itemref recursion" if memory.detect {|n| root.node.object_id == n.node.object_id}
- new_memory = memory + [root]
+ # 4. Let new memo be a new list consisting of the old list memo with the addition of root.
+ raise CrawlFailure, "itemref recursion" if memo.detect {|n| root.node.object_id == n.node.object_id}
+ new_memo = memo + [root]
- # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memory as the memory.
+ # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memo as the memo.
results.select {|e| e.has_attribute?('itemscope')}.each do |element|
- log_depth {crawl_properties(element, new_memory, reverse)}
+ log_depth {crawl_properties(element, new_memo, reverse)}
end
results
end
@@ -467,11 +388,11 @@
# Resultant elements and error count
# @raise [CrawlFailure] on element recursion
def elements_in_item(root)
# Let results and pending be empty lists of elements.
# Let errors be zero.
- results, memory, errors = [], [], 0
+ results, memo, errors = [], [], 0
# Add all the children elements of root to pending.
pending = root.elements
# If root has an itemref attribute, split the value of that itemref attribute on spaces.
@@ -485,16 +406,16 @@
end
log_debug(root) {"elements_in_item pending #{pending.inspect}"}
# Loop: Remove an element from pending and let current be that element.
while current = pending.shift
- if memory.include?(current)
+ if memo.include?(current)
raise CrawlFailure, "elements_in_item: results already includes #{current.inspect}"
elsif !current.has_attribute?('itemscope')
# If current is not already in results and current does not have an itemscope attribute, then: add all the child elements of current to pending.
pending += current.elements
end
- memory << current
+ memo << current
# If current is not already in results, then: add current to results.
results << current unless results.include?(current)
end
\ No newline at end of file