lib/rdf/microdata/reader.rb in rdf-microdata-2.2.1 vs lib/rdf/microdata/reader.rb in rdf-microdata-2.2.2

- old
+ new

@@ -13,19 +13,20 @@ class Reader < RDF::Reader format Format include Expansion include RDF::Util::Logger URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video) - DEFAULT_REGISTRY = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "registry.json")) # @private class CrawlFailure < StandardError; end - # @!attribute [r] implementation # @return [Module] Returns the HTML implementation module for this reader instance. attr_reader :implementation + # @return [Hash{Object => RDF::Resource}] maps RDF elements (items) to resources + attr_reader :memory + ## # Returns the base URI determined by this reader. # # @example # reader.prefixes[:dc] #=> RDF::URI('http://purl.org/dc/terms/') @@ -34,113 +35,50 @@ # @since 0.3.0 def base_uri @options[:base_uri] end - # Interface to registry - class Registry - # @return [RDF::URI] Prefix of vocabulary - attr_reader :uri + ## + # Reader options + # @see http://www.rubydoc.info/github/ruby-rdf/rdf/RDF/Reader#options-class_method + def self.options + super + [ + RDF::CLI::Option.new( + symbol: :rdfa, + datatype: TrueClass, + on: ["--rdfa"], + description: "Transform and parse as RDFa.") {true}, + ] + end - # @return [Hash] properties - attr_reader :properties - - ## - # Initialize the registry from a URI or file path - # - # @param [String] registry_uri - def self.load_registry(registry_uri) - return if @registry_uri == registry_uri - - json = RDF::Util::File.open_file(registry_uri) { |f| JSON.load(f) } - - @prefixes = {} - json.each do |prefix, elements| - next unless elements.is_a?(Hash) - properties = elements.fetch("properties", {}) - @prefixes[prefix] = Registry.new(prefix, properties) + ## + # Redirect for RDFa Reader given `:rdfa` option + # + # @private + def self.new(input = nil, options = {}, &block) + klass = if options[:rdfa] + # Requires rdf-rdfa gem to be loaded + begin + require 'rdf/rdfa' + rescue LoadError + raise ReaderError, "Use of RDFa-based reader requires rdf-rdfa gem" end - @registry_uri = registry_uri - end - - ## - # Initialize registry for a particular prefix URI - # - # @param [RDF::URI] prefixURI - # @param [Hash] properties ({}) - def initialize(prefixURI, properties = {}) - @uri = prefixURI - @properties = properties - @property_base = prefixURI.to_s - # Append a '#' for fragment if necessary - @property_base += '#' unless %w(/ #).include?(@property_base[-1,1]) - end - - ## - # Find a registry entry given a type URI - # - # @param [RDF::URI] type - # @return [Registry] - def self.find(type) - @prefixes ||= {} - k = @prefixes.keys.detect {|key| type.to_s.index(key) == 0 } - @prefixes[k] if k - end - - ## - # Generate a predicateURI given a `name` - # - # @param [#to_s] name - # @param [Hash{}] ec Evaluation Context - # @return [RDF::URI] - def predicateURI(name, ec) - u = RDF::URI(name) - # 1) If _name_ is an _absolute URL_, return _name_ as a _URI reference_ - return u if u.absolute? - - n = frag_escape(name) - if ec[:current_type].nil? - # 2) If current type from context is null, there can be no current vocabulary. - # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name - u = RDF::URI(ec[:document_base].to_s) - u.fragment = frag_escape(name) - u - else - # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/). - RDF::URI(@property_base + n) + RdfaReader + elsif options[:jsonld] + # Requires rdf-rdfa gem to be loaded + begin + require 'json/ld' + rescue LoadError + raise ReaderError, "Use of JSON-LD-based reader requires json-ld gem" end + JsonLdReader + else + self end - - ## - # Yield a equivalentProperty or subPropertyOf if appropriate - # - # @param [RDF::URI] predicateURI - # @yield equiv - # @yieldparam [RDF::URI] equiv - def expand(predicateURI) - tok = tokenize(predicateURI) - if @properties[tok].is_a?(Hash) - value = @properties[tok].fetch("subPropertyOf", nil) - value ||= @properties[tok].fetch("equivalentProperty", nil) - - Array(value).each {|equiv| yield RDF::URI(equiv)} - end - end - - ## - # Turn a predicateURI into a simple token - # @param [RDF::URI] predicateURI - # @return [String] - def tokenize(predicateURI) - predicateURI.to_s.sub(@property_base, '') - end - - ## - # Fragment escape a name - def frag_escape(name) - name.to_s.gsub(/["#%<>\[\\\]^{|}]/) {|c| '%' + c.unpack('H2' * c.bytesize).join('%').upcase} - end + reader = klass.allocate + reader.send(:initialize, input, options, &block) + reader end ## # Initializes the Microdata reader instance. # @@ -176,16 +114,16 @@ initialize_html(input, options) rescue log_fatal($!.message, exception: RDF::ReaderError) log_error("Empty document") if root.nil? log_error(doc_errors.map(&:message).uniq.join("\n")) if !doc_errors.empty? - log_debug(@doc, "library = #{@library}") + log_debug('', "library = #{@library}") # Load registry begin - registry_uri = options[:registry] || DEFAULT_REGISTRY - log_debug(@doc, "registry = #{registry_uri.inspect}") + registry_uri = options[:registry] || RDF::Microdata::DEFAULT_REGISTRY + log_debug('', "registry = #{registry_uri.inspect}") Registry.load_registry(registry_uri) rescue JSON::ParserError => e log_fatal("Failed to parse registry: #{e.message}", exception: RDF::ReaderError) if (root.nil? && validate?) end @@ -268,103 +206,88 @@ end # Parsing a Microdata document (this is *not* the recursive method) def parse_whole_document(doc, base) base = doc_base(base) + @memory = {} options[:base_uri] = if (base) # Strip any fragment from base base = base.to_s.split('#').first base = uri(base) else base = RDF::URI("") end log_info(nil) {"parse_whole_doc: base='#{base}'"} - ec = { - memory: {}, - current_type: nil, - current_vocabulary: nil, - document_base: base, - } # 1) For each element that is also a top-level item, Generate the triples for that item using the evaluation context. getItems.each do |el| - log_depth {generate_triples(el, ec)} + log_depth {generate_triples(el, Registry.new(nil))} end log_info(doc, "parse_whole_doc: traversal complete") end ## # Generate triples for an item # # @param [RDF::Resource] item - # @param [Hash{Symbol => Object}] ec + # @param [Registry] vocab # @option ec [Hash{Nokogiri::XML::Element} => RDF::Resource] memory - # @option ec [RDF::Resource] :current_type + # @option ec [RDF::Resource] :current_vocabulary # @return [RDF::Resource] - def generate_triples(item, ec = {}) - memory = ec[:memory] + def generate_triples(item, vocab) # 1) If there is an entry for item in memory, then let subject be the subject of that entry. Otherwise, if item has a global identifier and that global identifier is an absolute URL, let subject be that global identifier. Otherwise, let subject be a new blank node. subject = if memory.include?(item.node) memory[item.node][:subject] elsif item.has_attribute?('itemid') uri(item.attribute('itemid'), item.base || base_uri) end || RDF::Node.new memory[item.node] ||= {} - log_debug(item) {"gentrips(2): subject=#{subject.inspect}, current_type: #{ec[:current_type]}"} + log_debug(item) {"gentrips(2): subject=#{subject.inspect}, vocab: #{vocab.inspect}"} # 2) Add a mapping from item to subject in memory, if there isn't one already. memory[item.node][:subject] ||= subject # 3) For each type returned from element.itemType of the element defining the item. + # 4) Set vocab to the first value returned from element.itemType of the element defining the item. type = nil item.attribute('itemtype').to_s.split(' ').map{|n| uri(n)}.select(&:absolute?).each do |t| # 3.1. If type is an absolute URL, generate the following triple: type ||= t add_triple(item, subject, RDF.type, t) end - # 4) Set type to the first value returned from element.itemType of the element defining the item. - - # 5) Otherwise, set type to current type from the Evaluation Context if not empty. - type ||= ec[:current_type] - log_debug(item) {"gentrips(5): type=#{type.inspect}"} - - # 6) If the registry contains a URI prefix that is a character for character match of type up to the length of the URI prefix, set vocab as that URI prefix. - vocab = Registry.find(type) - - # 7) Otherwise, if type is not empty, construct vocab by removing everything following the last SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from the path component of type. - vocab ||= begin - type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') - log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"} - Registry.new(type_vocab) + # 6) If the registry contains a URI prefix that is a character for character match of vocab up to the length of the URI prefix, set vocab as that URI prefix. + if type || vocab.nil? + vocab = Registry.find(type) || begin + type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1') unless type.nil? + log_debug(item) {"gentrips(7): type_vocab=#{type_vocab.inspect}"} + Registry.new(type_vocab) + end end - # 8) Update evaluation context setting current vocabulary to vocab. - ec[:current_vocabulary] = vocab + # Otherwise, use vocab from evaluation context + log_debug(item) {"gentrips(8): vocab: #{vocab.inspect}"} # 9. For each element _element_ that has one or more property names and is one of the properties of the item _item_, run the following substep: props = item_properties(item) # 9.1. For each name name in element's property names, run the following substeps: props.each do |element| element.attribute('itemprop').to_s.split(' ').compact.each do |name| - log_debug(item) {"gentrips(9.1): name=#{name.inspect}, type=#{type}"} - # 9.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab. - ec_new = ec.merge({current_type: type, current_vocabulary: vocab}) - + log_debug(item) {"gentrips(9.1): name=#{name.inspect}, vocab=#{vocab.inspect}"} # 9.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate. - predicate = vocab.predicateURI(name, ec_new) + predicate = vocab.predicateURI(name, base_uri) # 9.1.3) Let value be the property value of element. value = property_value(element) log_debug(item) {"gentrips(9.1.3) value=#{value.inspect}"} # 9.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps. if value.is_a?(Hash) - value = generate_triples(element, ec_new) + value = generate_triples(element, vocab) log_debug(item) {"gentrips(9.1.4): value=#{value.inspect}"} end # 9.1.4) Generate the following triple: add_triple(item, subject, predicate, value) @@ -382,23 +305,21 @@ props = item_properties(item, true) # 10.1. For each name name in element's reverse property names, run the following substeps: props.each do |element| element.attribute('itemprop-reverse').to_s.split(' ').compact.each do |name| log_debug(item) {"gentrips(10.1): name=#{name.inspect}"} - # 10.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab. - ec_new = ec.merge({current_type: type, current_vocabulary: vocab}) # 10.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate. - predicate = vocab.predicateURI(name, ec_new) + predicate = vocab.predicateURI(name, base_uri) # 10.1.3) Let value be the property value of element. value = property_value(element) log_debug(item) {"gentrips(10.1.3) value=#{value.inspect}"} # 10.1.4) If value is an item, then generate the triples for value context. Replace value by the subject returned from those steps. if value.is_a?(Hash) - value = generate_triples(element, ec_new) + value = generate_triples(element, vocab) log_debug(item) {"gentrips(10.1.4): value=#{value.inspect}"} elsif value.is_a?(RDF::Literal) # 10.1.5) Otherwise, if value is a literal, ignore the value and continue to the next name; it is an error for the value of @itemprop-reverse to be a literal log_error(element, "Value of @itemprop-reverse may not be a literal: #{value.inspect}") next @@ -430,32 +351,32 @@ ## # To crawl the properties of an element root with a list memory, the user agent must run the following steps. These steps either fail or return a list with a count of errors. The count of errors is used as part of the authoring conformance criteria below. # # @param [Nokogiri::XML::Element] root - # @param [Array<Nokokogiri::XML::Element>] memory + # @param [Array<Nokokogiri::XML::Element>] memo # @param [Boolean] reverse crawl reverse properties # @return [Array<Nokogiri::XML::Element>] # Resultant elements - def crawl_properties(root, memory, reverse) - # 1. If root is in memory, then the algorithm fails; abort these steps. - raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root) + def crawl_properties(root, memo, reverse) + # 1. If root is in memo, then the algorithm fails; abort these steps. + raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memo.include?(root) # 2. Collect all the elements in the item root; let results be the resulting list of elements, and errors be the resulting count of errors. results = elements_in_item(root) log_debug(root) {"crawl_properties reverse=#{reverse.inspect} results=#{results.map {|e| node_path(e)}.inspect}"} # 3. Remove any elements from results that do not have an @itemprop (@itemprop-reverse) attribute specified. results = results.select {|e| e.has_attribute?(reverse ? 'itemprop-reverse' : 'itemprop')} - # 4. Let new memory be a new list consisting of the old list memory with the addition of root. - raise CrawlFailure, "itemref recursion" if memory.detect {|n| root.node.object_id == n.node.object_id} - new_memory = memory + [root] + # 4. Let new memo be a new list consisting of the old list memo with the addition of root. + raise CrawlFailure, "itemref recursion" if memo.detect {|n| root.node.object_id == n.node.object_id} + new_memo = memo + [root] - # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memory as the memory. + # 5. For each element in results that has an @itemscope attribute specified, crawl the properties of the element, with new memo as the memo. results.select {|e| e.has_attribute?('itemscope')}.each do |element| - log_depth {crawl_properties(element, new_memory, reverse)} + log_depth {crawl_properties(element, new_memo, reverse)} end results end @@ -467,11 +388,11 @@ # Resultant elements and error count # @raise [CrawlFailure] on element recursion def elements_in_item(root) # Let results and pending be empty lists of elements. # Let errors be zero. - results, memory, errors = [], [], 0 + results, memo, errors = [], [], 0 # Add all the children elements of root to pending. pending = root.elements # If root has an itemref attribute, split the value of that itemref attribute on spaces. @@ -485,16 +406,16 @@ end log_debug(root) {"elements_in_item pending #{pending.inspect}"} # Loop: Remove an element from pending and let current be that element. while current = pending.shift - if memory.include?(current) + if memo.include?(current) raise CrawlFailure, "elements_in_item: results already includes #{current.inspect}" elsif !current.has_attribute?('itemscope') # If current is not already in results and current does not have an itemscope attribute, then: add all the child elements of current to pending. pending += current.elements end - memory << current + memo << current # If current is not already in results, then: add current to results. results << current unless results.include?(current) end \ No newline at end of file