require 'nokogiri'  # FIXME: Implement using different modules as in RDF::TriX

module RDF::Microdata
  ##
  # A Microdata parser in Ruby
  #
  # Based on processing rules described here:
  # @see http://dev.w3.org/html5/md/
  #
  # @author [Gregg Kellogg](http://kellogg-assoc.com/)
  class Reader < RDF::Reader
    format Format
    XHTML = "http://www.w3.org/1999/xhtml"
    URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video)

    # Raised internally when crawling the properties of an item fails
    # (for example because of an itemref/itemscope cycle).
    class CrawlFailure < StandardError  #:nodoc:
    end

    ##
    # Returns the base URI determined by this reader.
    #
    # @attr [RDF::URI]
    attr_reader :base_uri

    ##
    # Initializes the Microdata reader instance.
    #
    # @param  [Nokogiri::HTML::Document, Nokogiri::XML::Document, IO, File, String] input
    #   the input stream to read
    # @param  [Hash{Symbol => Object}] options
    #   any additional options
    # @option options [Encoding] :encoding     (Encoding::UTF_8)
    #   the encoding of the input stream (Ruby 1.9+)
    # @option options [Boolean]  :validate     (false)
    #   whether to validate the parsed statements and values
    # @option options [Boolean]  :canonicalize (false)
    #   whether to canonicalize parsed literals
    # @option options [Boolean]  :intern       (true)
    #   whether to intern all parsed URIs
    # @option options [#to_s]    :base_uri     (nil)
    #   the base URI to use when resolving relative URIs
    # @option options [Boolean]  :rdf_terms    (false)
    #   Generate URIs for itemprop terms based on namespace of itemtype
    # @option options [Array] :debug
    #   Array to place debug messages
    # @return [reader]
    # @yield  [reader] `self`
    # @yieldparam  [RDF::Reader] reader
    # @yieldreturn [void] ignored
    # @raise [RDF::ReaderError] if the document is empty, or if it has syntax
    #   errors and _validate_ is set
    def initialize(input = $stdin, options = {}, &block)
      super do
        @debug = options[:debug]

        # Honour the documented :base_uri option. `||=` keeps any value that
        # may already have been assigned before this block runs.
        # NOTE(review): the superclass is not visible here; confirm it does not
        # already populate @base_uri in a different form.
        unless options[:base_uri].to_s.empty?
          @base_uri ||= RDF::URI(options[:base_uri])
        end

        @doc = case input
        when Nokogiri::HTML::Document, Nokogiri::XML::Document
          input
        else
          # Try to detect charset from input
          options[:encoding] ||= input.charset if input.respond_to?(:charset)

          # Otherwise, default is utf-8
          options[:encoding] ||= 'utf-8'

          Nokogiri::HTML.parse(input, @base_uri.to_s, options[:encoding])
        end

        if @doc.nil? || @doc.root.nil?
          add_error(nil, "Empty document")
          raise RDF::ReaderError, "Empty Document"
        end

        # The HTML parser reports these HTML5 elements as invalid tags; such
        # messages are not treated as syntax errors.
        errors = @doc.errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/}
        # Report only the errors that survived the filter above.
        add_error(nil, "Syntax errors:\n#{errors}") if !errors.empty? && validate?

        block.call(self) if block_given?
      end
    end

    ##
    # Iterates the given block for each RDF statement in the input.
    #
    # @yield  [statement]
    # @yieldparam [RDF::Statement] statement
    # @return [void] an enumerator is returned when no block is given
    def each_statement(&block)
      # Without a block there is nothing to call back into; hand out an enumerator instead.
      return enum_for(:each_statement) unless block_given?

      @callback = block

      # parse
      parse_whole_document(@doc, @base_uri)
    end

    ##
    # Iterates the given block for each RDF triple in the input.
    #
    # @yield  [subject, predicate, object]
    # @yieldparam [RDF::Resource] subject
    # @yieldparam [RDF::URI]      predicate
    # @yieldparam [RDF::Value]    object
    # @return [void] an enumerator is returned when no block is given
    def each_triple(&block)
      return enum_for(:each_triple) unless block_given?

      each_statement do |statement|
        block.call(*statement.to_triple)
      end
    end

    private

    # Keep track of allocated BNodes
    #
    # @param [#to_s] value identifier used both as the cache key and the node id
    # @return [RDF::Node] the cached node for `value`, created on first use
    def bnode(value = nil)
      @bnode_cache ||= {}
      @bnode_cache[value.to_s] ||= RDF::Node.new(value)
    end

    # Figure out the document path, if it is a Nokogiri::XML::Element or Attribute
    #
    # @param [Nokogiri::XML::Node, any] node
    # @return [String] the base URI in angle brackets followed by the node's
    #   display path (or `node.to_s` for anything that is not an XML node)
    def node_path(node)
      "<#{@base_uri}>" + case node
      when Nokogiri::XML::Node then node.display_path
      else node.to_s
      end
    end

    # Add debug event to debug array, if specified
    #
    # @param [XML Node, any] node:: XML Node or string for showing context
    # @param [String] message::
    def add_debug(node, message)
      puts "#{node_path(node)}: #{message}" if ::RDF::Microdata::debug?
      @debug << "#{node_path(node)}: #{message}" if @debug.is_a?(Array)
    end

    # Record an error as a debug event and, in validating mode, abort parsing.
    #
    # @param [XML Node, any] node:: XML Node or string for showing context
    # @param [String] message::
    # @raise [RDF::ReaderError] if _validate_ is set
    def add_error(node, message)
      add_debug(node, message)
      raise RDF::ReaderError, message if validate?
    end

    # add a statement, object can be literal or URI or bnode
    #
    # @param [Nokogiri::XML::Node, any] node:: XML Node or string for showing context
    # @param [URI, BNode] subject:: the subject of the statement
    # @param [URI] predicate:: the predicate of the statement
    # @param [URI, BNode, Literal] object:: the object of the statement
    # @return [Object]:: whatever the statement callback returns
    def add_triple(node, subject, predicate, object)
      statement = RDF::Statement.new(subject, predicate, object)
      add_debug(node, "statement: #{RDF::NTriples.serialize(statement)}")
      @callback.call(statement)
    end

    # Parsing a Microdata document (this is *not* the recursive method)
    #
    # @param [Nokogiri::XML::Document] doc the parsed document
    # @param [RDF::URI, #to_s, nil] base base URI; overridden by the href of a
    #   `<base>` element in the document head when one is present
    # @return [void]
    def parse_whole_document(doc, base)
      base_el = doc.at_css('html>head>base')
      base = base_el.attribute('href').to_s.split('#').first if base_el

      add_debug(doc, "parse_whole_doc: options=#{@options.inspect}")

      if base
        # Strip any fragment from base
        base = base.to_s.split('#').first
        base = @base_uri = uri(base)
        add_debug(base_el, "parse_whole_doc: base='#{base}'")
      else
        base = RDF::URI("")
      end

      ##
      # 1. If the title element is not null, then generate the following triple:
      #
      #   subject: the document's current address
      #   predicate: http://purl.org/dc/terms/title
      #   object: the concatenation of the data of all the child text nodes of the title element,
      #           in tree order, as a plain literal, with the language information set from
      #           the language of the title element, if it is not unknown.
      doc.css('html>head>title').each do |title|
        # Carry the element's language on the literal, as the meta step below does.
        add_triple(title, base, RDF::DC.title,
                   RDF::Literal.new(title.inner_text, :language => title.language))
      end

      # 2. For each a, area, and link element in the Document, run these substeps:
      #
      # * If the element does not have a rel attribute, then skip this element.
      # * If the element does not have an href attribute, then skip this element.
      # * If resolving the element's href attribute relative to the element is not successful,
      #   then skip this element.
      doc.css('a, area, link').each do |el|
        rel, href = el.attribute('rel'), el.attribute('href')
        next unless rel && href
        href = uri(href, el.base || base)
        add_debug(el, "a: rel=#{rel.inspect}, href=#{href}")

        # Otherwise, split the value of the element's rel attribute on spaces, obtaining list of tokens.
        # Coalesce duplicate tokens in list of tokens.
        # (split(' ') ignores leading whitespace, so no empty token is produced.)
        tokens = rel.to_s.split(' ').map do |tok|
          # Convert each token in list of tokens that does not contain a U+003A COLON characters (:)
          # to ASCII lowercase.
          tok =~ /:/ ? tok : tok.downcase
        end.uniq

        # If list of tokens contains both the tokens alternate and stylesheet,
        # then remove them both and replace them with the single (uppercase) token
        # ALTERNATE-STYLESHEET.
        if tokens.include?('alternate') && tokens.include?('stylesheet')
          tokens = tokens - %w(alternate stylesheet)
          tokens << 'ALTERNATE-STYLESHEET'
        end

        tokens.each do |tok|
          tok_uri = RDF::URI(tok)
          if tok !~ /:/
            # For each token token in list of tokens that contains no U+003A COLON characters (:),
            # generate the following triple:
            add_triple(el, base, RDF::XHV[tok.gsub('#', '%23')], href)
          elsif tok_uri.absolute?
            # For each token token in list of tokens that is an absolute URL, generate the following triple:
            add_triple(el, base, tok_uri, href)
          end
        end
      end

      # 3. For each meta element in the Document that has a name attribute and a content attribute,
      doc.css('meta[name][content]').each do |el|
        name, content = el.attribute('name'), el.attribute('content')
        name = name.to_s
        name_uri = uri(name, el.base || base)
        add_debug(el, "meta: name=#{name.inspect}")
        if name !~ /:/
          # If the value of the name attribute contains no U+003A COLON characters (:),
          # generate the following triple:
          add_triple(el, base, RDF::XHV[name.downcase.gsub('#', '%23')], RDF::Literal(content, :language => el.language))
        elsif name_uri.absolute?
          # If the value of the name attribute is an absolute URL,
          # generate the following triple:
          add_triple(el, base, name_uri, RDF::Literal(content, :language => el.language))
        end
      end

      # 4. For each blockquote and q element in the Document that has a cite attribute that resolves
      #    successfully relative to the element, generate the following triple:
      doc.css('blockquote[cite], q[cite]').each do |el|
        object = uri(el.attribute('cite'), el.base || base)
        add_debug(el, "blockquote: cite=#{object}")
        add_triple(el, base, RDF::DC.source, object)
      end

      # 5. Let memory be a mapping of items to subjects, initially empty.
      # 6. For each element that is also a top-level microdata item, run the following steps:
      #    * Generate the triples for the item. Pass a reference to memory as the item/subject list.
      #      Let result be the subject returned.
      #    * Generate the following triple:
      #      subject the document's current address
      #      predicate http://www.w3.org/1999/xhtml/microdata#item
      #      object result
      memory = {}
      doc.css('[itemscope]').
        select {|el| !el.has_attribute?('itemprop')}.
        each do |el|
          object = generate_triples(el, memory)
          add_triple(el, base, RDF::MD.item, object)
        end

      add_debug(doc, "parse_whole_doc: traversal complete")
    end

    ##
    # Generate triples for an item
    # @param [Nokogiri::XML::Element] item
    # @param [Hash{Nokogiri::XML::Element => Hash}] memory
    #   maps each visited item to a hash holding its `:subject`
    # @param [Hash{Symbol => Object}] options
    # @option options [RDF::Resource] :fallback_type
    # @option options [RDF::Resource] :fallback_name
    # @return [RDF::Resource] the subject used for the item
    def generate_triples(item, memory, options = {})
      fallback_type = options[:fallback_type]
      fallback_name = options[:fallback_name]

      # 1. If there is an entry for item in memory, then let subject be the subject of that entry.
      #    Otherwise, if item has a global identifier and that global identifier is an absolute URL,
      #    let subject be that global identifier. Otherwise, let subject be a new blank node.
      # NOTE(review): the itemid is neither resolved against a base nor checked for
      # being absolute here; confirm whether relative itemids should become blank nodes.
      subject = if memory.include?(item)
        memory[item][:subject]
      elsif item.has_attribute?('itemid')
        uri(item.attribute('itemid'))
      end || RDF::Node.new
      memory[item] ||= {}

      add_debug(item, "gentrips(2): subject=#{subject.inspect}")

      # 2. Add a mapping from item to subject in memory, if there isn't one already.
      memory[item][:subject] ||= subject

      # 3. If item has an item type and that item type is an absolute URL, let type be that item type.
      #    Otherwise, let type be the empty string.
      rdf_type = type = uri(item.attribute('itemtype'))
      type = '' unless type.absolute?

      if type != ''
        add_triple(item, subject, RDF.type, type)
        # 4.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type.
        type += '#' unless type.to_s.include?('#')
        # 4.3. If type does not have a : after its #, append a : to type.
        type += ':' unless type.to_s.match(/\#:/)
      elsif !fallback_type.to_s.empty?
        # An empty fallback type (passed down from an untyped parent) must not
        # be expanded: '' is truthy in Ruby, so it is tested for explicitly.
        add_debug(item, "gentrips(5.2): fallback_type=#{fallback_type}, fallback_name=#{fallback_name}")
        rdf_type = type = fallback_type
        # 5.2. If type does not contain a U+0023 NUMBER SIGN character (#), then append a # to type.
        type += '#' unless type.to_s.include?('#')
        # 5.3. If type does not have a : after its #, append a : to type.
        type += ':' unless type.to_s.match(/\#:/)
        # 5.4. If the last character of type is not a :, append %20 to type.
        type += '%20' unless type.to_s[-1,1] == ':'
        # 5.5. Append the fragment-escaped value of fallback name to type.
        type += fallback_name.to_s.gsub('#', '%23')
      end

      add_debug(item, "gentrips(6): type=#{type.inspect}")

      # 6. For each element _element_ that has one or more property names and is one of the
      #    properties of the item _item_, in the order those elements are given by the algorithm
      #    that returns the properties of an item, run the following substep:
      props = item_properties(item)

      # 6.1. For each name name in element's property names, run the following substeps:
      props.each do |element|
        element.attribute('itemprop').to_s.split(' ').each do |name|
          add_debug(element, "gentrips(6.1): name=#{name.inspect}")
          # If type is the empty string and name is not an absolute URL, then abort these substeps.
          name_uri = RDF::URI(name)
          next if type == '' && !name_uri.absolute?

          value = property_value(element)
          add_debug(element, "gentrips(6.1.2) value=#{value.inspect}")

          # property_value returns a Hash to signal that the element is itself an item.
          if value.is_a?(Hash)
            value = generate_triples(element, memory, :fallback_type => type, :fallback_name => name)
          end

          add_debug(element, "gentrips(6.1.3): value=#{value.inspect}")

          predicate = if name_uri.absolute?
            name_uri
          elsif @options[:rdf_terms]
            # Use the URI of the type to create URIs for @itemprop terms
            add_debug(element, "gentrips: rdf_type=#{rdf_type}")
            # Block form so that backslashes in name are inserted literally.
            RDF::URI(rdf_type.to_s.sub(/([\/\#])[^\/\#]*$/) { "#{$1}#{name}" })
          elsif !name.include?(':')
            s = type.to_s
            s += '%20' unless s[-1,1] == ':'
            s += name
            RDF::MD[s.gsub('#', '%23')]
          end
          add_debug(element, "gentrips(6.1.5): predicate=#{predicate}")

          add_triple(element, subject, predicate, value) if predicate
        end
      end

      subject
    end

    ##
    # To find the properties of an item defined by the element root, the user agent must try
    # to crawl the properties of the element root, with an empty list as the value of memory:
    # if this fails, then the properties of the item defined by the element root is an empty
    # list; otherwise, it is the returned list.
    #
    # Note that a non-zero error count from the crawl is also treated as a failure here.
    #
    # @param [Nokogiri::XML::Element] item
    # @return [Array<Nokogiri::XML::Element>]
    #   List of property elements for an item
    # @raise [RDF::ReaderError] on crawl failure if _validate_ is set
    def item_properties(item)
      add_debug(item, "item_properties")
      results, errors = crawl_properties(item, [])
      raise CrawlFailure, "item_props: errors=#{errors}" if errors > 0
      results
    rescue CrawlFailure => e
      # Report against the item being crawled.
      add_error(item, e.message)
      return []
    end

    ##
    # To crawl the properties of an element root with a list memory, the user agent must run
    # the following steps. These steps either fail or return a list with a count of errors.
    # The count of errors is used as part of the authoring conformance criteria below.
    #
    # @param [Nokogiri::XML::Element] root
    # @param [Array<Nokogiri::XML::Element>] memory
    #   elements already being crawled on the current path
    # @return [Array(Array<Nokogiri::XML::Element>, Integer)]
    #   Resultant elements and error count
    # @raise [CrawlFailure] if root is already in memory
    def crawl_properties(root, memory)
      # 1. If root is in memory, then the algorithm fails; abort these steps.
      raise CrawlFailure, "crawl_props mem already has #{root.inspect}" if memory.include?(root)

      # 2. Collect all the elements in the item root; let results be the resulting
      #    list of elements, and errors be the resulting count of errors.
      results, errors = elements_in_item(root)
      add_debug(root, "crawl_properties results=#{results.inspect}, errors=#{errors}")

      # 3. Remove any elements from results that do not have an itemprop attribute specified.
      results = results.select {|e| e.has_attribute?('itemprop')}

      # 4. Let new memory be a new list consisting of the old list memory with the addition of root.
      new_memory = memory + [root]

      # 5. For each element in results that has an itemscope attribute specified,
      #    crawl the properties of the element, with new memory as the memory.
      #    (The select returns a separate array, so reassigning results below is safe.)
      results.select {|e| e.has_attribute?('itemscope')}.each do |element|
        begin
          crawl_properties(element, new_memory)
        rescue CrawlFailure => e
          # If this fails, then remove the element from results and increment errors.
          # (If it succeeds, the return value is discarded.)
          results -= [element]
          add_error(element, e.message)
          errors += 1
        end
      end

      [results, errors]
    end

    ##
    # To collect all the elements in the item root, the user agent must run these steps.
    # They return a list of elements and a count of errors.
    #
    # @param [Nokogiri::XML::Element] root
    # @return [Array(Array<Nokogiri::XML::Element>, Integer)]
    #   Resultant elements and error count
    def elements_in_item(root)
      # Let results and pending be empty lists of elements.
      # Let errors be zero.
      results, errors = [], 0

      # Add all the children elements of root to pending.
      # (pending is whatever collection root.elements returns; it is appended to,
      # shifted from and extended with + below.)
      pending = root.elements

      # If root has an itemref attribute, split the value of that itemref attribute on spaces.
      # For each resulting token ID,
      root.attribute('itemref').to_s.split(' ').each do |id|
        add_debug(root, "elements_in_item itemref id #{id}")
        # if there is an element in the home subtree of root with the ID ID,
        # then add the first such element to pending.
        id_elem = @doc.at_css("##{id}")
        pending << id_elem if id_elem
      end
      add_debug(root, "elements_in_item pending #{pending.inspect}")

      # Loop: Remove an element from pending and let current be that element.
      while current = pending.shift
        if results.include?(current)
          # If current is already in results, increment errors.
          add_error(current, "elements_in_item: results already includes #{current.inspect}")
          errors += 1
        elsif !current.has_attribute?('itemscope')
          # If current is not already in results and current does not have an itemscope attribute,
          # then: add all the child elements of current to pending.
          pending += current.elements
        end

        # If current is not already in results, then: add current to results.
        results << current unless results.include?(current)
      end

      [results, errors]
    end

    ##
    # Determine the value of a property element.
    #
    # @param [Nokogiri::XML::Element] element
    # @return [Hash, String, RDF::URI, RDF::Literal]
    #   an empty Hash when the element is itself an item (has itemscope), the
    #   content attribute as a String for meta, a URI built from the
    #   src/href/data attribute for URL-bearing elements, a DateTime literal
    #   for time elements with a datetime attribute, otherwise a literal of
    #   the element's text with the element's language
    def property_value(element)
      add_debug(element, "property_value(#{element.inspect})")
      case
      when element.has_attribute?('itemscope')
        {}
      when element.name == 'meta'
        element.attribute('content').to_s
      when %w(audio embed iframe img source track video).include?(element.name)
        uri(element.attribute('src'), element.base)
      when %w(a area link).include?(element.name)
        uri(element.attribute('href'), element.base)
      when %w(object).include?(element.name)
        uri(element.attribute('data'), element.base)
      when %w(time).include?(element.name) && element.has_attribute?('datetime')
        RDF::Literal::DateTime.new(element.attribute('datetime'))
      else
        RDF::Literal.new(element.text, :language => element.language)
      end
    end

    # Build a URI from a value, joining it onto base when one is given, then
    # apply the reader's validate / canonicalize / intern settings.
    #
    # Fixme, what about xml:base relative to element?
    #
    # @param [#to_s] value
    # @param [RDF::URI, #to_s, nil] base
    # @return [RDF::URI]
    def uri(value, base = nil)
      value = if base
        base = uri(base) unless base.is_a?(RDF::URI)
        base.join(value)
      else
        RDF::URI(value)
      end

      value.validate! if validate?
      value.canonicalize! if canonicalize?
      value = RDF::URI.intern(value) if intern?
      value
    end
  end
end