lib/rdf/microdata/reader.rb in rdf-microdata-0.2.5 vs lib/rdf/microdata/reader.rb in rdf-microdata-0.3.0
- old
+ new
@@ -11,20 +11,21 @@
##
# A Microdata parser in Ruby
#
# Based on processing rules, amended with the following:
#
- # @see https://dvcs.w3.org/hg/htmldata/raw-file/0d6b89f5befb/microdata-rdf/index.html
+ # @see http://dvcs.w3.org/hg/htmldata/raw-file/0d6b89f5befb/microdata-rdf/index.html
# @author [Gregg Kellogg](http://greggkellogg.net/)
class Reader < RDF::Reader
format Format
+ include Expansion
URL_PROPERTY_ELEMENTS = %w(a area audio embed iframe img link object source track video)
DEFAULT_REGISTRY = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "..", "etc", "registry.json"))
-
- class CrawlFailure < StandardError #:nodoc:
- end
+ # @private
+ class CrawlFailure < StandardError; end
+
# Returns the HTML implementation module for this reader instance.
#
# @attr_reader [Module]
attr_reader :implementation
@@ -40,19 +41,22 @@
@options[:base_uri]
end
# Interface to registry
class Registry
+ # @attr_reader [RDF::URI] uri Prefix of vocabulary
+ attr_reader :uri
+
##
# Initialize the registry from a URI or file path
#
# @param [String] registry_uri
def self.load_registry(registry_uri)
return if @registry_uri == registry_uri
json = RDF::Util::File.open_file(registry_uri) { |f| JSON.load(f) }
-
+
@prefixes = {}
json.each do |prefix, elements|
next unless elements.is_a?(Hash)
propertyURI = elements.fetch("propertyURI", "vocabulary").to_sym
multipleValues = elements.fetch("multipleValues", "unordered").to_sym
@@ -74,10 +78,11 @@
# @param [RDF::URI] prefixURI
# @param [#to_sym] propertyURI (:vocabulary)
# @param [#to_sym] multipleValues (:unordered)
# @param [Hash] properties ({})
def initialize(prefixURI, propertyURI = :vocabulary, multipleValues = :unordered, properties = {})
+ @uri = prefixURI
@scheme = propertyURI.to_sym
@multipleValues = multipleValues.to_sym
@properties = properties
if @scheme == :vocabulary
@property_base = prefixURI.to_s
@@ -110,33 +115,30 @@
return u if u.absolute?
n = frag_escape(name)
if ec[:current_type].nil?
# 2) If current type from context is null, there can be no current vocabulary.
- # Return the URI reference that is the document base with its fragment set to
- # the fragment-escaped value of name
+ # Return the URI reference that is the document base with its fragment set to the fragment-escaped value of name
u = RDF::URI(ec[:document_base].to_s)
u.fragment = frag_escape(name)
u
elsif @scheme == :vocabulary
- # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name
- # to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends
- # with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/).
+ # 4) If scheme is vocabulary return the URI reference constructed by appending the fragment escaped value of name to current vocabulary, separated by a U+0023 NUMBER SIGN character (#) unless the current vocabulary ends with either a U+0023 NUMBER SIGN character (#) or SOLIDUS U+002F (/).
RDF::URI(@property_base + n)
else # @scheme == :contextual
if ec[:current_name].to_s.index(@property_base) == 0
# 5.2) return the concatenation of s, a U+002E FULL STOP character (.) and the fragment-escaped value of name.
RDF::URI(ec[:current_name] + '.' + n)
else
- # 5.3) return the concatenation of http://www.w3.org/ns/md?type=, the fragment-escaped value of current type,
- # the string &prop=, and the fragment-escaped value of name
- RDF::URI(@property_base + frag_escape(ec[:current_type]) + '&prop=' + n)
+ # 5.3) return the concatenation of http://www.w3.org/ns/md?type=, the fragment-escaped value of current type, the string &prop=, and the fragment-escaped value of name
+ RDF::URI(@property_base +
+ frag_escape(ec[:current_type]) +
+ '&prop=' + n)
end
end
end
-
-
+
##
# Turn a predicateURI into a simple token
# @param [RDF::URI] predicateURI
# @return [String]
def tokenize(predicateURI)
@@ -152,18 +154,45 @@
# Determine if property should be serialized as a list or not
# @param [RDF::URI] predicateURI
# @return [Boolean]
def as_list(predicateURI)
# Reduce the predicate to the token that is used as the key into @properties.
tok = tokenize(predicateURI)
# A per-property "multipleValues" setting takes precedence over the registry-wide
# @multipleValues. The added has_key? test lets a property entry that has no such
# setting fall through to the registry-wide value; without it the lookup below
# would call to_sym on the missing (nil) value.
- if @properties[tok].is_a?(Hash)
+ if @properties[tok].is_a?(Hash) &&
+ @properties[tok].has_key?("multipleValues")
@properties[tok]["multipleValues"].to_sym == :list
else
# No per-property override: use the setting given to the constructor.
@multipleValues == :list
end
end
##
+ # Yield an equivalentProperty or subPropertyOf if appropriate
+ # @param [RDF::URI] predicateURI
+ # @yield statement
+ # @yieldparam [RDF::Statement] statement
+ # @return [Boolean]
# Looks up the tokenized predicate in @properties and, when its entry is a Hash,
# yields one statement per configured "equivalentProperty" value; "subPropertyOf"
# is only consulted when no "equivalentProperty" is present (elsif), so the two
# kinds of statement are never emitted for the same predicate.
# NOTE(review): there is no block_given? guard, so calling this without a block
# raises LocalJumpError as soon as a mapping is found.
+ def expand(predicateURI)
+ tok = tokenize(predicateURI)
+ if @properties[tok].is_a?(Hash)
+ if value = @properties[tok]["equivalentProperty"]
# [value].flatten accepts either a single value or an array of values.
+ [value].flatten.each do |v|
+ yield RDF::Statement.new(predicateURI,
+ RDF::OWL.equivalentProperty,
+ RDF::URI(v))
+ end
+ elsif value = @properties[tok]["subPropertyOf"]
+ [value].flatten.each do |v|
+ yield RDF::Statement.new(predicateURI,
+ RDF::RDFS.subPropertyOf,
+ RDF::URI(v))
+ end
+ end
# NOTE(review): this assignment is never read. As the last expression of the
# branch it makes the method return the property Hash (and nil when the entry is
# not a Hash), which does not match the documented Boolean return. The caller
# visible in this diff ignores the return value.
+ value = @properties[tok]
+ end
+ end
+
+ ##
# Fragment escape a name
# Percent-encodes every occurrence of the characters " # % < > [ \ ] ^ { | }
# in the string form of +name+, using upper-case hexadecimal digits; all other
# characters are passed through unchanged.
# @param [#to_s] name
# @return [String]
def frag_escape(name)
  name.to_s.gsub(/["#%<>\[\\\]^{|}]/) do |char|
    # One two-digit hex group per byte of the matched character.
    hex_bytes = char.unpack('H2' * char.bytesize)
    "%#{hex_bytes.join('%').upcase}"
  end
end
end
@@ -186,20 +215,23 @@
# @option options [Boolean] :intern (true)
# whether to intern all parsed URIs
# @option options [#to_s] :base_uri (nil)
# the base URI to use when resolving relative URIs
# @option options [#to_s] :registry_uri (DEFAULT_REGISTRY)
+ # @option options [Boolean] :vocab_expansion (true)
+ # whether to perform OWL2 expansion on the resulting graph
# @option options [Array] :debug
# Array to place debug messages
# @return [reader]
# @yield [reader] `self`
# @yieldparam [RDF::Reader] reader
# @yieldreturn [void] ignored
- # @raise [Error]:: Raises RDF::ReaderError if _validate_
+ # @raise [Error] Raises `RDF::ReaderError` when validating
def initialize(input = $stdin, options = {}, &block)
super do
@debug = options[:debug]
+ @vocab_expansion = options.fetch(:vocab_expansion, true)
@library = case options[:library]
when nil
(defined?(::Nokogiri) && RUBY_PLATFORM != 'java') ? :nokogiri : :rexml
when :nokogiri, :rexml
@@ -221,15 +253,16 @@
raise RDF::ReaderError, "Empty Document"
end
errors = doc_errors.reject {|e| e.to_s =~ /Tag (audio|source|track|video|time) invalid/}
raise RDF::ReaderError, "Syntax errors:\n#{errors}" if !errors.empty? && validate?
- add_debug(@doc, "library = #{@library}")
+ add_debug(@doc, "library = #{@library}, expand = #{@vocab_expansion}")
# Load registry
begin
registry_uri = options[:registry_uri] || DEFAULT_REGISTRY
+ add_debug(@doc, "registry = #{registry_uri}")
Registry.load_registry(registry_uri)
rescue JSON::ParserError => e
raise RDF::ReaderError, "Failed to parse registry: #{e.message}"
end
@@ -243,18 +276,26 @@
end
##
# Iterates the given block for each RDF statement in the input.
#
+ # Reads to graph and performs expansion if required.
+ #
# @yield [statement]
# @yieldparam [RDF::Statement] statement
# @return [void]
def each_statement(&block)
- @callback = block
# With expansion enabled, the statements come from expand rather than directly
# from the parser. The flag is switched off around that call, presumably so that
# a nested each_statement call made while expanding takes the plain parsing
# branch below instead of recursing here (expand is not defined in this excerpt,
# confirm against the Expansion module).
# NOTE(review): the flag is restored by a plain assignment, not in an ensure
# clause, so an exception raised during expansion leaves @vocab_expansion false
# for any later call on this reader.
+ if @vocab_expansion
+ @vocab_expansion = false
+ expand.each_statement(&block)
+ @vocab_expansion = true
+ else
# Plain parsing: remember the block in @callback, then parse the document.
+ @callback = block

- # parse
- parse_whole_document(@doc, base_uri)
+ # parse
+ parse_whole_document(@doc, base_uri)
+ end
end
##
# Iterates the given block for each RDF triple in the input.
#
@@ -394,13 +435,14 @@
# 6) Otherwise, set type to current type from the Evaluation Context if not empty.
type ||= ec[:current_type]
add_debug(item) {"gentrips(6): type=#{type.inspect}"}
- # 7) If the registry contains a URI prefix that is a character for character match of type up to the length of the
- # URI prefix, set vocab as that URI prefix
+ # 7) If the registry contains a URI prefix that is a character for character match of type up to the length of the URI prefix, set vocab as that URI prefix and generate the following triple (unless it has already been generated):
vocab = Registry.find(type)
+ add_debug(item) {"gentrips(7): vocab=#{vocab.inspect}"}
+ add_triple(item, base_uri, USES_VOCAB, RDF::URI(vocab.uri)) if vocab
# 8) Otherwise, if type is not empty, construct vocab by removing everything following the last
# SOLIDUS U+002F ("/") or NUMBER SIGN U+0023 ("#") from the path component of type.
vocab ||= begin
type_vocab = type.to_s.sub(/([\/\#])[^\/\#]*$/, '\1')
@@ -425,10 +467,19 @@
# 11.1.1) Let context be a copy of evaluation context with current type set to type and current vocabulary set to vocab.
ec_new = ec.merge({:current_type => type, :current_vocabulary => vocab})
# 11.1.2) Let predicate be the result of generate predicate URI using context and name. Update context by setting current name to predicate.
predicate = vocab.predicateURI(name, ec_new)
+
+ # (Generate Predicate URI steps 6 and 7)
+ vocab.expand(predicate) do |statement|
+ add_debug(item) {
+ "gentrips(11.1.2): expansion #{statement.inspect}"
+ }
+ @callback.call(statement)
+ end
+
ec_new[:current_name] = predicate
add_debug(item) {"gentrips(11.1.2): predicate=#{predicate}"}
# 11.1.3) Let value be the property value of element.
value = property_value(element)
@@ -457,9 +508,10 @@
def generatePropertyValues(element, subject, predicate, values)
# If the registry contains a URI prefix that is a character for character match of predicate up to the length
# of the URI prefix, set vocab as that URI prefix. Otherwise set vocab to null
registry = Registry.find(predicate)
+ add_debug("generatePropertyValues") { "list(#{predicate})? #{registry.as_list(predicate).inspect}"} if registry
if registry && registry.as_list(predicate)
value = generateRDFCollection(element, values)
add_triple(element, subject, predicate, value)
else
values.each {|v| add_triple(element, subject, predicate, v)}
\ No newline at end of file