lib/rdf/rdfa/reader.rb in rdf-rdfa-0.3.3.1 vs lib/rdf/rdfa/reader.rb in rdf-rdfa-0.3.3.2
- old
+ new
@@ -143,11 +143,11 @@
@incomplete_triples = from.incomplete_triples.clone
@namespaces = from.namespaces.clone
end
# Builds a one-line, comma-separated summary of this object for debug output.
# base, parent_subject, parent_object, language and default_vocabulary are each
# rendered as name='value'; uri_mappings, incomplete_triples and term_mappings
# are reported by size only.
def inspect
# Old form (removed line): a nil attribute was shown as the literal '<nil>'.
- v = %w(base parent_subject parent_object language default_vocabulary).map {|a| "#{a}='#{self.send(a).nil? ? '<nil>' : self.send(a)}'"}
# New form (added line): every attribute value is rendered through its own #inspect.
+ v = %w(base parent_subject parent_object language default_vocabulary).map {|a| "#{a}='#{self.send(a).inspect}'"}
# Append only the element counts of the collections, not their contents.
v << "uri_mappings[#{uri_mappings.keys.length}]"
v << "incomplete_triples[#{incomplete_triples.length}]"
v << "term_mappings[#{term_mappings.keys.length}]"
v.join(", ")
end
@@ -198,15 +198,21 @@
@doc = case input
when Nokogiri::HTML::Document, Nokogiri::XML::Document
input
else
+ # Try to detect charset from input
+ options[:encoding] ||= input.charset if input.respond_to?(:charset)
+
+ # Otherwise, default is utf-8
+ options[:encoding] ||= 'utf-8'
+
case @host_language
when :html4, :html5
- Nokogiri::HTML.parse(input, @base_uri.to_s)
+ Nokogiri::HTML.parse(input, @base_uri.to_s, options[:encoding])
else
- Nokogiri::XML.parse(input, @base_uri.to_s)
+ Nokogiri::XML.parse(input, @base_uri.to_s, options[:encoding])
end
end
if (@doc.nil? || @doc.root.nil?)
add_error(nil, "Empty document", RDF::RDFA.DocumentError)
@@ -280,10 +286,24 @@
doc_type_string = head.match(%r(<!DOCTYPE[^>]*>)m).to_s
root = head.match(%r(<[^!\?>]*>)m).to_s
root_element = root.match(%r(^<(\S+)[ >])) ? $1 : ""
version_attr = root.match(/version\s+=\s+(\S+)[\s">]/m) ? $1 : ""
+ head_element = head.match(%r(<head.*<\/head>)mi)
+ head_doc = Nokogiri::HTML.parse(head_element.to_s)
+
+ # May determine content-type and/or charset from meta
+ # Easiest way is to parse head into a document and iterate
+ # over CSS matches
+ head_doc.css("meta").each do |e|
+ if e.attr("http-equiv").to_s.downcase == 'content-type'
+ content_type, e = e.attr("content").to_s.downcase.split(";")
+ options[:encoding] = $1.downcase if e.to_s =~ /charset=([^\s]*)$/i
+ elsif e.attr("charset")
+ options[:encoding] = e.attr("charset").to_s.downcase
+ end
+ end
end
# Already using XML parser, determine from DOCTYPE and/or root element
@version ||= :"rdfa1.0" if doc_type_string =~ /RDFa 1\.0/
@version ||= :"rdfa1.0" if version_attr =~ /RDFa 1\.0/
@@ -505,23 +525,37 @@
# look for xmlns
# (note, this may be dependent on @host_language)
# Regardless of how the mapping is declared, the value to be mapped must be converted to lower case,
# and the URI is not processed in any way; in particular if it is a relative path it is
# not resolved against the current base.
+ ns_defs = {}
element.namespace_definitions.each do |ns|
+ ns_defs[ns.prefix] = ns.href.to_s
+ end
+
+ # HTML parsing doesn't create namespace_definitions
+ if ns_defs.empty?
+ ns_defs = {}
+ element.attributes.each do |k, v|
+ ns_defs[$1] = v.to_s if k =~ /^xmlns(?:\:(.+))?/
+ end
+ end
+
+ ns_defs.each do |prefix, href|
# A Conforming RDFa Processor must ignore any definition of a mapping for the '_' prefix.
- next if ns.prefix == "_"
+ next if prefix == "_"
# Downcase prefix for RDFa 1.1
- pfx_lc = (@version == :"rdfa1.0" || ns.prefix.nil?) ? ns.prefix : ns.prefix.to_s.downcase
- if ns.prefix
- uri_mappings[pfx_lc.to_sym] = ns.href
- namespaces[pfx_lc] ||= ns.href
- prefix(pfx_lc, ns.href)
- add_info(element, "extract_mappings: xmlns:#{ns.prefix} => <#{ns.href}>")
+ pfx_lc = (@version == :"rdfa1.0" || prefix.nil?) ? prefix : prefix.downcase
+ if prefix
+ uri_mappings[pfx_lc.to_sym] = href
+ namespaces[pfx_lc] ||= href
+ prefix(pfx_lc, href)
+ add_info(element, "extract_mappings: #{prefix} => <#{href}>")
else
- namespaces[""] ||= ns.href
+ add_info(element, "extract_mappings: nil => <#{href}>")
+ namespaces[""] ||= href
end
end
# Set mappings from @prefix
# prefix is a whitespace separated list of prefix-name URI pairs of the form
@@ -670,9 +704,13 @@
# If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
# on an element, user agents must use the lang attribute in the XML namespace, and the lang
# attribute in no namespace must be ignored for the purposes of determining the element's
# language.
language = case
+ when @doc.is_a?(Nokogiri::HTML::Document) && element.attributes["xml:lang"]
+ element.attributes["xml:lang"].to_s
+ when @doc.is_a?(Nokogiri::HTML::Document) && element.attributes["lang"]
+ element.attributes["lang"].to_s
when element.at_xpath("@xml:lang", "xml" => RDF::XML["uri"].to_s)
element.at_xpath("@xml:lang", "xml" => RDF::XML["uri"].to_s).to_s
when element.at_xpath("@lang")
element.at_xpath("@lang").to_s
else
\ No newline at end of file