require 'iconv'
module RdfaParser
# An RDF Literal, with value, encoding and language elements.
class Literal
class Encoding
attr_reader :value
# Encoding for the XML Schema +int+ datatype, built once through .coerce and
# reused on later calls.
def self.integer
  @integer || (@integer = coerce("http://www.w3.org/2001/XMLSchema#int"))
end
# Encoding for the XML Schema +float+ datatype, built once through .coerce and
# reused on later calls.
def self.float
  @float || (@float = coerce("http://www.w3.org/2001/XMLSchema#float"))
end
# Encoding for the XML Schema +string+ datatype, built once through .coerce and
# reused on later calls.
def self.string
  @string || (@string = coerce("http://www.w3.org/2001/XMLSchema#string"))
end
# Turn a datatype designator into an Encoding.
#
# * nil or the empty string yields the shared null encoding
# * a value whose string form equals the XMLLiteral datatype yields the shared
#   XMLLiteral encoding
# * anything else is wrapped in a new instance of the receiving class
def self.coerce(string_or_nil)
  return the_null_encoding if string_or_nil.nil? || string_or_nil == ''
  return xmlliteral if xmlliteral == string_or_nil.to_s

  new(string_or_nil)
end
# Inspecting an encoding shows the same text as #to_s.
def inspect
  self.to_s
end
# The single shared Null encoding (built with a nil value), used for untyped
# literals. Created on first use.
def self.the_null_encoding
  @the_null_encoding || (@the_null_encoding = Null.new(nil))
end
# The single shared XMLLiteral encoding for the rdf:XMLLiteral datatype.
# Created on first use.
def self.xmlliteral
  @xmlliteral || (@xmlliteral = XMLLiteral.new("http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"))
end
# Build an encoding for the datatype identified by +value+.
# A nil/false +value+ leaves @value unset; anything else is converted to a
# string and wrapped in a URIRef.
def initialize(value)
  return unless value

  @value = URIRef.new(value.to_s)
end
# Whether serialized content is wrapped in double quotes. Always true: all
# non-XML literals are quoted per the W3C RDF Test Cases. (An earlier variant
# that skipped quoting for integers is intentionally disabled.)
def should_quote?
  true
end
# Equality on the datatype URI text.
# A String is compared against this encoding's value as a string; an object of
# this encoding's class is compared by its value's string form; any other
# operand is unequal.
def ==(other)
  own = @value.to_s
  return other == own if other.is_a?(String)
  return other.value.to_s == own if other.is_a?(self.class)

  false
end
def hash
@value.hash
end
# String form of the datatype value ("" when no value is set).
def to_s
  @value.to_s
end
# Serialize a literal as N3, appending the datatype and, if present, the
# language. The content is first converted to a C-style string (see #c_style)
# and wrapped in double quotes when #should_quote? says so.
#
# NOTE(review): the language tag is emitted after the datatype; confirm that
# consumers accept a typed literal carrying a language suffix.
def format_as_n3(content, lang)
  escaped = c_style(content.to_s)
  escaped = "\"#{escaped}\"" if should_quote?
  datatype = "^^<#{value}>"
  language_suffix = lang ? "@#{lang}" : ""
  "#{escaped}#{datatype}#{language_suffix}"
end
# Serialize a typed literal as a TriX +typedLiteral+ element carrying the
# datatype URI and, when +lang+ is given, an +xml:lang+ attribute.
#
# Previously the computed language attribute was discarded and only the bare
# content was returned, so the output was not TriX at all.
#
# NOTE(review): +content+ is inserted as-is (no XML escaping), matching how
# the other serializers in this class treat it.
def format_as_trix(content, lang)
  lang = " xml:lang=\"#{lang}\"" if lang
  "<typedLiteral datatype=\"#{@value}\"#{lang}>#{content}</typedLiteral>"
end
# Arguments for building an XML element for this literal: the content as a
# string plus an attribute hash holding +rdf:datatype+ and, when a language is
# given, +xml:lang+.
def xml_args(content, lang)
  attributes = {"rdf:datatype" => @value.to_s}
  attributes.update("xml:lang" => lang) if lang
  [content.to_s, attributes]
end
# Two literal contents match when they are equal and the caller reports that
# their languages match as well (+same_lang+).
def compare_contents(a, b, same_lang)
  contents_equal = (a == b)
  contents_equal && same_lang
end
# Hook for preparing literal contents before they are stored. The base
# encoding performs no transformation and ignores +options+.
def encode_contents(contents, options)
  contents
end
# A plain datatype encoding is never an XML literal.
def xmlliteral?
  false
end
#private
# "Borrowed" from JSON utf8_to_json
MAP = {
  "\x0" => '\u0000',
  "\x1" => '\u0001',
  "\x2" => '\u0002',
  "\x3" => '\u0003',
  "\x4" => '\u0004',
  "\x5" => '\u0005',
  "\x6" => '\u0006',
  "\x7" => '\u0007',
  "\b" => '\b',
  "\t" => '\t',
  "\n" => '\n',
  "\xb" => '\u000B',
  "\f" => '\f',
  "\r" => '\r',
  "\xe" => '\u000E',
  "\xf" => '\u000F',
  "\x10" => '\u0010',
  "\x11" => '\u0011',
  "\x12" => '\u0012',
  "\x13" => '\u0013',
  "\x14" => '\u0014',
  "\x15" => '\u0015',
  "\x16" => '\u0016',
  "\x17" => '\u0017',
  "\x18" => '\u0018',
  "\x19" => '\u0019',
  "\x1a" => '\u001A',
  "\x1b" => '\u001B',
  "\x1c" => '\u001C',
  "\x1d" => '\u001D',
  "\x1e" => '\u001E',
  "\x1f" => '\u001F',
  '"' => '\"',
  '\\' => '\\\\',
  '/' => '/',
}.freeze # :nodoc:

# Convert a UTF-8 encoded Ruby string _string_ to a C-style string and return
# it: control characters, double quotes and backslashes are escaped through
# MAP, and every non-ASCII character is written as its UTF-16 big endian code
# unit(s) in the form \uXXXX.
#
# The argument is never modified; a new String is returned.
# Raises TypeError when a byte sequence that is not valid UTF-8 is found.
if String.method_defined?(:force_encoding)
  def c_style(string) # :nodoc:
    # Work on a private copy: the in-place substitutions below used to rewrite
    # the caller's string, so serializing the same literal twice escaped twice.
    string = string.to_s.dup
    string << '' # XXX workaround: avoid buffer sharing
    # ::Encoding is Ruby's core class; a bare Encoding would resolve to the
    # enclosing Literal::Encoding class and raise NameError.
    string.force_encoding(::Encoding::ASCII_8BIT)
    string.gsub!(/["\\\/\x0-\x1f]/) { MAP[$&] }
    string.gsub!(/(
      (?:
        [\xc2-\xdf][\x80-\xbf] |
        [\xe0-\xef][\x80-\xbf]{2} |
        [\xf0-\xf4][\x80-\xbf]{3}
      )+ |
      [\x80-\xc1\xf5-\xff] # invalid
    )/nx) { |c|
      c.size == 1 and raise TypeError, "invalid utf8 byte: '#{c}'"
      begin
        # String#encode replaces Iconv, which is no longer in the standard
        # library on Rubies that have String#force_encoding's successors.
        utf16 = c.force_encoding(::Encoding::UTF_8).encode(::Encoding::UTF_16BE)
      rescue ::EncodingError
        raise TypeError, "invalid utf8 sequence: #{c.inspect}"
      end
      # One \uXXXX escape per 16-bit code unit (four hex digits).
      utf16.unpack('H*')[0].upcase.gsub(/.{4}/) { |unit| "\\u#{unit}" }
    }
    string.force_encoding(::Encoding::UTF_8)
    string
  end
else
  # Ruby 1.8 path: strings carry no encoding, conversion goes through Iconv.
  def c_style(string) # :nodoc:
    string = string.gsub(/["\\\/\x0-\x1f]/) { MAP[$&] }
    string.gsub!(/(
      (?:
        [\xc2-\xdf][\x80-\xbf] |
        [\xe0-\xef][\x80-\xbf]{2} |
        [\xf0-\xf4][\x80-\xbf]{3}
      )+ |
      [\x80-\xc1\xf5-\xff] # invalid
    )/nx) { |c|
      c.size == 1 and raise TypeError, "invalid utf8 byte: '#{c}'"
      s = Iconv.new('utf-16be', 'utf-8').iconv(c).unpack('H*')[0].upcase
      s.gsub!(/.{4}/n, '\\\\u\&')
    }
    string
  end
end
end
class Null < Encoding
# The null encoding has no datatype, so its string form is empty.
def to_s
  ''
end
# Serialize an untyped literal as N3: the C-style escaped content in double
# quotes, followed by "@lang" when a language is present. No datatype is
# written.
def format_as_n3(content, lang)
  quoted = "\"#{c_style(content)}\""
  lang ? "#{quoted}@#{lang}" : quoted
end
# Serialize an untyped literal as a TriX +plainLiteral+ element, with an
# +xml:lang+ attribute when a language is present.
#
# Previously both branches returned the bare content, so the language was
# lost and the output was not TriX.
#
# NOTE(review): +content+ is inserted as-is (no XML escaping).
def format_as_trix(content, lang)
  if lang
    "<plainLiteral xml:lang=\"#{lang}\">#{content}</plainLiteral>"
  else
    "<plainLiteral>#{content}</plainLiteral>"
  end
end
# Arguments for building an XML element for an untyped literal: the content
# unchanged plus an attribute hash that holds +xml:lang+ only when a language
# is given.
def xml_args(content, lang)
  attributes = lang ? {"xml:lang" => lang} : {}
  [content, attributes]
end
# The null encoding inspects as an empty string.
def inspect
  ''
end
# An untyped literal is not an XML literal.
def xmlliteral?
  false
end
end
class XMLLiteral < Encoding
# Compare XMLLiterals structurally.
# FIXME: Nokogiri doesn't do a deep compare of elements, so both snippets are
# parsed into hashes with ActiveSupport::XmlMini and the hashes are compared.
#
# Each snippet is wrapped in a synthetic root element first: literal contents
# are fragments (several top-level nodes, or text mixed with elements) and do
# not parse as a document on their own. +same_lang+ is only consulted when
# parsing fails and the comparison falls back to the base implementation.
def compare_contents(a, b, same_lang)
  begin
    a_hash = ActiveSupport::XmlMini.parse("<foo>#{a}</foo>")
    b_hash = ActiveSupport::XmlMini.parse("<foo>#{b}</foo>")
    a_hash == b_hash
  rescue
    super
  end
end
# Serialize an XML literal as N3: the C-style escaped content in double quotes
# followed by the datatype. The language is not written.
def format_as_n3(content, lang)
  escaped = c_style(content)
  "\"#{escaped}\"^^<#{value}>"
end
# Serialize an XML literal as a TriX +typedLiteral+ element carrying the
# datatype URI. The language is not written.
#
# Previously only the bare content was returned, so the output was not TriX.
#
# NOTE(review): +content+ is inserted as-is (no XML escaping).
def format_as_trix(content, lang)
  "<typedLiteral datatype=\"#{@value}\">#{content}</typedLiteral>"
end
# Arguments for building an XML element for an XML literal: the content
# unchanged plus an rdf:parseType="Literal" attribute. The language is ignored.
def xml_args(content, lang)
  [content, {"rdf:parseType" => "Literal"}]
end
# Map namespaces from context to each top-level element found within snippet.
#
# +options[:namespaces]+ is a hash whose values respond to +xmlns_hash+;
# +options[:language]+, when set, is applied as +xml:lang+ to top-level
# elements that do not already carry a language. Returns the re-serialized
# snippet as a String (also kept in @contents).
def encode_contents(contents, options)
  ns_hash = options[:namespaces].values.inject({}) {|h, ns| h.merge(ns.xmlns_hash)}
  ns_strs = ns_hash.map {|a, u| "#{a}=\"#{u}\""}

  # Add inherited namespaces to created root element so that they're inherited to sub-elements.
  # The wrapper is also what lets a fragment (text and/or several elements)
  # parse as a document; without it +root+ is nil for most snippets.
  root = Nokogiri::XML::Document.parse("<foo #{ns_strs.join(" ")}>#{contents}</foo>").root
  @contents = root.children.map do |c|
    if c.is_a?(Nokogiri::XML::Element)
      ns_hash.each_pair { |a, u| c[a] = u unless c.namespaces[a]}
      if options[:language] && c["lang"].to_s.empty?
        c["xml:lang"] = options[:language]
      end
    end
    c.to_html
  end.join("")
end
# This encoding is the XML literal encoding.
def xmlliteral?
  true
end
end
# Language tag of a literal. The tag is stored lower-cased.
class Language
  # The lower-cased language tag.
  attr_accessor :value

  # +string+ is converted with to_s and lower-cased.
  def initialize(string)
    @value = string.to_s.downcase
  end

  # Map the three-letter code "eng" to "en"; any other value is returned
  # unchanged.
  def clean(string)
    case string
    when "eng"; "en"
    else string
    end
  end

  # Equal to a String with the same (already lower-case) text, or to another
  # Language with the same value. Any other operand is unequal; this now
  # returns false rather than nil so the result is always a boolean.
  def ==(other)
    case other
    when String then other == @value
    when self.class then other.value == @value
    else false
    end
  end

  # The language tag itself.
  def to_s; @value; end
end
# contents:: the literal's value as returned by the encoding's encode_contents
# encoding:: the Encoding instance passed to #initialize
# lang::     a Language built from options[:language], or nil when none was given
attr_accessor :contents, :encoding, :lang
# Create a new Literal. Optionally pass a namespaces hash
# for use in applying to rdf::XMLLiteral values.
#
# +encoding+ must be an Encoding instance, otherwise TypeError is raised.
# Recognized options: +:language+ (wrapped in a Language) and +:namespaces+
# (defaults to an empty hash); the options are handed to the encoding's
# encode_contents together with +contents+.
def initialize(contents, encoding, options = {})
  raise TypeError, "#{encoding.inspect} should be an instance of Encoding" unless encoding.is_a?(Encoding)

  @encoding = encoding
  language = options[:language]
  @lang = Language.new(language) if language
  @contents = @encoding.encode_contents(contents, {:namespaces => {}}.merge(options))
end
# Build a literal without a datatype (it uses the shared null encoding),
# optionally tagged with +language+.
def self.untyped(contents, language = nil)
  options = language ? {:language => language} : {}
  new(contents, Encoding.the_null_encoding, options)
end
# Build a literal with a datatype. +encoding+ may be anything accepted by
# Encoding.coerce.
#
# Options include:
# _namespaces_:: A hash of namespace entries (for XMLLiteral)
# _language_:: Language encoding
def self.typed(contents, encoding, options = {})
  new(contents, Encoding.coerce(encoding), options)
end
# Build a literal from a Ruby object: the contents are the object's string
# form and the encoding is chosen by .infer_encoding_for.
def self.build_from(object)
  text = object.to_s
  encoding = infer_encoding_for(object)
  new(text, encoding)
end
# Pick the XML Schema datatype encoding for a Ruby object:
# Integer => int, Float => float, Time => time, DateTime => dateTime,
# Date => date, anything else => string. A new Encoding is returned each time.
#
# DateTime and Date come from the +date+ library, which this file does not
# load itself. They are only consulted when defined, so objects of other
# classes no longer raise NameError when +date+ has not been required.
#
# NOTE(review): Time is mapped to xsd:time although a Ruby Time also carries a
# date; left unchanged pending confirmation of what consumers expect.
def self.infer_encoding_for(object)
  type =
    case object
    when Integer then "int"
    when Float then "float"
    when Time then "time"
    else
      # DateTime is checked before Date because it is a subclass of Date.
      if defined?(::DateTime) && object.is_a?(::DateTime)
        "dateTime"
      elsif defined?(::Date) && object.is_a?(::Date)
        "date"
      else
        "string"
      end
    end
  Encoding.new("http://www.w3.org/2001/XMLSchema##{type}")
end
# Make Literal.new protected on the class itself, so it can no longer be
# called as a public method; the class-level builders above (untyped, typed,
# build_from) call it internally.
class << self
protected :new
end
# Equality for literals.
# A String is compared against the contents only. Another literal of the same
# class must have an equal encoding, and the encoding then decides whether the
# contents match (it is also told whether the languages are equal). Any other
# operand is unequal.
def ==(other)
  return other == contents if other.is_a?(String)
  return false unless other.is_a?(self.class)

  same_encoding = (other.encoding == @encoding)
  same_encoding && @encoding.compare_contents(contents, other.contents, other.lang == @lang)
end
# N3 serialization of the literal, produced by its encoding from the contents
# and language.
def to_n3
  encoding.format_as_n3(contents, @lang)
end
# to_ntriples is another name for to_n3.
alias_method :to_ntriples, :to_n3
# TriX serialization of the literal, produced by its encoding from the
# contents and language.
def to_trix
  serializer = encoding
  serializer.format_as_trix(@contents, @lang)
end
# Content and attribute hash for building an XML element, as computed by the
# literal's encoding from the contents and language.
def xml_args
  builder = encoding
  builder.xml_args(@contents, @lang)
end
# True when the literal's encoding reports itself as the XML literal encoding.
def xmlliteral?
  kind = encoding
  kind.xmlliteral?
end
# Output value: the literal's contents converted to a string.
def to_s
  contents.to_s
end
end
end