require 'java'
require 'saxon/jaxp'
require 'uri'
require 'open-uri'
require 'pathname'
module Saxon
# Provides a wrapper around the JAXP StreamSource class Saxon uses to bring
# the XML bytestream in. Provides some extra methods to make handling closing
# the source and its inputstream after consumption more idiomatic
class Source
# Helper methods for getting Java-useful representations of source document
# strings and files
module Helpers
# Given a File, or IO object which will return either #path or
# #base_uri, return the #base_uri, if present, or the #path, if present, or
# nil
# @param [File, IO] io A File or IO
# object representing the input XML file or data, or a String containing
# the XML
# @return [String, nil] the path or URI from the IO (or nil if there is none)
def self.base_uri(io)
  # Prefer an explicit base URI, but only when the IO actually has one:
  # calling #to_s on a nil base_uri would yield "" instead of falling
  # through to the path (or nil), which is what the contract promises.
  if io.respond_to?(:base_uri)
    uri = io.base_uri
    return uri.to_s unless uri.nil?
  end
  io.path if io.respond_to?(:path)
end
# Given a File or IO return a Java InputStream, or an InputStreamReader if
# the Encoding is explicitly specified (rather than inferred from the
# XML declaration in the source).
# @param io [File, IO, org.jruby.util.IOInputStream]
# input to be converted to an input stream
# @param encoding [Encoding, String] the character encoding to be used
# for the stream, overriding the XML parser.
# @return [java.io.InputStream, java.io.InputStreamReader] the wrapped input
def self.inputstream(io, encoding = nil)
# NOTE(review): this method looks truncated in this view -- the `case` has a
# dangling `when` list with no branch body, there is no `else`/`end`, and the
# final line appears to be two statements fused together. Confirm against the
# complete file before changing any logic here.
stream = case io
when org.jruby.util.IOInputStream,
# Anything that responds to #read is converted with #to_inputstream
io.to_inputstream if io.respond_to?(:read)
# With no explicit encoding the stream itself is returned unchanged
return stream if encoding.nil?, ruby_encoding_to_charset(encoding))
# Given a path return a Java File object
# @param path [String, Pathname] the path to the file
# @return [java.io.File] the Java File object
def self.file(path)
# NOTE(review): the body and closing `end` of this method are not visible in
# this view; per its doc comment it returns a Java File object for +path+.
# Given a file path and encoding, return a Java InputStreamReader object
# for the file.
# @param path [String, Pathname] the path to the file
# @param encoding [String, Encoding] the file's character encoding
# @return [java.io.InputStreamReader] a Java InputStreamReader object
# wrapping a FileInputStream for the file
# NOTE(review): the line below appears to be the method signature fused with
# the tail of its body (a call taking ruby_encoding_to_charset(encoding) as
# its last argument); the start of that call is not visible in this view.
def self.file_reader(path, encoding), ruby_encoding_to_charset(encoding))
# Return a File or Reader object for a file, depending on whether the
# encoding must be explicitly specified or not.
# @param path [String, Pathname] the path to the file
# @param encoding [String, Encoding] the file's character encoding
# @return [java.io.File, java.io.Reader] a Java File or Reader object
def self.file_or_reader(path, encoding = nil)
  # No explicit encoding: hand back the plain file object so the encoding is
  # not forced by a Reader.
  return file(path) if encoding.nil?

  file_reader(path, encoding)
end
# Return a Reader object for the String with an explicitly set encoding.
# If the encoding is +ASCII_8BIT+ then a binary-mode InputStream is
# returned, rather than a character Reader
# @param string [String] the string
# @param encoding [String, Encoding] the string's character encoding
# @return [java.io.InputStream, java.io.Reader] a Java InputStream or Reader object
def self.string_reader(string, encoding)
# NOTE(review): this method looks truncated in this view -- the right-hand
# side of the `inputstream =` assignment is missing and the last line appears
# to be two statements fused together. Confirm against the complete file.
inputstream =
# Normalise a String name or Encoding instance to an Encoding (or nil)
encoding = ruby_encoding(encoding)
# ASCII-8BIT input is returned as the raw stream rather than wrapped
return inputstream if encoding == ::Encoding::ASCII_8BIT, ruby_encoding_to_charset(encoding))
# Figure out the equivalent Java +Charset+ for a Ruby {Encoding}.
# @param encoding [String, Encoding] the encoding to find a +Charset+ for
def self.ruby_encoding_to_charset(encoding)
# NOTE(review): the body and closing `end` of this method are not visible in
# this view; per its doc comment it maps a Ruby Encoding (or encoding name)
# to the equivalent Java Charset.
# Given a String with an {Encoding} name or an {Encoding} instance, return
# an {Encoding} instance
# @param encoding [String, Encoding] the encoding or encoding name
# @return [Encoding] the encoding
def self.ruby_encoding(encoding)
  # nil means "no explicit encoding": pass it through untouched instead of
  # handing it to Encoding.find, which does not accept nil.
  return nil if encoding.nil?

  ::Encoding.find(encoding)
end
# Lambda that checks if the given path exists and is a file
# NOTE(review): the lambda body and closing brace are not visible in this
# view. The lambda is used as a `when` condition in .create, where `case`
# invokes it with the input via Proc#===.
PathChecker = ->(path) {
# Lambda that checks if the given string is a valid URI
# NOTE(review): only the opening line and the `rescue` clause of this lambda
# are visible in this view; the guarded expression, both result values and
# the closing brace are missing. Confirm against the complete file.
URIChecker = ->(uri) {
# An unparseable URI is signalled by URI::InvalidURIError
rescue URI::InvalidURIError
class << self
# Generate a Saxon::Source given an IO-like
# @param [IO, File] io The IO-like containing XML to be parsed
# @param [Hash] opts
# @option opts [String] :base_uri The Base URI for the Source - an
# absolute URI or relative path that will be used to resolve relative
# URLs in the XML. Setting this will override any path or URI derived
# from the IO-like.
# @option opts [String, Encoding] :encoding The encoding of the source.
# Note that specifying this will force the parser to ignore the charset
# if it's set in the XML declaration of the source. Only really useful
# if there's a discrepancy between the source's declared and actual
# encoding. Defaults to the encoding named in the XML declaration of the
# source.
# @return [Saxon::Source] the Saxon::Source wrapping the input
def from_io(io, opts = {})
  # An explicit :base_uri wins; the block only runs when the option is
  # absent, deriving the value from the IO itself.
  base_uri = opts.fetch(:base_uri) { Helpers.base_uri(io) }
  inputstream = Helpers.inputstream(io, opts[:encoding])
  from_inputstream_or_reader(inputstream, base_uri)
end
# Generate a Saxon::Source given a path to a file
# @param [String, Pathname] path The path to the XML file to be parsed
# @param [Hash] opts
# @option opts [String] :base_uri The Base URI for the Source - an
# absolute URI or relative path that will be used to resolve relative
# URLs in the XML. Setting this will override the file path.
# @option opts [String, Encoding] :encoding The encoding of the source.
# Note that specifying this will force the parser to ignore the charset
# if it's set in the XML declaration of the source. Only really useful
# if there's a discrepancy between the source's declared and actual
# encoding. Defaults to the encoding named in the XML declaration of the
# source.
# @return [Saxon::Source] the Saxon::Source wrapping the input
def from_path(path, opts = {})
  encoding = opts[:encoding]
  if encoding.nil?
    # No forced encoding: pass the file object itself along, with only an
    # explicitly supplied :base_uri (if any).
    return from_inputstream_or_reader(Helpers.file(path), opts[:base_uri])
  end

  reader = Helpers.file_reader(path, encoding)
  # No :base_uri given: default to the file's absolute path.
  base_uri = opts.fetch(:base_uri) { File.expand_path(path) }
  from_inputstream_or_reader(reader, base_uri)
end
# Generate a Saxon::Source given a URI
# @param [String, URI] uri The URI to the XML file to be parsed
# @param [Hash] opts
# @option opts [String] :base_uri The Base URI for the Source - an
# absolute URI or relative path that will be used to resolve relative
# URLs in the XML. Setting this will override the given URI.
# @option opts [String, Encoding] :encoding The encoding of the source.
# Note that specifying this will force the parser to ignore the charset
# if it's set in the XML declaration of the source. Only really useful
# if there's a discrepancy between the source's declared and actual
# encoding. Defaults to the encoding named in the XML declaration of the
# source.
# @return [Saxon::Source] the Saxon::Source wrapping the input
def from_uri(uri, opts = {})
  # Without a forced encoding the URI string itself is passed on, so no
  # stream is opened here.
  return from_inputstream_or_reader(uri.to_s, opts[:base_uri]) unless opts[:encoding]

  # URI.open (from open-uri) rather than Kernel#open: opening URIs through
  # Kernel#open is deprecated since Ruby 2.7 and removed in 3.0.
  # The full opts hash is forwarded so that a caller-supplied :base_uri is
  # honoured here too, instead of being silently dropped.
  from_io(URI.open(uri), opts)
end
# Generate a Saxon::Source given a string containing XML
# @param [String] string The string containing XML to be parsed
# @param [Hash] opts
# @option opts [String] :base_uri The Base URI for the Source - an
# absolute URI or relative path that will be used to resolve relative
# URLs in the XML. This will be nil unless set.
# @option opts [String, Encoding] :encoding The encoding of the source.
# Note that specifying this will force the parser to ignore the charset
# if it's set in the XML declaration of the source. Only really useful
# if there's a discrepancy between the encoding of the string and the
# encoding of the source. Defaults to the encoding of the string, unless
# that is ASCII-8BIT, in which case the parser will use the XML
# declaration in the source to pick the encoding.
# @return [Saxon::Source] the Saxon::Source wrapping the input
def from_string(string, opts = {})
  # Unless overridden, use the encoding Ruby has tagged the string with.
  encoding = opts.fetch(:encoding) { string.encoding }
  reader = Helpers.string_reader(string, encoding)
  from_inputstream_or_reader(reader, opts[:base_uri])
end
# Generate a Saxon::Source from one of the several inputs allowed.
# If possible the character encoding of the input source will be left to
# the XML parser to discover (from the XML
# declaration).
# The Base URI for the source (its absolute path, or URI) can be set by
# passing in the +:base_uri+ option. This is the same thing as an XML
# document's 'System ID' - Base URI is the term most widely used in Ruby
# libraries for this, so that's what's used here.
# If the source's character encoding can't be correctly discovered by the
# parser from the XML declaration (at the top of the document), then it can
# be passed as the +:encoding+ option.
# If an existing {Source} is passed in, simply return it.
# @param [Saxon::Source, IO, File, String, Pathname, URI] input The XML to be parsed
# @param [Hash] opts
# @option opts [String] :base_uri The Base URI for the Source - an
# absolute URI or relative path that will be used to resolve relative
# URLs in the XML. Setting this will override any path or URI derived
# from an IO, URI, or Path.
# @option opts [String, Encoding] :encoding The encoding of the source.
# Note that specifying this will force the parser to ignore the charset
# if it's set in the XML declaration of the source. Only really useful
# if there's a discrepancy between the source's declared and actual
# encoding. Defaults to the encoding named in the XML declaration of the
# source.
# @return [Saxon::Source] the Saxon::Source wrapping the input
def create(input, opts = {})
# NOTE(review): this dispatch looks truncated in this view -- the
# `when Saxon::Source` branch has no body, the IO branch has an empty entry
# in its `when` list, and there is no `else`/`end` before the final
# from_string call. Confirm against the complete file before editing.
case input
when Saxon::Source
when IO, File,, StringIO
from_io(input, opts)
# PathChecker is a lambda, so `case` calls it with input (Proc#===)
when Pathname, PathChecker
from_path(input, opts)
# URIChecker is likewise a lambda invoked with input
when URIChecker
from_uri(input, opts)
from_string(input, opts)
def from_inputstream_or_reader(inputstream_or_reader, base_uri = nil)
# NOTE(review): the right-hand side of the assignment below is missing in
# this view; presumably a JAXP StreamSource is built from
# +inputstream_or_reader+ -- confirm against the complete file.
stream_source =
stream_source.setSystemId(base_uri) if base_uri
# The raw input is handed to the new Source too (stored as its inputstream)
new(stream_source, inputstream_or_reader)
# Readers for the two wrapped objects, made private so they are only
# reachable from inside the Source itself
attr_reader :stream_source, :inputstream
private :stream_source, :inputstream
# @api private
# @param [javax.xml.transform.stream.StreamSource] stream_source The Java JAXP StreamSource
# @param [java.io.InputStream, java.io.StringReader] inputstream The Java InputStream or StringReader
def initialize(stream_source, inputstream = nil)
  @stream_source, @inputstream = stream_source, inputstream
  # A freshly built Source always starts out open.
  @closed = false
end
# @return [String] The base URI of the Source
def base_uri
# NOTE(review): the body and closing `end` of this method are not visible in
# this view; per its doc comment it returns the Source's base URI.
# @param [String, URI] uri The URI to use as the Source's Base URI
# @return [String] The new base URI of the Source
def base_uri=(uri)
# NOTE(review): the body and closing `end` of this method are not visible in
# this view; per its doc comment it sets the Source's base URI from +uri+.
# Close the Source and its associated InputStream or Reader, allowing those
# resources to be freed.
# @return [TrueClass] Returns true
def close
# NOTE(review): only the flag update is visible in this view; the code that
# closes the underlying stream/reader and returns true (as the doc comment
# describes) is not shown, nor is the closing `end`.
@closed = true
# @return [Boolean] Returns true if the source is closed, false otherwise
def closed?
# NOTE(review): the body and closing `end` of this method are not visible in
# this view; presumably it reports the @closed flag -- confirm.
# Yields itself and then closes itself. To be used by DocumentBuilders or
# other consumers, making it easy to ensure the source is closed after it
# has been consumed.
# @raise [Saxon::SourceClosedError] if the Source has already been closed
# @yield [source] Yields self to the block
def consume(&block)
# A Source may only be consumed once: refuse if it has already been closed
raise SourceClosedError if closed?
# NOTE(review): the yield-then-close part described in the doc comment, and
# the closing `end`, are not visible in this view.
# @return [javax.xml.transform.stream.StreamSource] The underlying JAXP StreamSource
def to_java
# NOTE(review): the body and closing `end` of this method are not visible in
# this view; per its doc comment it returns the wrapped JAXP StreamSource.
# Error raised when trying to consume an already-consumed, and closed, Source
# Derives from StandardError rather than Exception so that an ordinary
# `rescue` / `rescue => e` in calling code catches it; handlers written as
# `rescue Exception` or `rescue SourceClosedError` keep working unchanged.
class SourceClosedError < StandardError; end