optional_require 'nokogiri'
require 'open-uri'
optional_require 'zlib'
module ETL
module Parser
class NokogiriXmlParser < ETL::Parser::Parser
  # The two leading bytes (0x1F, 0x8B) that identify a gzip stream.
  GZIP_MAGIC = [0x1F, 0x8B].freeze

  # Initialize the parser
  # * source: The Source object
  # * options: Parser options Hash
  #
  # Raises if the source definition has no :collection XPath or contains a
  # field definition that is neither a Symbol nor a Hash (see #configure).
  def initialize(source, options={})
    super
    configure
  end

  # Yields one row per node matched by the :collection XPath, for every
  # path matched by Dir.glob(file). Each row is a Hash mapping a field's
  # name to the text of the nodes selected by that field's XPath, evaluated
  # relative to the collection node.
  #
  # Files starting with the gzip magic bytes are decompressed before parsing.
  #
  # NOTE(review): `file` is not defined in this class; presumably it is
  # inherited from ETL::Parser::Parser -- confirm.
  def each
    Dir.glob(file).each do |path|
      doc = parse_document(path)
      doc.xpath(@collection_xpath).each do |node|
        row = {}
        fields.each do |field|
          row[field.name] = node.xpath(field.xpath).text
        end
        yield row
      end
    end
  end

  # Get an array of defined fields
  def fields
    @fields ||= []
  end

  private

  # Parses the file at +path+ into a Nokogiri document. The block forms of
  # the open calls close the underlying handle once parsing has finished.
  def parse_document(path)
    if gzip?(path)
      Zlib::GzipReader.open(path) { |gz| Nokogiri::XML(gz) }
    else
      # File.open rather than Kernel#open: the path comes from Dir.glob, so
      # it is always a local file, and Kernel#open would treat a leading
      # "|" as a command to execute.
      File.open(path) { |io| Nokogiri::XML(io) }
    end
  end

  # Returns true when +path+ exists and begins with the gzip magic bytes.
  # The bytes are compared numerically; stringifying the unpacked array
  # only produced "1f8b" on Ruby 1.8, so that approach never matches on
  # later versions.
  def gzip?(path)
    return false unless File.exist?(path)
    header = File.open(path, 'rb') { |f| f.read(2) }
    # read returns nil for an empty file; a 1-byte file unpacks to
    # [byte, nil], which cannot equal GZIP_MAGIC.
    return false if header.nil?
    header.unpack('C2') == GZIP_MAGIC
  end

  # Reads the source definition: stores the :collection XPath and builds
  # the Field list from :fields.
  # * A Symbol entry is used as both the field name and (as a String) its XPath.
  # * A Hash entry uses :name as the field name and :xpath, falling back
  #   to :name, as the XPath.
  #
  # Raises a RuntimeError when :collection is missing and a DefinitionError
  # for any other kind of field entry.
  def configure
    @collection_xpath = source.definition[:collection]
    if @collection_xpath.nil?
      raise ":collection => 'XPath' argument required"
    end
    source.definition[:fields].each do |options|
      case options
      when Symbol
        fields << Field.new(options, options.to_s)
      when Hash
        # Resolve the default into a local instead of writing it back, so
        # the caller's definition hash is left unmodified.
        xpath = options[:xpath] || options[:name]
        fields << Field.new(options[:name], xpath.to_s)
      else
        raise DefinitionError,
          "Each field definition must either be an symbol " +
          "or a hash of options for the field"
      end
    end
  end

  # A field to extract from each collection node: the key used in the row
  # Hash (+name+) and the XPath evaluated against the node (+xpath+).
  class Field
    attr_reader :name, :xpath

    def initialize(name, xpath)
      @name = name
      @xpath = xpath
    end
  end
end
end
end