require 'rexml/document'
module ETL
module Parser
class XmlParser < ETL::Parser::Parser
# Initialize the parser
# * source: The Source object
# * options: Parser options Hash
def initialize(source, options={})
super
configure
end
# Returns each row
def each
Dir.glob(file).each do |file|
doc = nil
t = Benchmark.realtime do
doc = REXML::Document.new(File.new(file))
end
Engine.logger.info "XML #{file} parsed in #{t}s"
doc.elements.each(@collection_xpath) do |element|
row = {}
fields.each do |f|
value = element.text(f.xpath)
row[f.name] = value
end
yield row
end
end
end
# Get an array of defined fields
def fields
@fields ||= []
end
private
def configure
@collection_xpath = source.definition[:collection]
raise "Collection XPath is required" if @collection_xpath.nil?
source.definition[:fields].each do |options|
case options
when Symbol
fields << Field.new(options, options.to_s)
when Hash
options[:xpath] ||= options[:name]
fields << Field.new(options[:name], options[:xpath].to_s)
else
raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
end
end
end
class Field
attr_reader :name, :xpath
def initialize(name, xpath)
@name = name
@xpath = xpath
end
end
end
end
end