require 'rexml/parsers/sax2parser'
require 'rexml/sax2listener'
module ETL
module Parser
# Parser that streams XML files through REXML's SAX2 parser and yields one
# row (a Hash of field name => value) each time the write trigger path closes.
class SaxParser < ETL::Parser::Parser
  # The write trigger causes whatever values are currently specified for the row to be returned.
  # After returning, the values are not cleared, which allows values assigned
  # higher in the XML tree to remain in memory.
  attr_accessor :write_trigger

  # Initialize the parser
  # * source: The Source object
  # * options: Parser options Hash
  def initialize(source, options={})
    super
    configure
  end

  # Parses every file matched by the glob pattern and passes each completed
  # row to the given block. Returns the array of matched file names.
  #
  # NOTE(review): +file+ is not defined in this class; presumably it is
  # provided by the parent parser and holds the glob pattern -- confirm.
  def each(&block)
    Dir.glob(file).each do |path|
      # The block form closes the handle as soon as the document has been
      # parsed (or parsing raised), so no descriptor is leaked per file.
      File.open(path) do |io|
        parser = REXML::Parsers::SAX2Parser.new(io)
        parser.listen(Listener.new(self, &block))
        parser.parse
      end
    end
  end

  # The Field objects (name plus parsed path) this parser extracts.
  def fields
    @fields ||= []
  end

  private

  # Reads the write trigger and the field definitions from the source
  # definition and registers one Field per name/path pair.
  def configure
    self.write_trigger = source.definition[:write_trigger]
    # map paths to field names
    source.definition[:fields].each do |name, path|
      fields << Field.new(name, XPath::Path.parse(path))
    end
  end

  # Pairs a row key (+name+) with the parsed path whose value fills it.
  class Field
    attr_reader :name, :path

    def initialize(name, path)
      @name = name
      @path = path
    end
  end
end
# SAX2 listener that tracks the path of the element currently being parsed,
# collects field values into a row Hash and hands a copy of that row to the
# block whenever the parser's write trigger path is closed.
class Listener
  include REXML::SAX2Listener

  # * parser: object responding to +fields+ and +write_trigger+
  # * block: receives a copy of the current row each time the write trigger matches
  #
  # Raises ArgumentError when no block is given.
  def initialize(parser, &block)
    raise ArgumentError, 'tried to create a Listener without a block' unless block

    @parser = parser
    @row = {}
    @value = nil
    @proc = block
  end

  # Appends the content of a CDATA section, unmodified, to the text collected
  # for the current element.
  def cdata(text)
    append(text)
  end

  # Appends character data to the text collected for the current element.
  # Each chunk is stripped first; chunks that are empty after stripping are ignored.
  def characters(text)
    stripped = text.strip
    append(stripped) unless stripped.empty?
  end

  # Starts tracking a fresh, empty path for the new document.
  def start_document
    @path = XPath::Path.new
  end

  def end_document
  end

  # Pushes the opened element (with its attributes) onto the current path.
  def start_element(uri, localname, qname, attributes)
    @path.elements << XPath::Element.new(localname, attributes)
  end

  # Stores the value of every field whose path matches the element being
  # closed, emits the row if the write trigger matches, then discards the
  # collected text and pops the element off the current path.
  def end_element(uri, localname, qname)
    element = @path.elements.last
    @parser.fields.each do |field|
      next unless @path.match?(field.path)

      @row[field.name] =
        if field.path.is_attribute?
          element.attributes[field.path.attribute]
        else
          @value
        end
    end

    # The row is copied so that later assignments do not alter what was emitted.
    @proc.call(@row.clone) if @path.match?(@parser.write_trigger)

    @value = nil
    @path.elements.pop
  end

  # Remembers the position reported by the SAX parser.
  def progress(position)
    @position = position
  end

  private

  # Adds +text+ to the value buffer, creating the buffer on first use. The
  # buffer is nil until an element has text and again after each end_element.
  def append(text)
    @value ||= ''.dup
    @value << text
  end
end
module XPath
# An ordered list of Element objects describing a location in an XML
# document, e.g. "orders/order[type=web]/id".
class Path
  # The Element objects making up this path, outermost element first.
  attr_accessor :elements

  def initialize
    @elements = []
  end

  # Renders the path as its elements' string forms joined by "/".
  def to_s
    @elements.map { |e| e.to_s }.join("/")
  end

  # Returns true if the last part of the path refers to an attribute, i.e. the
  # last element carries at least one attribute. An empty path returns false.
  def is_attribute?
    last = elements.last
    !last.nil? && !last.attributes.empty?
  end

  # Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
  # does not reference an attribute.
  #
  # Warning: the path should only reference a single attribute; when several are present the first key of the
  # attributes Hash is returned.
  def attribute
    return nil unless is_attribute?
    elements.last.attributes.keys.first
  end

  # Return true if this XPath::Path matches the given path (a path string or an XPath::Path).
  # Both paths must have the same number of elements, every element name must be equal, and every
  # attribute pattern of the given path must match (via =~) the same-named attribute of this path's element.
  def match?(s)
    path = Path.parse(s)
    return false unless path.elements.length == elements.length

    elements.zip(path.elements).all? do |element, path_element|
      !path_element.nil? &&
        element.name == path_element.name &&
        path_element.attributes.all? { |key, value| element.attributes[key] =~ value }
    end
  end

  # Parse the string into an XPath::Path object. A Path argument is returned unchanged.
  #
  # Parts are separated by "/". A part may end in a bracketed, comma separated attribute list such as
  # "item[type=web,id]"; each "key=pattern" entry becomes a Regexp, and an entry without a pattern matches
  # any value (".*").
  def self.parse(s)
    return s if s.is_a?(Path)

    path = Path.new
    s.split('/').each do |part|
      name = part.gsub(/(.*)\[(.*)\]/, '\1')
      # Contents of the brackets, or nil when the part had no attribute list.
      selector = Regexp.last_match(2)

      attributes = {}
      unless selector.nil?
        selector.split(",").each do |pair|
          # Split on the first "=" only, so a pattern may itself contain "=".
          key, value = pair.split("=", 2)
          value = ".*" if value.nil? || value.empty?
          attributes[key] = Regexp.new(value)
        end
      end
      path.elements << Element.new(name, attributes)
    end
    path
  end
end
# A single step of a path: an element name plus a Hash of attributes
# (attribute name => value, where the value may be a Regexp pattern).
class Element
  attr_reader :name, :attributes

  # * name: the element name
  # * attributes: Hash of attribute name => value (defaults to an empty Hash)
  def initialize(name, attributes={})
    @name = name
    @attributes = attributes
  end

  # Renders the element as "name", or as "name[key=value,...]" when it has
  # attributes. Regexp values are shown by their source.
  def to_s
    return "#{name}" if @attributes.empty?

    pairs = @attributes.map do |key, value|
      shown = value.is_a?(Regexp) ? value.source : value
      "#{key}=#{shown}"
    end
    "#{name}[#{pairs.join(",")}]"
  end
end
end
end
end