module ETL #:nodoc:
module Parser #:nodoc:
# Parses delimited files
class DelimitedParser < ETL::Parser::Parser
  # Initialize the parser
  # * source: The Source object
  # * options: Hash of options for the parser, defaults to an empty hash
  #
  # Both arguments are forwarded unchanged to the superclass initializer;
  # the field list is then built from source.definition.
  def initialize(source, options={})
    super
    configure
  end

  # Yields each parsed row as a Hash mapping field name to the raw value
  # read from the delimited file.
  #
  # Every path matched by Dir.glob(file) is read with FasterCSV, passing the
  # parser options straight through. The first source.skip_lines rows of
  # each file are discarded. Every remaining row is validated against the
  # field definitions before it is yielded.
  #
  # Returns an enumerator when no block is given, so that the method never
  # starts reading files only to fail at the first yield.
  #
  # NOTE(review): +file+, +options+ and +source+ are not defined in this
  # class; presumably they are readers supplied by ETL::Parser::Parser.
  def each
    return enum_for(:each) unless block_given?

    # The block parameter is deliberately not called +file+ so that it does
    # not shadow the +file+ reader used to build the glob pattern.
    Dir.glob(file).each do |path|
      ETL::Engine.logger.debug "parsing #{path}"
      lines_to_skip = source.skip_lines # loop-invariant: read once per file
      # +line+ counts only rows that were not skipped, so the number reported
      # in a mismatch error is relative to the first non-skipped row.
      line = 0
      lines_skipped = 0
      FasterCSV.foreach(path, options) do |raw_row|
        if lines_skipped < lines_to_skip
          ETL::Engine.logger.debug "skipping line"
          lines_skipped += 1
          next
        end
        line += 1
        validate_row(raw_row, line, path)
        row = {}
        # validate_row guarantees one field definition per column here.
        raw_row.each_with_index do |value, index|
          row[fields[index].name] = value
        end
        yield row
      end
    end
  end

  # Get an array of defined fields
  #
  # The array is created lazily on first access and filled by #configure.
  def fields
    @fields ||= []
  end

  private

  # Checks that a parsed row has exactly as many columns as there are
  # defined fields.
  # * row: The raw row as returned by FasterCSV
  # * line: The row number used in the error report
  # * file: The path of the file the row was read from
  #
  # A mismatch is reported through raise_with_info with MismatchError.
  # NOTE(review): raise_with_info and MismatchError are defined outside this
  # class; presumably in ETL::Parser::Parser or an enclosing module.
  def validate_row(row, line, file)
    ETL::Engine.logger.debug "validating line #{line} in file #{file}"
    return if row.length == fields.length

    raise_with_info( MismatchError,
      "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
      line, file
    )
  end

  # Builds the field list from source.definition.
  #
  # Each definition entry must be either a Symbol (used directly as the field
  # name) or a Hash whose :name entry holds the field name. Anything else, or
  # a Hash without a :name, raises DefinitionError.
  def configure
    source.definition.each do |definition|
      case definition
      when Symbol
        fields << Field.new(definition)
      when Hash
        name = definition[:name]
        # A nameless field would be stored under a nil key in every row and
        # silently overwrite any other nameless column.
        if name.nil?
          raise DefinitionError, "Each hash field definition must include a :name"
        end
        fields << Field.new(name)
      else
        raise DefinitionError, "Each field definition must either be a symbol or a hash"
      end
    end
  end

  # A single column definition; it only carries the column's name.
  class Field #:nodoc:
    attr_reader :name

    def initialize(name)
      @name = name
    end
  end
end
end
end