lib/remi/data_subject/csv_file.rb in remi-0.2.33 vs lib/remi/data_subject/csv_file.rb in remi-0.2.34
- old
+ new
@@ -52,11 +52,12 @@
def to_dataframe
# Assumes that each file has exactly the same structure
result_df = nil
extract.each_with_index do |filename, idx|
@logger.info "Converting #{filename} to a dataframe"
- csv_df = Daru::DataFrame.from_csv filename, @csv_options
+ processed_filename = preprocess(filename)
+ csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
if idx == 0
result_df = csv_df
else
@@ -102,15 +103,34 @@
# Reports whether every key of `fields` is present in `headers`.
#
# Returns true when no field key is missing from the headers (including the
# case where there are no fields at all), false otherwise.
def valid_headers?
  missing_fields = fields.keys - headers
  missing_fields.empty?
end
+
private
- def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, **kargs, &block)
+ def preprocess(filename)
+ return filename unless @preprocessor
+ @logger.info "Preprocessing #{filename}"
+ tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
+
+ dirname = Pathname.new(tmp_filename).dirname
+ FileUtils.mkdir_p(dirname) unless File.directory? dirname
+
+ File.open(tmp_filename, 'w') do |outfile|
+ File.foreach(filename) do |in_line|
+ outfile.write @preprocessor.call(in_line)
+ end
+ end
+
+ tmp_filename
+ end
+
# Stores the CSV-specific settings on the instance.
#
# extractor      - assigned through the `extractor=` writer.
# csv_options    - merged over `self.class.default_csv_options`; keys given
#                  here override the class defaults.
# filename_field - kept in @filename_field; when set, `to_dataframe` adds a
#                  column of that name holding the source filename.
# preprocessor   - kept in @preprocessor; when set, `preprocess` calls it with
#                  each input line and writes the results to a separate file.
#
# *args, **kargs and &block are accepted but not used in this method.
+ def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
self.extractor = extractor
@csv_options = self.class.default_csv_options.merge(csv_options)
@filename_field = filename_field
+ @preprocessor = preprocessor
end
end