lib/chronicle/etl/extractors/file_extractor.rb in chronicle-etl-0.4.0 vs lib/chronicle/etl/extractors/file_extractor.rb in chronicle-etl-0.4.1

- old
+ new

@@ -1,37 +1,57 @@
 require 'pathname'

 module Chronicle
   module ETL
+    # Return filenames that match a pattern in a directory
     class FileExtractor < Chronicle::ETL::Extractor
-      include Extractors::Helpers::FilesystemReader

       register_connector do |r|
         r.description = 'file or directory of files'
       end

-      # TODO: consolidate this with @config.filename
-      setting :dir_glob_pattern
+      setting :input, default: ['.']
+      setting :dir_glob_pattern, default: "**/*"
+      setting :larger_than
+      setting :smaller_than

+      def prepare
+        @pathnames = gather_files
+      end
+
       def extract
-        filenames.each do |filename|
-          yield Chronicle::ETL::Extraction.new(data: filename)
+        @pathnames.each do |pathname|
+          yield Chronicle::ETL::Extraction.new(data: pathname.to_path)
         end
       end

       def results_count
-        filenames.count
+        @pathnames.count
       end

       private

-      def filenames
-        @filenames ||= filenames_in_directory(
-          path: @config.filename,
-          dir_glob_pattern: @config.dir_glob_pattern,
-          load_since: @config.since,
-          load_until: @config.until
-        )
+      def gather_files
+        roots = [@config.input].flatten.map { |filename| Pathname.new(filename) }
+        raise(ExtractionError, "Input must exist") unless roots.all?(&:exist?)
+
+        directories, files = roots.partition(&:directory?)
+
+        directories.each do |directory|
+          files += Dir.glob(File.join(directory, @config.dir_glob_pattern)).map { |filename| Pathname.new(filename) }
+        end
+
+        files = files.uniq
+
+        files = files.keep_if { |f| (f.mtime > @config.since) } if @config.since
+        files = files.keep_if { |f| (f.mtime < @config.until) } if @config.until
+
+        # pass in file sizes in bytes
+        files = files.keep_if { |f| (f.size < @config.smaller_than) } if @config.smaller_than
+        files = files.keep_if { |f| (f.size > @config.larger_than) } if @config.larger_than
+
+        # # TODO: incorporate sort argument
+        files.sort_by(&:mtime)
       end
     end
   end
 end