Sha256: f70ca903362613bf018330f452b608b8422bb8fcfb0e1ee23503c82cf1d7e87c

Contents?: true

Size: 1.42 KB

Versions: 4

Compression:

Stored size: 1.42 KB

Contents

module EasyML::Data
  class Datasource
    # Datasource that combines several underlying datasources into one frame
    # by invoking the user-supplied `merge` callable, and caches the result as
    # "merged_data.csv" under `root_dir`.
    #
    # NOTE(review): `datasources` is not declared in this class; it is assumed
    # to be provided elsewhere (parent class or DSL) as an Array or a Hash whose
    # values are datasources -- confirm against the rest of the project.
    class MergedDatasource < Datasource
      include GlueGun::DSL

      attribute :root_dir, :string
      attribute :polars_args, :hash, default: {}
      attribute :merge
      validates :root_dir, presence: true
      validates :merge, presence: true

      # Reads the merged CSV and hands it to `iter_batches` in chunks.
      # Builds the merged file first when it does not exist yet, mirroring the
      # lazy behavior of #data, instead of failing on a missing file.
      #
      # @param of [Integer] batch size forwarded as `batch_size:`
      # @yield forwarded unchanged to `iter_batches`
      def in_batches(of: 10_000, &block)
        merge_and_save unless file_exists?
        Polars.read_csv(file_path, **polars_args).iter_batches(batch_size: of, &block)
      end

      # @return [String] location of the cached merged CSV inside `root_dir`
      def file_path
        @file_path ||= File.join(root_dir, "merged_data.csv")
      end

      # Smallest `last_updated_at` reported by the underlying datasources.
      # Works for both Array and Hash collections (same shapes #refresh!
      # accepts) and skips members that report nil so `min` cannot raise on a
      # nil/timestamp comparison.
      #
      # @return [Object, nil] the minimum value, or nil when none is available
      def last_updated_at
        datasource_list.map(&:last_updated_at).compact.min
      end

      # Removes the cached CSV, forgets the memoized frame and refreshes every
      # underlying datasource.
      def refresh!
        cleanup
        # Without this reset, #data would keep serving the pre-refresh frame.
        @data = nil
        datasource_list.each(&:refresh!)
      end

      # Memoized merged frame: loaded from the cached CSV when present,
      # otherwise built through the `merge` callable and written to disk.
      def data
        @data ||= if file_exists?
                    Polars.read_csv(file_path, **polars_args)
                  else
                    merge_and_save
                  end
      end

      # Deletes the cached merged CSV; a missing file is not an error (rm_f).
      def cleanup
        FileUtils.rm_f(file_path)
      end

      private

      # Normalizes `datasources` to a flat list: Hash -> its values,
      # Array -> itself, anything else -> empty list (nothing to iterate).
      def datasource_list
        case datasources
        when Hash then datasources.values
        when Array then datasources
        else []
        end
      end

      def file_exists?
        File.exist?(file_path)
      end

      # Refreshes the sources, calls `merge` with the raw `datasources`
      # collection and persists the result. Returns the merged frame.
      def merge_and_save
        refresh!
        merge.call(datasources).tap do |merged_data|
          save_to_file(merged_data)
        end
      end

      # Writes `df` to #file_path, creating `root_dir` first if needed.
      # `df` must respond to `write_csv`.
      def save_to_file(df)
        FileUtils.mkdir_p(root_dir)
        df.write_csv(file_path)
      end
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
easy_ml-0.1.4 lib/easy_ml/data/datasource/merged_datasource.rb
easy_ml-0.1.3 lib/easy_ml/data/datasource/merged_datasource.rb
easy_ml-0.1.2 lib/easy_ml/data/datasource/merged_datasource.rb
easy_ml-0.1.1 lib/easy_ml/data/datasource/merged_datasource.rb