module Remi
  module DataFrame
    # Public: A Remi dataframe backed by a Daru::DataFrame.  The Daru
    # dataframe is wrapped with SimpleDelegator, so any method that is not
    # defined here is forwarded to the wrapped object.
    class Daru < SimpleDelegator
      include Remi::DataFrame

      # Public: Wraps an existing Daru::DataFrame or builds a new one.
      #
      # args   - Positional arguments.  When the first one is already a
      #          ::Daru::DataFrame it is wrapped as-is and every other
      #          argument is ignored; otherwise all arguments are forwarded
      #          to ::Daru::DataFrame.new.
      # kargs  - Keyword arguments forwarded to ::Daru::DataFrame.new.
      # block  - Block forwarded to ::Daru::DataFrame.new.
      def initialize(*args, **kargs, &block)
        if args[0].is_a?(::Daru::DataFrame)
          super(args[0])
        else
          super(::Daru::DataFrame.new(*args, **kargs, &block))
        end
      end

      # Public: Returns the type of DataFrame
      def remi_df_type
        :daru
      end

      # Public: Saves a Dataframe to a file.
      #
      # filename - The path of the file to write the marshalled dataframe to.
      #
      # Returns the Integer number of bytes written.
      def hash_dump(filename)
        File.binwrite(filename, Marshal.dump(self))
      end

      # Public: Creates a DataFrame by reading the dumped version from a file.
      #
      # filename - The path of a file previously written by #hash_dump.
      #
      # Returns the object that was stored in the file.
      def self.from_hash_dump(filename)
        # NOTE: Marshal.load can instantiate arbitrary objects while
        # deserializing.  Only load dumps that come from a trusted source.
        Marshal.load(File.binread(filename))
      end

      # Public: Allows the user to define an arbitrary aggregation function.
      #
      # by   - The name of the DataFrame vector to use to group records.
      # func - A lambda function that accepts three arguments - the
      #        first argument is the DataFrame, the second is the
      #        key to the current group, and the third is the index
      #        of the elements belonging to a group.
      #
      # Example:
      #   df = Remi::DataFrame::Daru.new(
      #     { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] }
      #   )
      #
      #   mymin = lambda do |vector, df, group_key, indices|
      #     values = indices.map { |idx| df.row[idx][vector] }
      #     "Group #{group_key} has a minimum value of #{values.min}"
      #   end
      #
      #   df.aggregate(by: :a, func: mymin.curry.(:year))
      #
      # Returns a Daru::Vector whose index is the group keys and whose values
      # are the results of calling func for each group.
      def aggregate(by:, func:)
        grouped = group_by(by)
        df_indices = index.to_a

        aggregated = grouped.groups.each_with_object({}) do |(key, positions), result|
          # Daru groups don't use the index of the dataframe when returning groups.
          # Instead they return the position of the record in the dataframe.  Here, we
          # translate those positions back into the dataframe's own index values.
          group_df_indices = positions.map { |position| df_indices[position] }

          # Group keys arrive as arrays; a single-element key is unwrapped to
          # its only element, anything else is passed through unchanged.
          group_key = key.size == 1 ? key.first : key

          result[group_key] = func.(self, group_key, group_df_indices)
        end

        ::Daru::Vector.new(aggregated)
      end
    end
  end
end