module DaruLite class DataFrame module Pivotable # Pivots a data frame on specified vectors and applies an aggregate function # to quickly generate a summary. # # == Options # # +:index+ - Keys to group by on the pivot table row index. Pass vector names # contained in an Array. # # +:vectors+ - Keys to group by on the pivot table column index. Pass vector # names contained in an Array. # # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can # use any of the statistics functions applicable on Vectors that can be found in # the DaruLite::Statistics::Vector module. # # +:values+ - Columns to aggregate. Will consider all numeric columns not # specified in *:index* or *:vectors*. Optional. # # == Usage # # df = DaruLite::DataFrame.new({ # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'], # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'], # c: ['small','large','large','small','small','large','small','large','small'], # d: [1,2,2,3,3,4,5,6,7], # e: [2,4,4,6,6,8,10,12,14] # }) # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e) # # #=> # # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2> # # [:e, :one] [:e, :two] # # [:bar] 18 26 # # [:foo] 10 12 def pivot_table(opts = {}) raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty? index = opts[:index] vectors = opts[:vectors] || [] aggregate_function = opts[:agg] || :mean values = prepare_pivot_values index, vectors, opts raise IndexError, 'No numeric vectors to aggregate' if values.empty? grouped = group_by(index) return grouped.send(aggregate_function) if vectors.empty? super_hash = make_pivot_hash grouped, vectors, values, aggregate_function pivot_dataframe super_hash end private def prepare_pivot_values(index, vectors, opts) case opts[:values] when nil # values not specified at all. (@vectors.to_a - (index | vectors)) & numeric_vector_names when Array # multiple values specified. opts[:values] else # single value specified. [opts[:values]] end end def make_pivot_hash(grouped, vectors, values, aggregate_function) grouped.groups.transform_values { |_| {} }.tap do |super_hash| values.each do |value| grouped.groups.each do |group_name, row_numbers| row_numbers.each do |num| arry = [value, *vectors.map { |v| self[v][num] }] sub_hash = super_hash[group_name] sub_hash[arry] ||= [] sub_hash[arry] << self[value][num] end end end setup_pivot_aggregates super_hash, aggregate_function end end def setup_pivot_aggregates(super_hash, aggregate_function) super_hash.each_value do |sub_hash| sub_hash.each do |group_name, aggregates| sub_hash[group_name] = DaruLite::Vector.new(aggregates).send(aggregate_function) end end end def pivot_dataframe(super_hash) df_index = DaruLite::MultiIndex.from_tuples super_hash.keys df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq DaruLite::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe| super_hash.each do |row_index, sub_h| sub_h.each do |vector_index, val| pivoted_dataframe[vector_index][row_index] = val end end end end end end end