lib/daru/dataframe.rb in daru-0.2.0 vs lib/daru/dataframe.rb in daru-0.2.1

- old
+ new

@@ -547,10 +547,24 @@ # df.row[:b] = [1,2,3] # set row ':b' to [1,2,3] def row Daru::Accessors::DataFrameByRow.new(self) end + # Extract a dataframe given row indexes or positions + # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position if false) + # @return [Daru::Dataframe] + def get_sub_dataframe(keys, by_position: true) + return Daru::DataFrame.new({}) if keys == [] + + keys = @index.pos(*keys) unless by_position + + sub_df = row_at(*keys) + sub_df = sub_df.to_df.transpose if sub_df.is_a?(Daru::Vector) + + sub_df + end + # Duplicate the DataFrame entirely. # # == Arguments # # * +vectors_to_dup+ - An Array specifying the names of Vectors to @@ -696,10 +710,11 @@ # 6 1 5 5 # 7 7 5 7 # def rolling_fillna!(direction=:forward) @data.each { |vec| vec.rolling_fillna!(direction) } + self end def rolling_fillna(direction=:forward) dup.rolling_fillna!(direction) end @@ -988,10 +1003,21 @@ end self end + def apply_method(method, keys: nil, by_position: true) + df = keys ? get_sub_dataframe(keys, by_position: by_position) : self + + case method + when Symbol then df.send(method) + when Proc then method.call(df) + else raise + end + end + alias :apply_method_on_sub_df :apply_method + # Retrieves a Daru::Vector, based on the result of calculation # performed on each row. def collect_rows &block return to_enum(:collect_rows) unless block_given? @@ -1448,15 +1474,14 @@ # # ["foo", "one", 3]=>[6], # # ["foo", "three", 8]=>[7], # # ["foo", "two", 3]=>[2, 4]} def group_by *vectors vectors.flatten! - # FIXME: wouldn't it better to do vectors - @vectors here and - # raise one error with all non-existent vector names?.. - zverok, 2016-05-18 - vectors.each { |v| - raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v) - } + missing = vectors - @vectors.to_a + unless missing.empty? + raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") + end vectors = [@vectors.first] if vectors.empty? Daru::Core::GroupBy.new(self, vectors) end @@ -2247,26 +2272,10 @@ .rename(cat) .delete_vector cat_name end end - # returns array of row tuples at given index(s) - def access_row_tuples_by_indexs *indexes - positions = @index.pos(*indexes) - - return populate_row_for(positions) if positions.is_a? Numeric - - res = [] - new_rows = @data.map { |vec| vec[*indexes] } - indexes.each do |index| - tuples = [] - new_rows.map { |row| tuples += [row[index]] } - res << tuples - end - res - end - # Function to use for aggregating the data. # # @param options [Hash] options for column, you want in resultant dataframe # # @return [Daru::DataFrame] @@ -2280,11 +2289,11 @@ # 1 b 12 # 2 c 7 # 3 d 17 # 4 e 1 # - # df.aggregate(num_100_times: ->(df) { df.num*100 }) + # df.aggregate(num_100_times: ->(df) { (df.num*100).first }) # => #<Daru::DataFrame(5x1)> # num_100_ti # 0 5200 # 1 1200 # 2 700 @@ -2310,45 +2319,30 @@ # b 12 # c 1 # # Note: `GroupBy` class `aggregate` method uses this `aggregate` method # internally. - def aggregate(options={}) - colmn_value, index_tuples = aggregated_colmn_value(options) - Daru::DataFrame.new( - colmn_value, index: index_tuples, order: options.keys - ) - end + def aggregate(options={}, multi_index_level=-1) + positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level) - private + colmn_value = aggregate_by_positions_tuples(options, positions_tuples) - # Do the `method` (`method` can be :sum, :mean, :std, :median, etc or - # lambda), on the column. - def apply_method_on_colmns colmn, index_tuples, method - rows = [] - index_tuples.each do |indexes| - # If single element then also make it vector. - slice = Daru::Vector.new(Array(self[colmn][*indexes])) - case method - when Symbol - rows << (slice.is_a?(Daru::Vector) ? slice.send(method) : slice) - when Proc - rows << method.call(slice) - end - end - rows + Daru::DataFrame.new(colmn_value, index: new_index, order: options.keys) end - def apply_method_on_df index_tuples, method - rows = [] - index_tuples.each do |indexes| - slice = row[*indexes] - rows << method.call(slice) - end - rows + # Is faster than using group_by followed by aggregate (because it doesn't generate an intermediary dataframe) + def group_by_and_aggregate(*group_by_keys, **aggregation_map) + positions_groups = Daru::Core::GroupBy.get_positions_group_map_for_df(self, group_by_keys.flatten, sort: true) + + new_index = Daru::MultiIndex.from_tuples(positions_groups.keys).coerce_index + colmn_value = aggregate_by_positions_tuples(aggregation_map, positions_groups.values) + + Daru::DataFrame.new(colmn_value, index: new_index, order: aggregation_map.keys) end + private + def headers Daru::Index.new(Array(index.name) + @vectors.to_a) end def row_headers @@ -2908,30 +2902,44 @@ Daru::Vector.new vector end end def update_data source, vectors - @data = @vectors.each_with_index.map do |_vec,idx| + @data = @vectors.each_with_index.map do |_vec, idx| Daru::Vector.new(source[idx], index: @index, name: vectors[idx]) end end - def aggregated_colmn_value(options) - colmn_value = [] - index_tuples = Array(@index).uniq - options.keys.each do |vec| - do_this_on_vec = options[vec] - colmn_value << if @vectors.include?(vec) - apply_method_on_colmns( - vec, index_tuples, do_this_on_vec - ) - else - apply_method_on_df( - index_tuples, do_this_on_vec - ) - end + def aggregate_by_positions_tuples(options, positions_tuples) + options.map do |vect, method| + if @vectors.include?(vect) + vect = self[vect] + + positions_tuples.map do |positions| + vect.apply_method_on_sub_vector(method, keys: positions) + end + else + positions_tuples.map do |positions| + apply_method_on_sub_df(method, keys: positions) + end + end end - [colmn_value, index_tuples] + end + + def group_index_for_aggregation(index, multi_index_level=-1) + case index + when Daru::MultiIndex + groups = Daru::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level) + new_index, pos_tuples = groups.keys, groups.values + + new_index = Daru::MultiIndex.from_tuples(new_index).coerce_index + when Daru::Index, Daru::CategoricalIndex + new_index = Array(index).uniq + pos_tuples = new_index.map { |idx| [*index.pos(idx)] } + else raise + end + + [pos_tuples, new_index] end # coerce ranges, integers and array in appropriate ways def coerce_positions *positions, size if positions.size == 1