lib/daru/dataframe.rb in daru-0.2.0 vs lib/daru/dataframe.rb in daru-0.2.1
- old
+ new
@@ -547,10 +547,24 @@
# df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
def row # by-row accessor: wraps this DataFrame in a new Daru::Accessors::DataFrameByRow on every call
Daru::Accessors::DataFrameByRow.new(self)
end
+ # Extract a dataframe given row indexes or positions
+ # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
+ # @return [Daru::DataFrame]
+ def get_sub_dataframe(keys, by_position: true)
+ return Daru::DataFrame.new({}) if keys == [] # no keys: a DataFrame built from an empty hash, without reference to self
+
+ keys = @index.pos(*keys) unless by_position # index labels are translated to positions through @index.pos
+
+ sub_df = row_at(*keys) # NOTE(review): presumably a Vector for one position and a DataFrame for several - confirm in row_at
+ sub_df = sub_df.to_df.transpose if sub_df.is_a?(Daru::Vector) # a Vector result is turned into a DataFrame via to_df.transpose
+
+ sub_df
+ end
+
# Duplicate the DataFrame entirely.
#
# == Arguments
#
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
@@ -696,10 +710,11 @@
# 6 1 5 5
# 7 7 5 7
#
def rolling_fillna!(direction=:forward)
@data.each { |vec| vec.rolling_fillna!(direction) } # calls rolling_fillna! with the same direction on every element of @data
+ self # return the DataFrame itself instead of the value of @data.each; rolling_fillna returns this result
end
def rolling_fillna(direction=:forward)
dup.rolling_fillna!(direction) # runs the bang variant on a dup of self; the return value is whatever rolling_fillna! returns
end
@@ -988,10 +1003,21 @@
end
self
end
+ def apply_method(method, keys: nil, by_position: true)
+ df = keys ? get_sub_dataframe(keys, by_position: by_position) : self # keys nil/false: operate on the whole DataFrame
+
+ case method
+ when Symbol then df.send(method) # Symbol: invoked as a zero-argument method on df (send also reaches private methods)
+ when Proc then method.call(df) # Proc/lambda: called with df as its only argument
+ else raise # NOTE(review): bare raise re-raises $! if set, else RuntimeError "unhandled exception"; consider a message naming method.class
+ end
+ end
+ alias :apply_method_on_sub_df :apply_method # second name for the same method; aggregate_by_positions_tuples calls it under this name
+
# Retrieves a Daru::Vector, based on the result of calculation
# performed on each row.
def collect_rows &block
return to_enum(:collect_rows) unless block_given?
@@ -1448,15 +1474,14 @@
# # ["foo", "one", 3]=>[6],
# # ["foo", "three", 8]=>[7],
# # ["foo", "two", 3]=>[2, 4]}
def group_by *vectors
vectors.flatten! # accepts both group_by(:a, :b) and group_by([:a, :b])
- # FIXME: wouldn't it better to do vectors - @vectors here and
- # raise one error with all non-existent vector names?.. - zverok, 2016-05-18
- vectors.each { |v|
- raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
- }
+ missing = vectors - @vectors.to_a # every requested name that is not among @vectors
+ unless missing.empty? # a single ArgumentError lists all unknown names at once
+ raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}")
+ end
vectors = [@vectors.first] if vectors.empty? # no names given: fall back to the first vector
Daru::Core::GroupBy.new(self, vectors)
end
@@ -2247,26 +2272,10 @@
.rename(cat)
.delete_vector cat_name
end
end
- # returns array of row tuples at given index(s)
- def access_row_tuples_by_indexs *indexes
- positions = @index.pos(*indexes)
-
- return populate_row_for(positions) if positions.is_a? Numeric
-
- res = []
- new_rows = @data.map { |vec| vec[*indexes] }
- indexes.each do |index|
- tuples = []
- new_rows.map { |row| tuples += [row[index]] }
- res << tuples
- end
- res
- end
-
# Function to use for aggregating the data.
#
# @param options [Hash] options for column, you want in resultant dataframe
#
# @return [Daru::DataFrame]
@@ -2280,11 +2289,11 @@
# 1 b 12
# 2 c 7
# 3 d 17
# 4 e 1
#
- # df.aggregate(num_100_times: ->(df) { df.num*100 })
+ # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
# => #<Daru::DataFrame(5x1)>
# num_100_ti
# 0 5200
# 1 1200
# 2 700
@@ -2310,45 +2319,30 @@
# b 12
# c 1
#
# Note: `GroupBy` class `aggregate` method uses this `aggregate` method
# internally.
- def aggregate(options={})
- colmn_value, index_tuples = aggregated_colmn_value(options)
- Daru::DataFrame.new(
- colmn_value, index: index_tuples, order: options.keys
- )
- end
+ def aggregate(options={}, multi_index_level=-1) # multi_index_level is only forwarded to group_index_for_aggregation
+ positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level) # position list per group, plus the index of the result
- private
+ colmn_value = aggregate_by_positions_tuples(options, positions_tuples) # one array of aggregated values per key of options
- # Do the `method` (`method` can be :sum, :mean, :std, :median, etc or
- # lambda), on the column.
- def apply_method_on_colmns colmn, index_tuples, method
- rows = []
- index_tuples.each do |indexes|
- # If single element then also make it vector.
- slice = Daru::Vector.new(Array(self[colmn][*indexes]))
- case method
- when Symbol
- rows << (slice.is_a?(Daru::Vector) ? slice.send(method) : slice)
- when Proc
- rows << method.call(slice)
- end
- end
- rows
+ Daru::DataFrame.new(colmn_value, index: new_index, order: options.keys) # result vectors are named and ordered by the keys of options
end
- def apply_method_on_df index_tuples, method
- rows = []
- index_tuples.each do |indexes|
- slice = row[*indexes]
- rows << method.call(slice)
- end
- rows
+ # Groups rows by group_by_keys and aggregates them per aggregation_map; faster than group_by followed by aggregate (because it doesn't generate an intermediary dataframe)
+ def group_by_and_aggregate(*group_by_keys, **aggregation_map)
+ positions_groups = Daru::Core::GroupBy.get_positions_group_map_for_df(self, group_by_keys.flatten, sort: true) # used below as a map: keys feed the new index, values are the position lists
+
+ new_index = Daru::MultiIndex.from_tuples(positions_groups.keys).coerce_index
+ colmn_value = aggregate_by_positions_tuples(aggregation_map, positions_groups.values) # same helper that aggregate uses
+
+ Daru::DataFrame.new(colmn_value, index: new_index, order: aggregation_map.keys)
end
+ private
+
def headers
Daru::Index.new(Array(index.name) + @vectors.to_a) # index name first (Array() drops it when nil), then the vector names
end
def row_headers
@@ -2908,30 +2902,44 @@
Daru::Vector.new vector
end
end
def update_data source, vectors
- @data = @vectors.each_with_index.map do |_vec,idx|
+ @data = @vectors.each_with_index.map do |_vec, idx| # only the position idx is used; the element of @vectors itself is ignored
Daru::Vector.new(source[idx], index: @index, name: vectors[idx]) # idx-th entry of source becomes a Vector on @index, named vectors[idx]
end
end
- def aggregated_colmn_value(options)
- colmn_value = []
- index_tuples = Array(@index).uniq
- options.keys.each do |vec|
- do_this_on_vec = options[vec]
- colmn_value << if @vectors.include?(vec)
- apply_method_on_colmns(
- vec, index_tuples, do_this_on_vec
- )
- else
- apply_method_on_df(
- index_tuples, do_this_on_vec
- )
- end
+ def aggregate_by_positions_tuples(options, positions_tuples)
+ options.map do |vect, method| # each pair: key => aggregation to apply; yields one result array per pair
+ if @vectors.include?(vect) # key names an existing vector: aggregate that vector alone
+ vect = self[vect]
+
+ positions_tuples.map do |positions|
+ vect.apply_method_on_sub_vector(method, keys: positions) # defined outside this view; presumably the Vector counterpart of apply_method - confirm
+ end
+ else # any other key: method is applied to the sub-DataFrame of each group
+ positions_tuples.map do |positions|
+ apply_method_on_sub_df(method, keys: positions) # by_position keeps its default (true), so positions are row positions
+ end
+ end
end
- [colmn_value, index_tuples]
+ end
+
+ def group_index_for_aggregation(index, multi_index_level=-1)
+ case index
+ when Daru::MultiIndex
+ groups = Daru::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level) # NOTE(review): presumably groups positions by index levels up to multi_index_level - confirm in GroupBy
+ new_index, pos_tuples = groups.keys, groups.values
+
+ new_index = Daru::MultiIndex.from_tuples(new_index).coerce_index
+ when Daru::Index, Daru::CategoricalIndex
+ new_index = Array(index).uniq # one group per distinct index label
+ pos_tuples = new_index.map { |idx| [*index.pos(idx)] } # [*x] wraps a single position and keeps a list of positions as an array
+ else raise # NOTE(review): bare raise re-raises $! if set, else RuntimeError "unhandled exception"; consider a message naming index.class
+ end
+
+ [pos_tuples, new_index] # positions first, index second: the order aggregate destructures
end
# coerce ranges, integers and array in appropriate ways
def coerce_positions *positions, size
if positions.size == 1