lib/polars/group_by.rb in polars-df-0.1.4 vs lib/polars/group_by.rb in polars-df-0.1.5
- old
+ new
@@ -10,11 +10,52 @@
self._dataframe_class = dataframe_class
self.by = by
self.maintain_order = maintain_order
end
- # def apply
+ # Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
+ #
+ # Implementing logic using a Ruby function is almost always _significantly_
+ # slower and more memory intensive than implementing the same logic using
+ # the native expression API because:
+ #
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
+ # - Polars-native expressions can be parallelised (UDFs cannot).
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
+ #
+ # Wherever possible you should strongly prefer the native expression API
+ # to achieve the best performance.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "id" => [0, 1, 2, 3, 4],
+ # "color" => ["red", "green", "green", "red", "red"],
+ # "shape" => ["square", "triangle", "square", "triangle", "square"]
+ # }
+ # )
+ # df.groupby("color").apply { |group_df| group_df.sample(2) }
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────┬───────┬──────────┐
+ # # │ id ┆ color ┆ shape │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ str ┆ str │
+ # # ╞═════╪═══════╪══════════╡
+ # # │ 1 ┆ green ┆ triangle │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ green ┆ square │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4 ┆ red ┆ square │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ red ┆ triangle │
+ # # └─────┴───────┴──────────┘
+ # def apply(&f)
+ # _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
# end
# Use multiple aggregations on columns.
#
# This can be combined with the complete lazy API and is considered idiomatic Polars.
@@ -180,11 +221,10 @@
.collect(no_optimization: true, string_cache: false)
)
_dataframe_class._from_rbdf(df._df)
end
- # def pivot
- # end
+ # pivot is deprecated
# Aggregate the first values in the group.
#
# @return [DataFrame]
#