lib/polars/group_by.rb in polars-df-0.1.4 vs lib/polars/group_by.rb in polars-df-0.1.5

- old
+ new

@@ -10,11 +10,52 @@ self._dataframe_class = dataframe_class self.by = by self.maintain_order = maintain_order end - # def apply + # Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame. + # + # Implementing logic using a Ruby function is almost always _significantly_ + # slower and more memory intensive than implementing the same logic using + # the native expression API because: + # + # - The native expression engine runs in Rust; UDFs run in Ruby. + # - Use of Ruby UDFs forces the DataFrame to be materialized in memory. + # - Polars-native expressions can be parallelised (UDFs cannot). + # - Polars-native expressions can be logically optimised (UDFs cannot). + # + # Wherever possible you should strongly prefer the native expression API + # to achieve the best performance. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "id" => [0, 1, 2, 3, 4], + # "color" => ["red", "green", "green", "red", "red"], + # "shape" => ["square", "triangle", "square", "triangle", "square"] + # } + # ) + # df.groupby("color").apply { |group_df| group_df.sample(2) } + # # => + # # shape: (4, 3) + # # ┌─────┬───────┬──────────┐ + # # │ id ┆ color ┆ shape │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ str ┆ str │ + # # ╞═════╪═══════╪══════════╡ + # # │ 1 ┆ green ┆ triangle │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2 ┆ green ┆ square │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤ + # # │ 4 ┆ red ┆ square │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤ + # # │ 3 ┆ red ┆ triangle │ + # # └─────┴───────┴──────────┘ + # def apply(&f) + # _dataframe_class._from_rbdf(_df.groupby_apply(by, f)) # end # Use multiple aggregations on columns. # # This can be combined with the complete lazy API and is considered idiomatic polars. @@ -180,11 +221,10 @@ .collect(no_optimization: true, string_cache: false) ) _dataframe_class._from_rbdf(df._df) end - # def pivot - # end + # pivot is deprecated # Aggregate the first values in the group. # # @return [DataFrame] #