lib/polars/lazy_frame.rb in polars-df-0.1.4 vs lib/polars/lazy_frame.rb in polars-df-0.1.5

- old
+ new

@@ -147,13 +147,24 @@ end # def self.from_json # end - # def self.read_json - # end + # Read a logical plan from a JSON file to construct a LazyFrame. + # + # @param file [String] + # Path to a file or a file-like object. + # + # @return [LazyFrame] + def self.read_json(file) + if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname)) + file = Utils.format_path(file) + end + Utils.wrap_ldf(RbLazyFrame.read_json(file)) + end + # Get or set column names. # # @return [Array] # # @example @@ -243,15 +254,61 @@ #{describe_plan} EOS end - # def write_json - # end + # Write the logical plan of this LazyFrame to a file or string in JSON format. + # + # @param file [String] + # File path to which the result should be written. + # + # @return [nil] + def write_json(file) + if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname)) + file = Utils.format_path(file) + end + _ldf.write_json(file) + nil + end - # def pipe - # end + # Offers a structured way to apply a sequence of user-defined functions (UDFs). + # + # @param func [Object] + # Callable; will receive the frame as the first parameter, + # followed by any given args/kwargs. + # @param args [Object] + # Arguments to pass to the UDF. + # @param kwargs [Object] + # Keyword arguments to pass to the UDF. + # + # @return [LazyFrame] + # + # @example + # cast_str_to_int = lambda do |data, col_name:| + # data.with_column(Polars.col(col_name).cast(:i64)) + # end + # + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy + # df.pipe(cast_str_to_int, col_name: "b").collect() + # # => + # # shape: (4, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 1 ┆ 10 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 20 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 30 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 4 ┆ 40 │ + # # └─────┴─────┘ + def pipe(func, *args, **kwargs, &block) + func.call(self, *args, **kwargs, &block) + end # Create a string representation of the unoptimized query plan. # # @return [String] def describe_plan @@ -259,13 +316,32 @@ end # Create a string representation of the optimized query plan. # # @return [String] - # def describe_optimized_plan - # end + def describe_optimized_plan( + type_coercion: true, + predicate_pushdown: true, + projection_pushdown: true, + simplify_expression: true, + slice_pushdown: true, + common_subplan_elimination: true, + allow_streaming: false + ) + ldf = _ldf.optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + common_subplan_elimination, + allow_streaming, + ) + ldf.describe_optimized_plan + end + # def show_graph # end # Sort the DataFrame. # @@ -724,19 +800,547 @@ rbexprs_by = Utils.selection_to_rbexpr_list(by) lgb = _ldf.groupby(rbexprs_by, maintain_order) LazyGroupBy.new(lgb, self.class) end - # def groupby_rolling - # end + # Create rolling groups based on a time column. + # + # Also works for index values of type `:i32` or `:i64`. + # + # Different from a `dynamic_groupby` the windows are now determined by the + # individual values and are not of constant intervals. For constant intervals + # use *groupby_dynamic*. + # + # The `period` and `offset` arguments are created either from a timedelta, or + # by using the following string language: + # + # - 1ns (1 nanosecond) + # - 1us (1 microsecond) + # - 1ms (1 millisecond) + # - 1s (1 second) + # - 1m (1 minute) + # - 1h (1 hour) + # - 1d (1 day) + # - 1w (1 week) + # - 1mo (1 calendar month) + # - 1y (1 calendar year) + # - 1i (1 index count) + # + # Or combine them: + # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + # + # In case of a groupby_rolling on an integer column, the windows are defined by: + # + # - "1i" # length 1 + # - "10i" # length 10 + # + # @param index_column [Object] + # Column used to group based on the time window. + # Often to type Date/Datetime + # This column must be sorted in ascending order. If not the output will not + # make sense. + # + # In case of a rolling groupby on indices, dtype needs to be one of + # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if + # performance matters use an `:i64` column. + # @param period [Object] + # Length of the window. + # @param offset [Object] + # Offset of the window. Default is -period. + # @param closed ["right", "left", "both", "none"] + # Define whether the temporal window interval is closed or not. + # @param by [Object] + # Also group by this column/these columns. + # + # @return [LazyFrame] + # + # @example + # dates = [ + # "2020-01-01 13:45:48", + # "2020-01-01 16:42:13", + # "2020-01-01 16:45:09", + # "2020-01-02 18:12:48", + # "2020-01-03 19:45:32", + # "2020-01-08 23:16:43" + # ] + # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column( + # Polars.col("dt").str.strptime(:datetime) + # ) + # df.groupby_rolling(index_column: "dt", period: "2d").agg( + # [ + # Polars.sum("a").alias("sum_a"), + # Polars.min("a").alias("min_a"), + # Polars.max("a").alias("max_a") + # ] + # ) + # # => + # # shape: (6, 4) + # # ┌─────────────────────┬───────┬───────┬───────┐ + # # │ dt ┆ sum_a ┆ min_a ┆ max_a │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + # # ╞═════════════════════╪═══════╪═══════╪═══════╡ + # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + # # └─────────────────────┴───────┴───────┴───────┘ + def groupby_rolling( + index_column:, + period:, + offset: nil, + closed: "right", + by: nil + ) + if offset.nil? + offset = "-#{period}" + end - # def groupby_dynamic - # end + rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by) + period = Utils._timedelta_to_pl_duration(period) + offset = Utils._timedelta_to_pl_duration(offset) - # def join_asof - # end + lgb = _ldf.groupby_rolling( + index_column, period, offset, closed, rbexprs_by + ) + LazyGroupBy.new(lgb, self.class) + end + # Group based on a time value (or index value of type `:i32`, `:i64`). + # + # Time windows are calculated and rows are assigned to windows. Different from a + # normal groupby is that a row can be member of multiple groups. The time/index + # window could be seen as a rolling window, with a window size determined by + # dates/times/values instead of slots in the DataFrame. + # + # A window is defined by: + # + # - every: interval of the window + # - period: length of the window + # - offset: offset of the window + # + # The `every`, `period` and `offset` arguments are created with + # the following string language: + # + # - 1ns (1 nanosecond) + # - 1us (1 microsecond) + # - 1ms (1 millisecond) + # - 1s (1 second) + # - 1m (1 minute) + # - 1h (1 hour) + # - 1d (1 day) + # - 1w (1 week) + # - 1mo (1 calendar month) + # - 1y (1 calendar year) + # - 1i (1 index count) + # + # Or combine them: + # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + # + # In case of a groupby_dynamic on an integer column, the windows are defined by: + # + # - "1i" # length 1 + # - "10i" # length 10 + # + # @param index_column + # Column used to group based on the time window. + # Often to type Date/Datetime + # This column must be sorted in ascending order. If not the output will not + # make sense. + # + # In case of a dynamic groupby on indices, dtype needs to be one of + # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if + # performance matters use an `:i64` column. + # @param every + # Interval of the window. + # @param period + # Length of the window, if None it is equal to 'every'. + # @param offset + # Offset of the window if None and period is None it will be equal to negative + # `every`. + # @param truncate + # Truncate the time value to the window lower bound. + # @param include_boundaries + # Add the lower and upper bound of the window to the "_lower_bound" and + # "_upper_bound" columns. This will impact performance because it's harder to + # parallelize + # @param closed ["right", "left", "both", "none"] + # Define whether the temporal window interval is closed or not. + # @param by + # Also group by this column/these columns + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "time" => Polars.date_range( + # DateTime.new(2021, 12, 16), + # DateTime.new(2021, 12, 16, 3), + # "30m" + # ), + # "n" => 0..6 + # } + # ) + # # => + # # shape: (7, 2) + # # ┌─────────────────────┬─────┐ + # # │ time ┆ n │ + # # │ --- ┆ --- │ + # # │ datetime[μs] ┆ i64 │ + # # ╞═════════════════════╪═════╡ + # # │ 2021-12-16 00:00:00 ┆ 0 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2021-12-16 00:30:00 ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2021-12-16 01:00:00 ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2021-12-16 01:30:00 ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2021-12-16 02:00:00 ┆ 4 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2021-12-16 02:30:00 ┆ 5 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2021-12-16 03:00:00 ┆ 6 │ + # # └─────────────────────┴─────┘ + # + # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00. + # df.groupby_dynamic("time", every: "1h", closed: "right").agg( + # [ + # Polars.col("time").min.alias("time_min"), + # Polars.col("time").max.alias("time_max") + # ] + # ) + # # => + # # shape: (4, 3) + # # ┌─────────────────────┬─────────────────────┬─────────────────────┐ + # # │ time ┆ time_min ┆ time_max │ + # # │ --- ┆ --- ┆ --- │ + # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + # # ╞═════════════════════╪═════════════════════╪═════════════════════╡ + # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + # # └─────────────────────┴─────────────────────┴─────────────────────┘ + # + # @example The window boundaries can also be added to the aggregation result. + # df.groupby_dynamic( + # "time", every: "1h", include_boundaries: true, closed: "right" + # ).agg([Polars.col("time").count.alias("time_count")]) + # # => + # # shape: (4, 4) + # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + # + # @example When closed="left", should not include right end of interval. + # df.groupby_dynamic("time", every: "1h", closed: "left").agg( + # [ + # Polars.col("time").count.alias("time_count"), + # Polars.col("time").list.alias("time_agg_list") + # ] + # ) + # # => + # # shape: (4, 3) + # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐ + # # │ time ┆ time_count ┆ time_agg_list │ + # # │ --- ┆ --- ┆ --- │ + # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡ + # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + # # └─────────────────────┴────────────┴─────────────────────────────────────┘ + # + # @example When closed="both" the time values at the window boundaries belong to 2 groups. + # df.groupby_dynamic("time", every: "1h", closed: "both").agg( + # [Polars.col("time").count.alias("time_count")] + # ) + # # => + # # shape: (5, 2) + # # ┌─────────────────────┬────────────┐ + # # │ time ┆ time_count │ + # # │ --- ┆ --- │ + # # │ datetime[μs] ┆ u32 │ + # # ╞═════════════════════╪════════════╡ + # # │ 2021-12-15 23:00:00 ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 00:00:00 ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 01:00:00 ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 02:00:00 ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2021-12-16 03:00:00 ┆ 1 │ + # # └─────────────────────┴────────────┘ + # + # @example Dynamic groupbys can also be combined with grouping on normal keys. + # df = Polars::DataFrame.new( + # { + # "time" => Polars.date_range( + # DateTime.new(2021, 12, 16), + # DateTime.new(2021, 12, 16, 3), + # "30m" + # ), + # "groups" => ["a", "a", "a", "b", "b", "a", "a"] + # } + # ) + # df.groupby_dynamic( + # "time", + # every: "1h", + # closed: "both", + # by: "groups", + # include_boundaries: true + # ).agg([Polars.col("time").count.alias("time_count")]) + # # => + # # shape: (7, 5) + # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + # + # @example Dynamic groupby on an index column. + # df = Polars::DataFrame.new( + # { + # "idx" => Polars.arange(0, 6, eager: true), + # "A" => ["A", "A", "B", "B", "B", "C"] + # } + # ) + # df.groupby_dynamic( + # "idx", + # every: "2i", + # period: "3i", + # include_boundaries: true, + # closed: "right" + # ).agg(Polars.col("A").list.alias("A_agg_list")) + # # => + # # shape: (3, 4) + # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + # # └─────────────────┴─────────────────┴─────┴─────────────────┘ + def groupby_dynamic( + index_column, + every:, + period: nil, + offset: nil, + truncate: true, + include_boundaries: false, + closed: "left", + by: nil + ) + if offset.nil? + if period.nil? + offset = "-#{every}" + else + offset = "0ns" + end + end + + if period.nil? + period = every + end + + period = Utils._timedelta_to_pl_duration(period) + offset = Utils._timedelta_to_pl_duration(offset) + every = Utils._timedelta_to_pl_duration(every) + + rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by) + lgb = _ldf.groupby_dynamic( + index_column, + every, + period, + offset, + truncate, + include_boundaries, + closed, + rbexprs_by + ) + LazyGroupBy.new(lgb, self.class) + end + + # Perform an asof join. + # + # This is similar to a left-join except that we match on nearest key rather than + # equal keys. + # + # Both DataFrames must be sorted by the join_asof key. + # + # For each row in the left DataFrame: + # + # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key. + # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key. + # + # The default is "backward". + # + # @param other [LazyFrame] + # Lazy DataFrame to join with. + # @param left_on [String] + # Join column of the left DataFrame. + # @param right_on [String] + # Join column of the right DataFrame. + # @param on [String] + # Join column of both DataFrames. If set, `left_on` and `right_on` should be + # None. + # @param by [Object] + # Join on these columns before doing asof join. + # @param by_left [Object] + # Join on these columns before doing asof join. + # @param by_right [Object] + # Join on these columns before doing asof join. + # @param strategy ["backward", "forward"] + # Join strategy. + # @param suffix [String] + # Suffix to append to columns with a duplicate name. + # @param tolerance [Object] + # Numeric tolerance. By setting this the join will only be done if the near + # keys are within this distance. If an asof join is done on columns of dtype + # "Date", "Datetime", "Duration" or "Time" you use the following string + # language: + # + # - 1ns (1 nanosecond) + # - 1us (1 microsecond) + # - 1ms (1 millisecond) + # - 1s (1 second) + # - 1m (1 minute) + # - 1h (1 hour) + # - 1d (1 day) + # - 1w (1 week) + # - 1mo (1 calendar month) + # - 1y (1 calendar year) + # - 1i (1 index count) + # + # Or combine them: + # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + # + # @param allow_parallel [Boolean] + # Allow the physical plan to optionally evaluate the computation of both + # DataFrames up to the join in parallel. + # @param force_parallel [Boolean] + # Force the physical plan to evaluate the computation of both DataFrames up to + # the join in parallel. + # + # @return [LazyFrame] + def join_asof( + other, + left_on: nil, + right_on: nil, + on: nil, + by_left: nil, + by_right: nil, + by: nil, + strategy: "backward", + suffix: "_right", + tolerance: nil, + allow_parallel: true, + force_parallel: false + ) + if !other.is_a?(LazyFrame) + raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}" + end + + if on.is_a?(String) + left_on = on + right_on = on + end + + if left_on.nil? || right_on.nil? + raise ArgumentError, "You should pass the column to join on as an argument." + end + + if by_left.is_a?(String) || by_left.is_a?(Expr) + by_left_ = [by_left] + else + by_left_ = by_left + end + + if by_right.is_a?(String) || by_right.is_a?(Expr) + by_right_ = [by_right] + else + by_right_ = by_right + end + + if by.is_a?(String) + by_left_ = [by] + by_right_ = [by] + elsif by.is_a?(Array) + by_left_ = by + by_right_ = by + end + + tolerance_str = nil + tolerance_num = nil + if tolerance.is_a?(String) + tolerance_str = tolerance + else + tolerance_num = tolerance + end + + _from_rbldf( + _ldf.join_asof( + other._ldf, + Polars.col(left_on)._rbexpr, + Polars.col(right_on)._rbexpr, + by_left_, + by_right_, + allow_parallel, + force_parallel, + suffix, + strategy, + tolerance_num, + tolerance_str + ) + ) + end + # Add a join operation to the Logical Plan. # # @param other [LazyFrame] # Lazy DataFrame to join with. # @param left_on [Object] @@ -951,13 +1555,49 @@ end _from_rbldf(_ldf.with_columns(rbexprs)) end - # def with_context - # end + # Add an external context to the computation graph. + # + # This allows expressions to also access columns from DataFrames + # that are not part of this one. + # + # @param other [Object] + # Lazy DataFrame to join with. + # + # @return [LazyFrame] + # + # @example + # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy + # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]}) + # ( + # df_a.with_context(df_other.lazy).select( + # [Polars.col("b") + Polars.col("c").first] + # ) + # ).collect + # # => + # # shape: (3, 1) + # # ┌──────┐ + # # │ b │ + # # │ --- │ + # # │ str │ + # # ╞══════╡ + # # │ afoo │ + # # ├╌╌╌╌╌╌┤ + # # │ cfoo │ + # # ├╌╌╌╌╌╌┤ + # # │ null │ + # # └──────┘ + def with_context(other) + if !other.is_a?(Array) + other = [other] + end + _from_rbldf(_ldf.with_context(other.map(&:_ldf))) + end + # Add or overwrite column in a DataFrame. # # @param column [Object] # Expression that evaluates to column or a Series to use. # @@ -1229,12 +1869,47 @@ # @return [LazyFrame] def first slice(0, 1) end - # def with_row_count - # end + # Add a column at index 0 that counts the rows. + # + # @param name [String] + # Name of the column to add. + # @param offset [Integer] + # Start the row count at this offset. + # + # @return [LazyFrame] + # + # @note + # This can have a negative effect on query performance. + # This may, for instance, block predicate pushdown optimization. + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 3, 5], + # "b" => [2, 4, 6] + # } + # ).lazy + # df.with_row_count.collect + # # => + # # shape: (3, 3) + # # ┌────────┬─────┬─────┐ + # # │ row_nr ┆ a ┆ b │ + # # │ --- ┆ --- ┆ --- │ + # # │ u32 ┆ i64 ┆ i64 │ + # # ╞════════╪═════╪═════╡ + # # │ 0 ┆ 1 ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 1 ┆ 3 ┆ 4 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 5 ┆ 6 │ + # # └────────┴─────┴─────┘ + def with_row_count(name: "row_nr", offset: 0) + _from_rbldf(_ldf.with_row_count(name, offset)) + end # Take every nth row in the LazyFrame and return as a new LazyFrame. # # @return [LazyFrame] # @@ -1552,14 +2227,109 @@ subset = [subset] end _from_rbldf(_ldf.unique(maintain_order, subset, keep)) end - # def drop_nulls - # end + # Drop rows with null values from this LazyFrame. + # + # @param subset [Object] + # Subset of column(s) on which `drop_nulls` will be applied. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, nil, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.lazy.drop_nulls.collect + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8 ┆ c │ + # # └─────┴─────┴─────┘ + def drop_nulls(subset: nil) + if !subset.nil? && !subset.is_a?(Array) + subset = [subset] + end + _from_rbldf(_ldf.drop_nulls(subset)) + end - # def melt - # end + # Unpivot a DataFrame from wide to long format. + # + # Optionally leaves identifiers set. + # + # This function is useful to massage a DataFrame into a format where one or more + # columns are identifier variables (id_vars), while all other columns, considered + # measured variables (value_vars), are "unpivoted" to the row axis, leaving just + # two non-identifier columns, 'variable' and 'value'. + # + # @param id_vars [Object] + # Columns to use as identifier variables. + # @param value_vars [Object] + # Values to use as identifier variables. + # If `value_vars` is empty all columns that are not in `id_vars` will be used. + # @param variable_name [String] + # Name to give to the `value` column. Defaults to "variable" + # @param value_name [String] + # Name to give to the `value` column. Defaults to "value" + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => ["x", "y", "z"], + # "b" => [1, 3, 5], + # "c" => [2, 4, 6] + # } + # ).lazy + # df.melt(id_vars: "a", value_vars: ["b", "c"]).collect + # # => + # # shape: (6, 3) + # # ┌─────┬──────────┬───────┐ + # # │ a ┆ variable ┆ value │ + # # │ --- ┆ --- ┆ --- │ + # # │ str ┆ str ┆ i64 │ + # # ╞═════╪══════════╪═══════╡ + # # │ x ┆ b ┆ 1 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ y ┆ b ┆ 3 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ z ┆ b ┆ 5 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ x ┆ c ┆ 2 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ y ┆ c ┆ 4 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ z ┆ c ┆ 6 │ + # # └─────┴──────────┴───────┘ + def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil) + if value_vars.is_a?(String) + value_vars = [value_vars] + end + if id_vars.is_a?(String) + id_vars = [id_vars] + end + if value_vars.nil? + value_vars = [] + end + if id_vars.nil? + id_vars = [] + end + _from_rbldf( + _ldf.melt(id_vars, value_vars, value_name, variable_name) + ) + end # def map # end # Interpolate intermediate values. The interpolation method is linear.