lib/polars/lazy_frame.rb in polars-df-0.1.4 vs lib/polars/lazy_frame.rb in polars-df-0.1.5
- old
+ new
@@ -147,13 +147,24 @@
end
# def self.from_json
# end
- # def self.read_json
- # end
+ # Read a logical plan from a JSON file to construct a LazyFrame.
+ #
+ # @param file [String]
+ # Path to a file or a file-like object.
+ #
+ # @return [LazyFrame]
+ def self.read_json(file)
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+ Utils.wrap_ldf(RbLazyFrame.read_json(file))
+ end
+
# Get or set column names.
#
# @return [Array]
#
# @example
@@ -243,15 +254,61 @@
#{describe_plan}
EOS
end
- # def write_json
- # end
+ # Write the logical plan of this LazyFrame to a file or string in JSON format.
+ #
+ # @param file [String]
+ # File path to which the result should be written.
+ #
+ # @return [nil]
+ def write_json(file)
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+ _ldf.write_json(file)
+ nil
+ end
- # def pipe
- # end
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+ #
+ # @param func [Object]
+ # Callable; will receive the frame as the first parameter,
+ # followed by any given args/kwargs.
+ # @param args [Object]
+ # Arguments to pass to the UDF.
+ # @param kwargs [Object]
+ # Keyword arguments to pass to the UDF.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # cast_str_to_int = lambda do |data, col_name:|
+ # data.with_column(Polars.col(col_name).cast(:i64))
+ # end
+ #
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
+ # df.pipe(cast_str_to_int, col_name: "b").collect()
+ # # =>
+ # # shape: (4, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 10 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 20 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 30 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 4 ┆ 40 │
+ # # └─────┴─────┘
+ def pipe(func, *args, **kwargs, &block)
+ func.call(self, *args, **kwargs, &block)
+ end
# Create a string representation of the unoptimized query plan.
#
# @return [String]
def describe_plan
@@ -259,13 +316,32 @@
end
# Create a string representation of the optimized query plan.
#
# @return [String]
- # def describe_optimized_plan
- # end
+ def describe_optimized_plan(
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ slice_pushdown: true,
+ common_subplan_elimination: true,
+ allow_streaming: false
+ )
+ ldf = _ldf.optimization_toggle(
+ type_coercion,
+ predicate_pushdown,
+ projection_pushdown,
+ simplify_expression,
+ slice_pushdown,
+ common_subplan_elimination,
+ allow_streaming,
+ )
+ ldf.describe_optimized_plan
+ end
+
# def show_graph
# end
# Sort the DataFrame.
#
@@ -724,19 +800,547 @@
rbexprs_by = Utils.selection_to_rbexpr_list(by)
lgb = _ldf.groupby(rbexprs_by, maintain_order)
LazyGroupBy.new(lgb, self.class)
end
- # def groupby_rolling
- # end
+ # Create rolling groups based on a time column.
+ #
+ # Also works for index values of type `:i32` or `:i64`.
+ #
+ # Different from a `dynamic_groupby` the windows are now determined by the
+ # individual values and are not of constant intervals. For constant intervals
+ # use *groupby_dynamic*.
+ #
+ # The `period` and `offset` arguments are created either from a timedelta, or
+ # by using the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
+ #
+ # - "1i" # length 1
+ # - "10i" # length 10
+ #
+ # @param index_column [Object]
+ # Column used to group based on the time window.
+ #   Often of type Date/Datetime
+ # This column must be sorted in ascending order. If not the output will not
+ # make sense.
+ #
+ # In case of a rolling groupby on indices, dtype needs to be one of
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ # performance matters use an `:i64` column.
+ # @param period [Object]
+ # Length of the window.
+ # @param offset [Object]
+ # Offset of the window. Default is -period.
+ # @param closed ["right", "left", "both", "none"]
+ # Define whether the temporal window interval is closed or not.
+ # @param by [Object]
+ # Also group by this column/these columns.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # dates = [
+ # "2020-01-01 13:45:48",
+ # "2020-01-01 16:42:13",
+ # "2020-01-01 16:45:09",
+ # "2020-01-02 18:12:48",
+ # "2020-01-03 19:45:32",
+ # "2020-01-08 23:16:43"
+ # ]
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+ # Polars.col("dt").str.strptime(:datetime)
+ # )
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
+ # [
+ # Polars.sum("a").alias("sum_a"),
+ # Polars.min("a").alias("min_a"),
+ # Polars.max("a").alias("max_a")
+ # ]
+ # )
+ # # =>
+ # # shape: (6, 4)
+ # # ┌─────────────────────┬───────┬───────┬───────┐
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
+ # # └─────────────────────┴───────┴───────┴───────┘
+ def groupby_rolling(
+ index_column:,
+ period:,
+ offset: nil,
+ closed: "right",
+ by: nil
+ )
+ if offset.nil?
+ offset = "-#{period}"
+ end
- # def groupby_dynamic
- # end
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
+ period = Utils._timedelta_to_pl_duration(period)
+ offset = Utils._timedelta_to_pl_duration(offset)
- # def join_asof
- # end
+ lgb = _ldf.groupby_rolling(
+ index_column, period, offset, closed, rbexprs_by
+ )
+ LazyGroupBy.new(lgb, self.class)
+ end
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
+ #
+ # Time windows are calculated and rows are assigned to windows. Different from a
+ # normal groupby is that a row can be member of multiple groups. The time/index
+ # window could be seen as a rolling window, with a window size determined by
+ # dates/times/values instead of slots in the DataFrame.
+ #
+ # A window is defined by:
+ #
+ # - every: interval of the window
+ # - period: length of the window
+ # - offset: offset of the window
+ #
+ # The `every`, `period` and `offset` arguments are created with
+ # the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
+ #
+ # - "1i" # length 1
+ # - "10i" # length 10
+ #
+ # @param index_column
+ # Column used to group based on the time window.
+ #   Often of type Date/Datetime
+ # This column must be sorted in ascending order. If not the output will not
+ # make sense.
+ #
+ # In case of a dynamic groupby on indices, dtype needs to be one of
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ # performance matters use an `:i64` column.
+ # @param every
+ # Interval of the window.
+ # @param period
+ #   Length of the window, if nil it is equal to 'every'.
+ # @param offset
+ #   Offset of the window if nil and period is nil it will be equal to negative
+ # `every`.
+ # @param truncate
+ # Truncate the time value to the window lower bound.
+ # @param include_boundaries
+ #   Add the lower and upper bound of the window to the "_lower_boundary" and
+ #   "_upper_boundary" columns. This will impact performance because it's harder to
+ # parallelize
+ # @param closed ["right", "left", "both", "none"]
+ # Define whether the temporal window interval is closed or not.
+ # @param by
+ # Also group by this column/these columns
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => Polars.date_range(
+ # DateTime.new(2021, 12, 16),
+ # DateTime.new(2021, 12, 16, 3),
+ # "30m"
+ # ),
+ # "n" => 0..6
+ # }
+ # )
+ # # =>
+ # # shape: (7, 2)
+ # # ┌─────────────────────┬─────┐
+ # # │ time ┆ n │
+ # # │ --- ┆ --- │
+ # # │ datetime[μs] ┆ i64 │
+ # # ╞═════════════════════╪═════╡
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
+ # # └─────────────────────┴─────┘
+ #
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+ # [
+ # Polars.col("time").min.alias("time_min"),
+ # Polars.col("time").max.alias("time_max")
+ # ]
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+ # # │ time ┆ time_min ┆ time_max │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
+ #
+ # @example The window boundaries can also be added to the aggregation result.
+ # df.groupby_dynamic(
+ # "time", every: "1h", include_boundaries: true, closed: "right"
+ # ).agg([Polars.col("time").count.alias("time_count")])
+ # # =>
+ # # shape: (4, 4)
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example When closed="left", should not include right end of interval.
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+ # [
+ # Polars.col("time").count.alias("time_count"),
+ # Polars.col("time").list.alias("time_agg_list")
+ # ]
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
+ # # │ time ┆ time_count ┆ time_agg_list │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
+ #
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+ # [Polars.col("time").count.alias("time_count")]
+ # )
+ # # =>
+ # # shape: (5, 2)
+ # # ┌─────────────────────┬────────────┐
+ # # │ time ┆ time_count │
+ # # │ --- ┆ --- │
+ # # │ datetime[μs] ┆ u32 │
+ # # ╞═════════════════════╪════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
+ # # └─────────────────────┴────────────┘
+ #
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => Polars.date_range(
+ # DateTime.new(2021, 12, 16),
+ # DateTime.new(2021, 12, 16, 3),
+ # "30m"
+ # ),
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+ # }
+ # )
+ # df.groupby_dynamic(
+ # "time",
+ # every: "1h",
+ # closed: "both",
+ # by: "groups",
+ # include_boundaries: true
+ # ).agg([Polars.col("time").count.alias("time_count")])
+ # # =>
+ # # shape: (7, 5)
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example Dynamic groupby on an index column.
+ # df = Polars::DataFrame.new(
+ # {
+ # "idx" => Polars.arange(0, 6, eager: true),
+ # "A" => ["A", "A", "B", "B", "B", "C"]
+ # }
+ # )
+ # df.groupby_dynamic(
+ # "idx",
+ # every: "2i",
+ # period: "3i",
+ # include_boundaries: true,
+ # closed: "right"
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
+ # # =>
+ # # shape: (3, 4)
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
+ def groupby_dynamic(
+ index_column,
+ every:,
+ period: nil,
+ offset: nil,
+ truncate: true,
+ include_boundaries: false,
+ closed: "left",
+ by: nil
+ )
+ if offset.nil?
+ if period.nil?
+ offset = "-#{every}"
+ else
+ offset = "0ns"
+ end
+ end
+
+ if period.nil?
+ period = every
+ end
+
+ period = Utils._timedelta_to_pl_duration(period)
+ offset = Utils._timedelta_to_pl_duration(offset)
+ every = Utils._timedelta_to_pl_duration(every)
+
+ rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
+ lgb = _ldf.groupby_dynamic(
+ index_column,
+ every,
+ period,
+ offset,
+ truncate,
+ include_boundaries,
+ closed,
+ rbexprs_by
+ )
+ LazyGroupBy.new(lgb, self.class)
+ end
+
+ # Perform an asof join.
+ #
+ # This is similar to a left-join except that we match on nearest key rather than
+ # equal keys.
+ #
+ # Both DataFrames must be sorted by the join_asof key.
+ #
+ # For each row in the left DataFrame:
+ #
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+ #
+ # The default is "backward".
+ #
+ # @param other [LazyFrame]
+ # Lazy DataFrame to join with.
+ # @param left_on [String]
+ # Join column of the left DataFrame.
+ # @param right_on [String]
+ # Join column of the right DataFrame.
+ # @param on [String]
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+ #   nil.
+ # @param by [Object]
+ # Join on these columns before doing asof join.
+ # @param by_left [Object]
+ # Join on these columns before doing asof join.
+ # @param by_right [Object]
+ # Join on these columns before doing asof join.
+ # @param strategy ["backward", "forward"]
+ # Join strategy.
+ # @param suffix [String]
+ # Suffix to append to columns with a duplicate name.
+ # @param tolerance [Object]
+ # Numeric tolerance. By setting this the join will only be done if the near
+ # keys are within this distance. If an asof join is done on columns of dtype
+ # "Date", "Datetime", "Duration" or "Time" you use the following string
+ # language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # @param allow_parallel [Boolean]
+ # Allow the physical plan to optionally evaluate the computation of both
+ # DataFrames up to the join in parallel.
+ # @param force_parallel [Boolean]
+ # Force the physical plan to evaluate the computation of both DataFrames up to
+ # the join in parallel.
+ #
+ # @return [LazyFrame]
+ def join_asof(
+ other,
+ left_on: nil,
+ right_on: nil,
+ on: nil,
+ by_left: nil,
+ by_right: nil,
+ by: nil,
+ strategy: "backward",
+ suffix: "_right",
+ tolerance: nil,
+ allow_parallel: true,
+ force_parallel: false
+ )
+ if !other.is_a?(LazyFrame)
+ raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+ end
+
+ if on.is_a?(String)
+ left_on = on
+ right_on = on
+ end
+
+ if left_on.nil? || right_on.nil?
+ raise ArgumentError, "You should pass the column to join on as an argument."
+ end
+
+ if by_left.is_a?(String) || by_left.is_a?(Expr)
+ by_left_ = [by_left]
+ else
+ by_left_ = by_left
+ end
+
+ if by_right.is_a?(String) || by_right.is_a?(Expr)
+ by_right_ = [by_right]
+ else
+ by_right_ = by_right
+ end
+
+ if by.is_a?(String)
+ by_left_ = [by]
+ by_right_ = [by]
+ elsif by.is_a?(Array)
+ by_left_ = by
+ by_right_ = by
+ end
+
+ tolerance_str = nil
+ tolerance_num = nil
+ if tolerance.is_a?(String)
+ tolerance_str = tolerance
+ else
+ tolerance_num = tolerance
+ end
+
+ _from_rbldf(
+ _ldf.join_asof(
+ other._ldf,
+ Polars.col(left_on)._rbexpr,
+ Polars.col(right_on)._rbexpr,
+ by_left_,
+ by_right_,
+ allow_parallel,
+ force_parallel,
+ suffix,
+ strategy,
+ tolerance_num,
+ tolerance_str
+ )
+ )
+ end
+
# Add a join operation to the Logical Plan.
#
# @param other [LazyFrame]
# Lazy DataFrame to join with.
# @param left_on [Object]
@@ -951,13 +1555,49 @@
end
_from_rbldf(_ldf.with_columns(rbexprs))
end
- # def with_context
- # end
+ # Add an external context to the computation graph.
+ #
+ # This allows expressions to also access columns from DataFrames
+ # that are not part of this one.
+ #
+ # @param other [Object]
+ #   LazyFrame (or Array of LazyFrames) to add as external context.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
+ # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
+ # (
+ # df_a.with_context(df_other.lazy).select(
+ # [Polars.col("b") + Polars.col("c").first]
+ # )
+ # ).collect
+ # # =>
+ # # shape: (3, 1)
+ # # ┌──────┐
+ # # │ b │
+ # # │ --- │
+ # # │ str │
+ # # ╞══════╡
+ # # │ afoo │
+ # # ├╌╌╌╌╌╌┤
+ # # │ cfoo │
+ # # ├╌╌╌╌╌╌┤
+ # # │ null │
+ # # └──────┘
+ def with_context(other)
+ if !other.is_a?(Array)
+ other = [other]
+ end
+ _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
+ end
+
# Add or overwrite column in a DataFrame.
#
# @param column [Object]
# Expression that evaluates to column or a Series to use.
#
@@ -1229,12 +1869,47 @@
# @return [LazyFrame]
def first
slice(0, 1)
end
- # def with_row_count
- # end
+ # Add a column at index 0 that counts the rows.
+ #
+ # @param name [String]
+ # Name of the column to add.
+ # @param offset [Integer]
+ # Start the row count at this offset.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ # This can have a negative effect on query performance.
+ # This may, for instance, block predicate pushdown optimization.
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 3, 5],
+ # "b" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.with_row_count.collect
+ # # =>
+ # # shape: (3, 3)
+ # # ┌────────┬─────┬─────┐
+ # # │ row_nr ┆ a ┆ b │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ u32 ┆ i64 ┆ i64 │
+ # # ╞════════╪═════╪═════╡
+ # # │ 0 ┆ 1 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 1 ┆ 3 ┆ 4 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 5 ┆ 6 │
+ # # └────────┴─────┴─────┘
+ def with_row_count(name: "row_nr", offset: 0)
+ _from_rbldf(_ldf.with_row_count(name, offset))
+ end
# Take every nth row in the LazyFrame and return as a new LazyFrame.
#
# @return [LazyFrame]
#
@@ -1552,14 +2227,109 @@
subset = [subset]
end
_from_rbldf(_ldf.unique(maintain_order, subset, keep))
end
- # def drop_nulls
- # end
+ # Drop rows with null values from this LazyFrame.
+ #
+ # @param subset [Object]
+ # Subset of column(s) on which `drop_nulls` will be applied.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, nil, 8],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # )
+ # df.lazy.drop_nulls.collect
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 6 ┆ a │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 8 ┆ c │
+ # # └─────┴─────┴─────┘
+ def drop_nulls(subset: nil)
+ if !subset.nil? && !subset.is_a?(Array)
+ subset = [subset]
+ end
+ _from_rbldf(_ldf.drop_nulls(subset))
+ end
- # def melt
- # end
+ # Unpivot a DataFrame from wide to long format.
+ #
+ # Optionally leaves identifiers set.
+ #
+ # This function is useful to massage a DataFrame into a format where one or more
+ # columns are identifier variables (id_vars), while all other columns, considered
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+ # two non-identifier columns, 'variable' and 'value'.
+ #
+ # @param id_vars [Object]
+ # Columns to use as identifier variables.
+ # @param value_vars [Object]
+ #   Values to use as value variables.
+ # If `value_vars` is empty all columns that are not in `id_vars` will be used.
+ # @param variable_name [String]
+ #   Name to give to the `variable` column. Defaults to "variable"
+ # @param value_name [String]
+ # Name to give to the `value` column. Defaults to "value"
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["x", "y", "z"],
+ # "b" => [1, 3, 5],
+ # "c" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
+ # # =>
+ # # shape: (6, 3)
+ # # ┌─────┬──────────┬───────┐
+ # # │ a ┆ variable ┆ value │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ str ┆ i64 │
+ # # ╞═════╪══════════╪═══════╡
+ # # │ x ┆ b ┆ 1 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ y ┆ b ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ z ┆ b ┆ 5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ x ┆ c ┆ 2 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ y ┆ c ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ z ┆ c ┆ 6 │
+ # # └─────┴──────────┴───────┘
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+ if value_vars.is_a?(String)
+ value_vars = [value_vars]
+ end
+ if id_vars.is_a?(String)
+ id_vars = [id_vars]
+ end
+ if value_vars.nil?
+ value_vars = []
+ end
+ if id_vars.nil?
+ id_vars = []
+ end
+ _from_rbldf(
+ _ldf.melt(id_vars, value_vars, value_name, variable_name)
+ )
+ end
# def map
# end
# Interpolate intermediate values. The interpolation method is linear.