lazy_frame.rb in polars-df-0.1.5

- old
+ new

@@ -147,13 +147,24 @@
     end
 
     # def self.from_json
     # end
 
-    # def self.read_json
-    # end
+    # Read a logical plan from a JSON file to construct a LazyFrame.
+    #
+    # @param file [String]
+    #   Path to a file or a file-like object.
+    #
+    # @return [LazyFrame]
+    def self.read_json(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
 
+      Utils.wrap_ldf(RbLazyFrame.read_json(file))
+    end
+
     # Get or set column names.
     #
     # @return [Array]
     #
     # @example
@@ -243,15 +254,61 @@
 
         #{describe_plan}
       EOS
     end
 
-    # def write_json
-    # end
+    # Write the logical plan of this LazyFrame to a file or string in JSON format.
+    #
+    # @param file [String]
+    #   File path to which the result should be written.
+    #
+    # @return [nil]
+    def write_json(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+      _ldf.write_json(file)
+      nil
+    end
 
-    # def pipe
-    # end
+    # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+    #
+    # @param func [Object]
+    #   Callable; will receive the frame as the first parameter,
+    #   followed by any given args/kwargs.
+    # @param args [Object]
+    #   Arguments to pass to the UDF.
+    # @param kwargs [Object]
+    #   Keyword arguments to pass to the UDF.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   cast_str_to_int = lambda do |data, col_name:|
+    #     data.with_column(Polars.col(col_name).cast(:i64))
+    #   end
+    #
+    #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
+    #   df.pipe(cast_str_to_int, col_name: "b").collect()
+    #   # =>
+    #   # shape: (4, 2)
+    #   # ┌─────┬─────┐
+    #   # │ a   ┆ b   │
+    #   # │ --- ┆ --- │
+    #   # │ i64 ┆ i64 │
+    #   # ╞═════╪═════╡
+    #   # │ 1   ┆ 10  │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2   ┆ 20  │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 3   ┆ 30  │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 4   ┆ 40  │
+    #   # └─────┴─────┘
+    def pipe(func, *args, **kwargs, &block)
+      func.call(self, *args, **kwargs, &block)
+    end
 
     # Create a string representation of the unoptimized query plan.
     #
     # @return [String]
     def describe_plan
@@ -259,13 +316,32 @@
     end
 
     # Create a string representation of the optimized query plan.
     #
     # @return [String]
-    # def describe_optimized_plan
-    # end
+    def describe_optimized_plan(
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      slice_pushdown: true,
+      common_subplan_elimination: true,
+      allow_streaming: false
+    )
+      ldf = _ldf.optimization_toggle(
+        type_coercion,
+        predicate_pushdown,
+        projection_pushdown,
+        simplify_expression,
+        slice_pushdown,
+        common_subplan_elimination,
+        allow_streaming,
+      )
 
+      ldf.describe_optimized_plan
+    end
+
     # def show_graph
     # end
 
     # Sort the DataFrame.
     #
@@ -724,19 +800,547 @@
       rbexprs_by = Utils.selection_to_rbexpr_list(by)
       lgb = _ldf.groupby(rbexprs_by, maintain_order)
       LazyGroupBy.new(lgb, self.class)
     end
 
-    # def groupby_rolling
-    # end
+    # Create rolling groups based on a time column.
+    #
+    # Also works for index values of type `:i32` or `:i64`.
+    #
+    # Different from a `dynamic_groupby` the windows are now determined by the
+    # individual values and are not of constant intervals. For constant intervals
+    # use *groupby_dynamic*.
+    #
+    # The `period` and `offset` arguments are created either from a timedelta, or
+    # by using the following string language:
+    #
+    # - 1ns   (1 nanosecond)
+    # - 1us   (1 microsecond)
+    # - 1ms   (1 millisecond)
+    # - 1s    (1 second)
+    # - 1m    (1 minute)
+    # - 1h    (1 hour)
+    # - 1d    (1 day)
+    # - 1w    (1 week)
+    # - 1mo   (1 calendar month)
+    # - 1y    (1 calendar year)
+    # - 1i    (1 index count)
+    #
+    # Or combine them:
+    # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+    #
+    # In case of a groupby_rolling on an integer column, the windows are defined by:
+    #
+    # - "1i"      # length 1
+    # - "10i"     # length 10
+    #
+    # @param index_column [Object]
+    #   Column used to group based on the time window.
+    #   Often to type Date/Datetime
+    #   This column must be sorted in ascending order. If not the output will not
+    #   make sense.
+    #
+    #   In case of a rolling groupby on indices, dtype needs to be one of
+    #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+    #   performance matters use an `:i64` column.
+    # @param period [Object]
+    #   Length of the window.
+    # @param offset [Object]
+    #   Offset of the window. Default is -period.
+    # @param closed ["right", "left", "both", "none"]
+    #   Define whether the temporal window interval is closed or not.
+    # @param by [Object]
+    #   Also group by this column/these columns.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   dates = [
+    #     "2020-01-01 13:45:48",
+    #     "2020-01-01 16:42:13",
+    #     "2020-01-01 16:45:09",
+    #     "2020-01-02 18:12:48",
+    #     "2020-01-03 19:45:32",
+    #     "2020-01-08 23:16:43"
+    #   ]
+    #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+    #     Polars.col("dt").str.strptime(:datetime)
+    #   )
+    #   df.groupby_rolling(index_column: "dt", period: "2d").agg(
+    #     [
+    #       Polars.sum("a").alias("sum_a"),
+    #       Polars.min("a").alias("min_a"),
+    #       Polars.max("a").alias("max_a")
+    #     ]
+    #   )
+    #   # =>
+    #   # shape: (6, 4)
+    #   # ┌─────────────────────┬───────┬───────┬───────┐
+    #   # │ dt                  ┆ sum_a ┆ min_a ┆ max_a │
+    #   # │ ---                 ┆ ---   ┆ ---   ┆ ---   │
+    #   # │ datetime[μs]        ┆ i64   ┆ i64   ┆ i64   │
+    #   # ╞═════════════════════╪═══════╪═══════╪═══════╡
+    #   # │ 2020-01-01 13:45:48 ┆ 3     ┆ 3     ┆ 3     │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ 2020-01-01 16:42:13 ┆ 10    ┆ 3     ┆ 7     │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ 2020-01-01 16:45:09 ┆ 15    ┆ 3     ┆ 7     │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ 2020-01-02 18:12:48 ┆ 24    ┆ 3     ┆ 9     │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
+    #   # └─────────────────────┴───────┴───────┴───────┘
+    def groupby_rolling(
+      index_column:,
+      period:,
+      offset: nil,
+      closed: "right",
+      by: nil
+    )
+      if offset.nil?
+        offset = "-#{period}"
+      end
 
-    # def groupby_dynamic
-    # end
+      rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
+      period = Utils._timedelta_to_pl_duration(period)
+      offset = Utils._timedelta_to_pl_duration(offset)
 
-    # def join_asof
-    # end
+      lgb = _ldf.groupby_rolling(
+        index_column, period, offset, closed, rbexprs_by
+      )
+      LazyGroupBy.new(lgb, self.class)
+    end
 
+    # Group based on a time value (or index value of type `:i32`, `:i64`).
+    #
+    # Time windows are calculated and rows are assigned to windows. Different from a
+    # normal groupby is that a row can be member of multiple groups. The time/index
+    # window could be seen as a rolling window, with a window size determined by
+    # dates/times/values instead of slots in the DataFrame.
+    #
+    # A window is defined by:
+    #
+    # - every: interval of the window
+    # - period: length of the window
+    # - offset: offset of the window
+    #
+    # The `every`, `period` and `offset` arguments are created with
+    # the following string language:
+    #
+    # - 1ns   (1 nanosecond)
+    # - 1us   (1 microsecond)
+    # - 1ms   (1 millisecond)
+    # - 1s    (1 second)
+    # - 1m    (1 minute)
+    # - 1h    (1 hour)
+    # - 1d    (1 day)
+    # - 1w    (1 week)
+    # - 1mo   (1 calendar month)
+    # - 1y    (1 calendar year)
+    # - 1i    (1 index count)
+    #
+    # Or combine them:
+    # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+    #
+    # In case of a groupby_dynamic on an integer column, the windows are defined by:
+    #
+    # - "1i"      # length 1
+    # - "10i"     # length 10
+    #
+    # @param index_column
+    #   Column used to group based on the time window.
+    #   Often to type Date/Datetime
+    #   This column must be sorted in ascending order. If not the output will not
+    #   make sense.
+    #
+    #   In case of a dynamic groupby on indices, dtype needs to be one of
+    #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+    #   performance matters use an `:i64` column.
+    # @param every
+    #   Interval of the window.
+    # @param period
+    #   Length of the window, if None it is equal to 'every'.
+    # @param offset
+    #   Offset of the window if None and period is None it will be equal to negative
+    #   `every`.
+    # @param truncate
+    #   Truncate the time value to the window lower bound.
+    # @param include_boundaries
+    #   Add the lower and upper bound of the window to the "_lower_bound" and
+    #   "_upper_bound" columns. This will impact performance because it's harder to
+    #   parallelize
+    # @param closed ["right", "left", "both", "none"]
+    #   Define whether the temporal window interval is closed or not.
+    # @param by
+    #   Also group by this column/these columns
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "time" => Polars.date_range(
+    #         DateTime.new(2021, 12, 16),
+    #         DateTime.new(2021, 12, 16, 3),
+    #         "30m"
+    #       ),
+    #       "n" => 0..6
+    #     }
+    #   )
+    #   # =>
+    #   # shape: (7, 2)
+    #   # ┌─────────────────────┬─────┐
+    #   # │ time                ┆ n   │
+    #   # │ ---                 ┆ --- │
+    #   # │ datetime[μs]        ┆ i64 │
+    #   # ╞═════════════════════╪═════╡
+    #   # │ 2021-12-16 00:00:00 ┆ 0   │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2021-12-16 00:30:00 ┆ 1   │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2021-12-16 01:00:00 ┆ 2   │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2021-12-16 01:30:00 ┆ 3   │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2021-12-16 02:00:00 ┆ 4   │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2021-12-16 02:30:00 ┆ 5   │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2021-12-16 03:00:00 ┆ 6   │
+    #   # └─────────────────────┴─────┘
+    #
+    # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+    #   df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+    #     [
+    #       Polars.col("time").min.alias("time_min"),
+    #       Polars.col("time").max.alias("time_max")
+    #     ]
+    #   )
+    #   # =>
+    #   # shape: (4, 3)
+    #   # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+    #   # │ time                ┆ time_min            ┆ time_max            │
+    #   # │ ---                 ┆ ---                 ┆ ---                 │
+    #   # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        │
+    #   # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+    #   # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+    #   # └─────────────────────┴─────────────────────┴─────────────────────┘
+    #
+    # @example The window boundaries can also be added to the aggregation result.
+    #   df.groupby_dynamic(
+    #     "time", every: "1h", include_boundaries: true, closed: "right"
+    #   ).agg([Polars.col("time").count.alias("time_count")])
+    #   # =>
+    #   # shape: (4, 4)
+    #   # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+    #   # │ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+    #   # │ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+    #   # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+    #   # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+    #   # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2          │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+    #   # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+    #
+    # @example When closed="left", should not include right end of interval.
+    #   df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+    #     [
+    #       Polars.col("time").count.alias("time_count"),
+    #       Polars.col("time").list.alias("time_agg_list")
+    #     ]
+    #   )
+    #   # =>
+    #   # shape: (4, 3)
+    #   # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
+    #   # │ time                ┆ time_count ┆ time_agg_list                       │
+    #   # │ ---                 ┆ ---        ┆ ---                                 │
+    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                  │
+    #   # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
+    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16... │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16... │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16... │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]               │
+    #   # └─────────────────────┴────────────┴─────────────────────────────────────┘
+    #
+    # @example When closed="both" the time values at the window boundaries belong to 2 groups.
+    #   df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+    #     [Polars.col("time").count.alias("time_count")]
+    #   )
+    #   # =>
+    #   # shape: (5, 2)
+    #   # ┌─────────────────────┬────────────┐
+    #   # │ time                ┆ time_count │
+    #   # │ ---                 ┆ ---        │
+    #   # │ datetime[μs]        ┆ u32        │
+    #   # ╞═════════════════════╪════════════╡
+    #   # │ 2021-12-15 23:00:00 ┆ 1          │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 00:00:00 ┆ 3          │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 01:00:00 ┆ 3          │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 02:00:00 ┆ 3          │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2021-12-16 03:00:00 ┆ 1          │
+    #   # └─────────────────────┴────────────┘
+    #
+    # @example Dynamic groupbys can also be combined with grouping on normal keys.
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "time" => Polars.date_range(
+    #         DateTime.new(2021, 12, 16),
+    #         DateTime.new(2021, 12, 16, 3),
+    #         "30m"
+    #       ),
+    #       "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+    #     }
+    #   )
+    #   df.groupby_dynamic(
+    #     "time",
+    #     every: "1h",
+    #     closed: "both",
+    #     by: "groups",
+    #     include_boundaries: true
+    #   ).agg([Polars.col("time").count.alias("time_count")])
+    #   # =>
+    #   # shape: (7, 5)
+    #   # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+    #   # │ groups ┆ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+    #   # │ ---    ┆ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+    #   # │ str    ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+    #   # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+    #   # │ a      ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+    #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ a      ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3          │
+    #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ a      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1          │
+    #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ a      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+    #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ a      ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1          │
+    #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ b      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+    #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1          │
+    #   # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+    #
+    # @example Dynamic groupby on an index column.
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "idx" => Polars.arange(0, 6, eager: true),
+    #       "A" => ["A", "A", "B", "B", "B", "C"]
+    #     }
+    #   )
+    #   df.groupby_dynamic(
+    #     "idx",
+    #     every: "2i",
+    #     period: "3i",
+    #     include_boundaries: true,
+    #     closed: "right"
+    #   ).agg(Polars.col("A").list.alias("A_agg_list"))
+    #   # =>
+    #   # shape: (3, 4)
+    #   # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+    #   # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list      │
+    #   # │ ---             ┆ ---             ┆ --- ┆ ---             │
+    #   # │ i64             ┆ i64             ┆ i64 ┆ list[str]       │
+    #   # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+    #   # │ 0               ┆ 3               ┆ 0   ┆ ["A", "B", "B"] │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
+    #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+    #   # │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
+    #   # └─────────────────┴─────────────────┴─────┴─────────────────┘
+    def groupby_dynamic(
+      index_column,
+      every:,
+      period: nil,
+      offset: nil,
+      truncate: true,
+      include_boundaries: false,
+      closed: "left",
+      by: nil
+    )
+      if offset.nil?
+        if period.nil?
+          offset = "-#{every}"
+        else
+          offset = "0ns"
+        end
+      end
+
+      if period.nil?
+        period = every
+      end
+
+      period = Utils._timedelta_to_pl_duration(period)
+      offset = Utils._timedelta_to_pl_duration(offset)
+      every = Utils._timedelta_to_pl_duration(every)
+
+      rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
+      lgb = _ldf.groupby_dynamic(
+        index_column,
+        every,
+        period,
+        offset,
+        truncate,
+        include_boundaries,
+        closed,
+        rbexprs_by
+      )
+      LazyGroupBy.new(lgb, self.class)
+    end
+
+    # Perform an asof join.
+    #
+    # This is similar to a left-join except that we match on nearest key rather than
+    # equal keys.
+    #
+    # Both DataFrames must be sorted by the join_asof key.
+    #
+    # For each row in the left DataFrame:
+    #
+    # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+    # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+    #
+    # The default is "backward".
+    #
+    # @param other [LazyFrame]
+    #   Lazy DataFrame to join with.
+    # @param left_on [String]
+    #   Join column of the left DataFrame.
+    # @param right_on [String]
+    #   Join column of the right DataFrame.
+    # @param on [String]
+    #   Join column of both DataFrames. If set, `left_on` and `right_on` should be
+    #   None.
+    # @param by [Object]
+    #   Join on these columns before doing asof join.
+    # @param by_left [Object]
+    #   Join on these columns before doing asof join.
+    # @param by_right [Object]
+    #   Join on these columns before doing asof join.
+    # @param strategy ["backward", "forward"]
+    #   Join strategy.
+    # @param suffix [String]
+    #   Suffix to append to columns with a duplicate name.
+    # @param tolerance [Object]
+    #   Numeric tolerance. By setting this the join will only be done if the near
+    #   keys are within this distance. If an asof join is done on columns of dtype
+    #   "Date", "Datetime", "Duration" or "Time" you use the following string
+    #   language:
+    #
+    #   - 1ns   (1 nanosecond)
+    #   - 1us   (1 microsecond)
+    #   - 1ms   (1 millisecond)
+    #   - 1s    (1 second)
+    #   - 1m    (1 minute)
+    #   - 1h    (1 hour)
+    #   - 1d    (1 day)
+    #   - 1w    (1 week)
+    #   - 1mo   (1 calendar month)
+    #   - 1y    (1 calendar year)
+    #   - 1i    (1 index count)
+    #
+    #   Or combine them:
+    #   "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+    #
+    # @param allow_parallel [Boolean]
+    #   Allow the physical plan to optionally evaluate the computation of both
+    #   DataFrames up to the join in parallel.
+    # @param force_parallel [Boolean]
+    #   Force the physical plan to evaluate the computation of both DataFrames up to
+    #   the join in parallel.
+    #
+    # @return [LazyFrame]
+    def join_asof(
+      other,
+      left_on: nil,
+      right_on: nil,
+      on: nil,
+      by_left: nil,
+      by_right: nil,
+      by: nil,
+      strategy: "backward",
+      suffix: "_right",
+      tolerance: nil,
+      allow_parallel: true,
+      force_parallel: false
+    )
+      if !other.is_a?(LazyFrame)
+        raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
+      end
+
+      if on.is_a?(String)
+        left_on = on
+        right_on = on
+      end
+
+      if left_on.nil? || right_on.nil?
+        raise ArgumentError, "You should pass the column to join on as an argument."
+      end
+
+      if by_left.is_a?(String) || by_left.is_a?(Expr)
+        by_left_ = [by_left]
+      else
+        by_left_ = by_left
+      end
+
+      if by_right.is_a?(String) || by_right.is_a?(Expr)
+        by_right_ = [by_right]
+      else
+        by_right_ = by_right
+      end
+
+      if by.is_a?(String)
+        by_left_ = [by]
+        by_right_ = [by]
+      elsif by.is_a?(Array)
+        by_left_ = by
+        by_right_ = by
+      end
+
+      tolerance_str = nil
+      tolerance_num = nil
+      if tolerance.is_a?(String)
+        tolerance_str = tolerance
+      else
+        tolerance_num = tolerance
+      end
+
+      _from_rbldf(
+        _ldf.join_asof(
+          other._ldf,
+          Polars.col(left_on)._rbexpr,
+          Polars.col(right_on)._rbexpr,
+          by_left_,
+          by_right_,
+          allow_parallel,
+          force_parallel,
+          suffix,
+          strategy,
+          tolerance_num,
+          tolerance_str
+        )
+      )
+    end
+
     # Add a join operation to the Logical Plan.
     #
     # @param other [LazyFrame]
     #   Lazy DataFrame to join with.
     # @param left_on [Object]
@@ -951,13 +1555,49 @@
       end
 
       _from_rbldf(_ldf.with_columns(rbexprs))
     end
 
-    # def with_context
-    # end
+    # Add an external context to the computation graph.
+    #
+    # This allows expressions to also access columns from DataFrames
+    # that are not part of this one.
+    #
+    # @param other [Object]
+    #   Lazy DataFrame to join with.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
+    #   df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
+    #   (
+    #     df_a.with_context(df_other.lazy).select(
+    #       [Polars.col("b") + Polars.col("c").first]
+    #     )
+    #   ).collect
+    #   # =>
+    #   # shape: (3, 1)
+    #   # ┌──────┐
+    #   # │ b    │
+    #   # │ ---  │
+    #   # │ str  │
+    #   # ╞══════╡
+    #   # │ afoo │
+    #   # ├╌╌╌╌╌╌┤
+    #   # │ cfoo │
+    #   # ├╌╌╌╌╌╌┤
+    #   # │ null │
+    #   # └──────┘
+    def with_context(other)
+      if !other.is_a?(Array)
+        other = [other]
+      end
 
+      _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
+    end
+
     # Add or overwrite column in a DataFrame.
     #
     # @param column [Object]
     #   Expression that evaluates to column or a Series to use.
     #
@@ -1229,12 +1869,47 @@
     # @return [LazyFrame]
     def first
       slice(0, 1)
     end
 
-    # def with_row_count
-    # end
+    # Add a column at index 0 that counts the rows.
+    #
+    # @param name [String]
+    #   Name of the column to add.
+    # @param offset [Integer]
+    #   Start the row count at this offset.
+    #
+    # @return [LazyFrame]
+    #
+    # @note
+    #   This can have a negative effect on query performance.
+    #   This may, for instance, block predicate pushdown optimization.
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => [1, 3, 5],
+    #       "b" => [2, 4, 6]
+    #     }
+    #   ).lazy
+    #   df.with_row_count.collect
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌────────┬─────┬─────┐
+    #   # │ row_nr ┆ a   ┆ b   │
+    #   # │ ---    ┆ --- ┆ --- │
+    #   # │ u32    ┆ i64 ┆ i64 │
+    #   # ╞════════╪═════╪═════╡
+    #   # │ 0      ┆ 1   ┆ 2   │
+    #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 1      ┆ 3   ┆ 4   │
+    #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2      ┆ 5   ┆ 6   │
+    #   # └────────┴─────┴─────┘
+    def with_row_count(name: "row_nr", offset: 0)
+      _from_rbldf(_ldf.with_row_count(name, offset))
+    end
 
     # Take every nth row in the LazyFrame and return as a new LazyFrame.
     #
     # @return [LazyFrame]
     #
@@ -1552,14 +2227,109 @@
         subset = [subset]
       end
       _from_rbldf(_ldf.unique(maintain_order, subset, keep))
     end
 
-    # def drop_nulls
-    # end
+    # Drop rows with null values from this LazyFrame.
+    #
+    # @param subset [Object]
+    #   Subset of column(s) on which `drop_nulls` will be applied.
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [6, nil, 8],
+    #       "ham" => ["a", "b", "c"]
+    #     }
+    #   )
+    #   df.lazy.drop_nulls.collect
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌─────┬─────┬─────┐
+    #   # │ foo ┆ bar ┆ ham │
+    #   # │ --- ┆ --- ┆ --- │
+    #   # │ i64 ┆ i64 ┆ str │
+    #   # ╞═════╪═════╪═════╡
+    #   # │ 1   ┆ 6   ┆ a   │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 3   ┆ 8   ┆ c   │
+    #   # └─────┴─────┴─────┘
+    def drop_nulls(subset: nil)
+      if !subset.nil? && !subset.is_a?(Array)
+        subset = [subset]
+      end
+      _from_rbldf(_ldf.drop_nulls(subset))
+    end
 
-    # def melt
-    # end
+    # Unpivot a DataFrame from wide to long format.
+    #
+    # Optionally leaves identifiers set.
+    #
+    # This function is useful to massage a DataFrame into a format where one or more
+    # columns are identifier variables (id_vars), while all other columns, considered
+    # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+    # two non-identifier columns, 'variable' and 'value'.
+    #
+    # @param id_vars [Object]
+    #   Columns to use as identifier variables.
+    # @param value_vars [Object]
+    #   Values to use as identifier variables.
+    #   If `value_vars` is empty all columns that are not in `id_vars` will be used.
+    # @param variable_name [String]
+    #   Name to give to the `value` column. Defaults to "variable"
+    # @param value_name [String]
+    #   Name to give to the `value` column. Defaults to "value"
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => ["x", "y", "z"],
+    #       "b" => [1, 3, 5],
+    #       "c" => [2, 4, 6]
+    #     }
+    #   ).lazy
+    #   df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
+    #   # =>
+    #   # shape: (6, 3)
+    #   # ┌─────┬──────────┬───────┐
+    #   # │ a   ┆ variable ┆ value │
+    #   # │ --- ┆ ---      ┆ ---   │
+    #   # │ str ┆ str      ┆ i64   │
+    #   # ╞═════╪══════════╪═══════╡
+    #   # │ x   ┆ b        ┆ 1     │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ y   ┆ b        ┆ 3     │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ z   ┆ b        ┆ 5     │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ x   ┆ c        ┆ 2     │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ y   ┆ c        ┆ 4     │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    #   # │ z   ┆ c        ┆ 6     │
+    #   # └─────┴──────────┴───────┘
+    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+      if value_vars.is_a?(String)
+        value_vars = [value_vars]
+      end
+      if id_vars.is_a?(String)
+        id_vars = [id_vars]
+      end
+      if value_vars.nil?
+        value_vars = []
+      end
+      if id_vars.nil?
+        id_vars = []
+      end
+      _from_rbldf(
+        _ldf.melt(id_vars, value_vars, value_name, variable_name)
+      )
+    end
 
     # def map
     # end
 
     # Interpolate intermediate values. The interpolation method is linear.