lib/polars/lazy_frame.rb in polars-df-0.1.2 vs lib/polars/lazy_frame.rb in polars-df-0.1.3
- old
+ new
@@ -150,56 +150,228 @@
# end
# def self.read_json
# end
- # def columns
- # end
+ # Get or set column names.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = (
+ # Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # )
+ # .lazy
+ # .select(["foo", "bar"])
+ # )
+ # df.columns
+ # # => ["foo", "bar"]
+ def columns
+ _ldf.columns
+ end
- # def dtypes
- # end
+ # Get dtypes of columns in LazyFrame.
+ #
+ # @return [Array]
+ #
+ # @example
+ # lf = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # lf.dtypes
+ # # => [:i64, :f64, :str]
+ def dtypes
+ _ldf.dtypes
+ end
- # def schema
- # end
+ # Get the schema.
+ #
+ # @return [Hash]
+ #
+ # @example
+ # lf = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # lf.schema
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
+ def schema
+ _ldf.schema
+ end
- # def width
- # end
+ # Get the width of the LazyFrame.
+ #
+ # @return [Integer]
+ #
+ # @example
+ # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
+ # lf.width
+ # # => 2
+ def width
+ _ldf.width
+ end
- # def include?(key)
- # end
+ # Check if LazyFrame includes key.
+ #
+ # @return [Boolean]
+ def include?(key)
+ columns.include?(key)
+ end
# clone handled by initialize_copy
# def [](item)
# end
- # def to_s
- # end
- # alias_method :inspect, :to_s
+ # Returns a string representing the LazyFrame.
+ #
+ # @return [String]
+ def to_s
+ # Builds a two-line summary: a fixed header followed by the output of
+ # #describe_plan (the unoptimized plan).
+ # NOTE(review): the header points readers at #describe_optimized_plan, which
+ # appears only as a commented-out stub in this part of the file - confirm it
+ # is implemented before relying on this hint.
+ <<~EOS
+ naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
+ #{describe_plan}
+ EOS
+ end
+
# def write_json
# end
# def pipe
# end
- # def describe_plan
- # end
+ # Create a string representation of the unoptimized query plan.
+ #
+ # @return [String]
+ def describe_plan
+ _ldf.describe_plan
+ end
+ # Create a string representation of the optimized query plan.
+ #
+ # @return [String]
# def describe_optimized_plan
# end
# def show_graph
# end
- # def sort
- # end
+ # Sort the DataFrame.
+ #
+ # Sorting can be done by:
+ #
+ # - A single column name
+ # - An expression
+ # - Multiple expressions
+ #
+ # @param by [Object]
+ # Column (expressions) to sort by.
+ # @param reverse [Boolean]
+ # Sort in descending order.
+ # @param nulls_last [Boolean]
+ # Place null values last. Can only be used if sorted by a single column.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # df.sort("foo", reverse: true).collect
+ # # =>
+ # # shape: (3, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 3 ┆ 8.0 ┆ c │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 7.0 ┆ b │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 1 ┆ 6.0 ┆ a │
+ # # └─────┴─────┴─────┘
+ def sort(by, reverse: false, nulls_last: false)
+ if by.is_a?(String)
+ _from_rbldf(_ldf.sort(by, reverse, nulls_last))
+ end
+ if Utils.bool?(reverse)
+ reverse = [reverse]
+ end
+ by = Utils.selection_to_rbexpr_list(by)
+ _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
+ end
+
# def profile
# end
+ # Collect into a DataFrame.
#
+ # Note: use {#fetch} if you want to run your query on the first `n` rows
+ # only. This can be a huge time saver in debugging queries.
+ #
+ # @param type_coercion [Boolean]
+ # Do type coercion optimization.
+ # @param predicate_pushdown [Boolean]
+ # Do predicate pushdown optimization.
+ # @param projection_pushdown [Boolean]
+ # Do projection pushdown optimization.
+ # @param simplify_expression [Boolean]
+ # Run simplify expressions optimization.
+ # @param string_cache [Boolean]
+ # This argument is deprecated. Please set the string cache globally.
+ # The argument will be ignored
+ # @param no_optimization [Boolean]
+ # Turn off (certain) optimizations.
+ # @param slice_pushdown [Boolean]
+ # Slice pushdown optimization.
+ # @param common_subplan_elimination [Boolean]
+ # Will try to cache branching subplans that occur on self-joins or unions.
+ # @param allow_streaming [Boolean]
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["a", "b", "a", "b", "b", "c"],
+ # "b" => [1, 2, 3, 4, 5, 6],
+ # "c" => [6, 5, 4, 3, 2, 1]
+ # }
+ # ).lazy
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
+ # # =>
+ # # shape: (3, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ a ┆ 4 ┆ 10 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ b ┆ 11 ┆ 10 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ c ┆ 6 ┆ 1 │
+ # # └─────┴─────┴─────┘
def collect(
type_coercion: true,
predicate_pushdown: true,
projection_pushdown: true,
simplify_expression: true,
@@ -230,38 +402,326 @@
allow_streaming
)
Utils.wrap_df(ldf.collect)
end
- # def fetch
- # end
+ # Collect a small number of rows for debugging purposes.
+ #
+ # Fetch is like a {#collect} operation, but it overwrites the number of rows
+ # read by every scan operation. This is a utility that helps debug a query on a
+ # smaller number of rows.
+ #
+ # Note that the fetch does not guarantee the final number of rows in the
+ # DataFrame. Filter, join operations and a lower number of rows available in the
+ # scanned file influence the final number of rows.
+ #
+ # @param n_rows [Integer]
+ # Collect n_rows from the data sources.
+ # @param type_coercion [Boolean]
+ # Run type coercion optimization.
+ # @param predicate_pushdown [Boolean]
+ # Run predicate pushdown optimization.
+ # @param projection_pushdown [Boolean]
+ # Run projection pushdown optimization.
+ # @param simplify_expression [Boolean]
+ # Run simplify expressions optimization.
+ # @param string_cache [Boolean]
+ # This argument is deprecated. Please set the string cache globally.
+ # The argument will be ignored
+ # @param no_optimization [Boolean]
+ # Turn off optimizations.
+ # @param slice_pushdown [Boolean]
+ # Slice pushdown optimization
+ # @param common_subplan_elimination [Boolean]
+ # Will try to cache branching subplans that occur on self-joins or unions.
+ # @param allow_streaming [Boolean]
+ # Run parts of the query in a streaming fashion (this is in an alpha state)
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["a", "b", "a", "b", "b", "c"],
+ # "b" => [1, 2, 3, 4, 5, 6],
+ # "c" => [6, 5, 4, 3, 2, 1]
+ # }
+ # ).lazy
+ # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ a ┆ 1 ┆ 6 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ b ┆ 2 ┆ 5 │
+ # # └─────┴─────┴─────┘
+ def fetch(
+ n_rows = 500,
+ type_coercion: true,
+ predicate_pushdown: true,
+ projection_pushdown: true,
+ simplify_expression: true,
+ string_cache: false,
+ no_optimization: false,
+ slice_pushdown: true,
+ common_subplan_elimination: true,
+ allow_streaming: false
+ )
+ if no_optimization
+ predicate_pushdown = false
+ projection_pushdown = false
+ slice_pushdown = false
+ common_subplan_elimination = false
+ end
+ ldf = _ldf.optimization_toggle(
+ type_coercion,
+ predicate_pushdown,
+ projection_pushdown,
+ simplify_expression,
+ slice_pushdown,
+ common_subplan_elimination,
+ allow_streaming
+ )
+ Utils.wrap_df(ldf.fetch(n_rows))
+ end
+
+ # Return lazy representation, i.e. itself.
#
+ # Useful for writing code that expects either a `DataFrame` or
+ # `LazyFrame`.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [nil, 2, 3, 4],
+ # "b" => [0.5, nil, 2.5, 13],
+ # "c" => [true, true, false, nil]
+ # }
+ # )
+ # df.lazy
def lazy
# The receiver is already lazy, so it is returned as-is (no copy is made).
self
end
- # def cache
- # end
+ # Cache the result once the execution of the physical plan hits this node.
+ #
+ # @return [LazyFrame]
+ def cache
+ _from_rbldf(_ldf.cache)
+ end
- # def cleared
- # end
+ # Create an empty copy of the current LazyFrame.
+ #
+ # The copy has an identical schema but no data.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [nil, 2, 3, 4],
+ # "b" => [0.5, nil, 2.5, 13],
+ # "c" => [true, true, false, nil],
+ # }
+ # ).lazy
+ # df.cleared.fetch
+ # # =>
+ # # shape: (0, 3)
+ # # ┌─────┬─────┬──────┐
+ # # │ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ bool │
+ # # ╞═════╪═════╪══════╡
+ # # └─────┴─────┴──────┘
+ def cleared
+ DataFrame.new(columns: schema).lazy
+ end
+ # Filter the rows in the DataFrame based on a predicate expression.
#
+ # @param predicate [Object]
+ # Expression that evaluates to a boolean Series.
+ #
+ # @return [LazyFrame]
+ #
+ # @example Filter on one condition:
+ # lf = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # lf.filter(Polars.col("foo") < 3).collect()
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 6 ┆ a │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 7 ┆ b │
+ # # └─────┴─────┴─────┘
+ #
+ # @example Filter on multiple conditions:
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
+ # # =>
+ # # shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 6 ┆ a │
+ # # └─────┴─────┴─────┘
def filter(predicate)
  # The predicate is coerced by the Utils helper (called with str_to_lit: false)
  # and its native expression is handed to the underlying lazy frame.
  native = _ldf
  rbexpr = Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
  _from_rbldf(native.filter(rbexpr))
end
+ # Select columns from this DataFrame.
+ #
+ # @param exprs [Object]
+ # Column or columns to select.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"],
+ # }
+ # ).lazy
+ # df.select("foo").collect
+ # # =>
+ # # shape: (3, 1)
+ # # ┌─────┐
+ # # │ foo │
+ # # │ --- │
+ # # │ i64 │
+ # # ╞═════╡
+ # # │ 1 │
+ # # ├╌╌╌╌╌┤
+ # # │ 2 │
+ # # ├╌╌╌╌╌┤
+ # # │ 3 │
+ # # └─────┘
+ #
+ # @example
+ # df.select(["foo", "bar"]).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ foo ┆ bar │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 6 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 7 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 8 │
+ # # └─────┴─────┘
+ #
+ # @example
+ # df.select(Polars.col("foo") + 1).collect
+ # # =>
+ # # shape: (3, 1)
+ # # ┌─────┐
+ # # │ foo │
+ # # │ --- │
+ # # │ i64 │
+ # # ╞═════╡
+ # # │ 2 │
+ # # ├╌╌╌╌╌┤
+ # # │ 3 │
+ # # ├╌╌╌╌╌┤
+ # # │ 4 │
+ # # └─────┘
+ #
+ # @example
+ # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ foo ┆ bar │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 2 ┆ 7 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 8 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 4 ┆ 9 │
+ # # └─────┴─────┘
+ #
+ # @example
+ # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
+ # # =>
+ # # shape: (3, 1)
+ # # ┌─────────┐
+ # # │ literal │
+ # # │ --- │
+ # # │ i64 │
+ # # ╞═════════╡
+ # # │ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌┤
+ # # │ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌┤
+ # # │ 10 │
+ # # └─────────┘
def select(exprs)
  # Normalize the selection into a list of native expressions first.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  selected = _ldf.select(rbexprs)
  _from_rbldf(selected)
end
+ # Start a groupby operation.
+ #
+ # @param by [Object]
+ # Column(s) to group by.
+ # @param maintain_order [Boolean]
+ # Make sure that the order of the groups remain consistent. This is more
+ # expensive than a default groupby.
+ #
+ # @return [LazyGroupBy]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["a", "b", "a", "b", "b", "c"],
+ # "b" => [1, 2, 3, 4, 5, 6],
+ # "c" => [6, 5, 4, 3, 2, 1]
+ # }
+ # ).lazy
+ # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ a ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ b ┆ 11 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ c ┆ 6 │
+ # # └─────┴─────┘
def groupby(by, maintain_order: false)
  # Normalize the grouping keys, start the native group-by, and wrap it
  # together with this frame's class.
  keys = Utils.selection_to_rbexpr_list(by)
  native_groups = _ldf.groupby(keys, maintain_order)
  LazyGroupBy.new(native_groups, self.class)
end
@@ -273,11 +733,120 @@
# end
# def join_asof
# end
+ # Add a join operation to the Logical Plan.
#
+ # @param other [LazyFrame]
+ # Lazy DataFrame to join with.
+ # @param left_on [Object]
+ # Join column of the left DataFrame.
+ # @param right_on [Object]
+ # Join column of the right DataFrame.
+ # @param on [Object]
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+ # `nil`.
+ # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
+ # Join strategy.
+ # @param suffix [String]
+ # Suffix to append to columns with a duplicate name.
+ # @param allow_parallel [Boolean]
+ # Allow the physical plan to optionally evaluate the computation of both
+ # DataFrames up to the join in parallel.
+ # @param force_parallel [Boolean]
+ # Force the physical plan to evaluate the computation of both DataFrames up to
+ # the join in parallel.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # }
+ # ).lazy
+ # other_df = Polars::DataFrame.new(
+ # {
+ # "apple" => ["x", "y", "z"],
+ # "ham" => ["a", "b", "d"]
+ # }
+ # ).lazy
+ # df.join(other_df, on: "ham").collect
+ # # =>
+ # # shape: (2, 4)
+ # # ┌─────┬─────┬─────┬───────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str ┆ str │
+ # # ╞═════╪═════╪═════╪═══════╡
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
+ # # └─────┴─────┴─────┴───────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "outer").collect
+ # # =>
+ # # shape: (4, 4)
+ # # ┌──────┬──────┬─────┬───────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str ┆ str │
+ # # ╞══════╪══════╪═════╪═══════╡
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ null ┆ null ┆ d ┆ z │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
+ # # └──────┴──────┴─────┴───────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "left").collect
+ # # =>
+ # # shape: (3, 4)
+ # # ┌─────┬─────┬─────┬───────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str ┆ str │
+ # # ╞═════╪═════╪═════╪═══════╡
+ # # │ 1 ┆ 6.0 ┆ a ┆ x │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 7.0 ┆ b ┆ y │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ 8.0 ┆ c ┆ null │
+ # # └─────┴─────┴─────┴───────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "semi").collect
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 6.0 ┆ a │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 7.0 ┆ b │
+ # # └─────┴─────┴─────┘
+ #
+ # @example
+ # df.join(other_df, on: "ham", how: "anti").collect
+ # # =>
+ # # shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ 3 ┆ 8.0 ┆ c │
+ # # └─────┴─────┴─────┘
def join(
other,
left_on: nil,
right_on: nil,
on: nil,
@@ -320,10 +889,47 @@
suffix,
)
)
end
+ # Add or overwrite multiple columns in a DataFrame.
+ #
+ # @param exprs [Object]
+ # List of Expressions that evaluate to columns.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # ldf = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 2, 3, 4],
+ # "b" => [0.5, 4, 10, 13],
+ # "c" => [true, true, false, true]
+ # }
+ # ).lazy
+ # ldf.with_columns(
+ # [
+ # (Polars.col("a") ** 2).alias("a^2"),
+ # (Polars.col("b") / 2).alias("b/2"),
+ # (Polars.col("c").is_not()).alias("not c")
+ # ]
+ # ).collect
+ # # =>
+ # # shape: (4, 6)
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
def with_columns(exprs)
exprs =
if exprs.nil?
[]
elsif exprs.is_a?(Expr)
@@ -348,115 +954,703 @@
end
# def with_context
# end
+ # Add or overwrite column in a DataFrame.
#
+ # @param column [Object]
+ # Expression that evaluates to column or a Series to use.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 3, 5],
+ # "b" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
+ # # =>
+ # # shape: (3, 3)
+ # # ┌─────┬─────┬───────────┐
+ # # │ a ┆ b ┆ b_squared │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ f64 │
+ # # ╞═════╪═════╪═══════════╡
+ # # │ 1 ┆ 2 ┆ 4.0 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ 4 ┆ 16.0 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 5 ┆ 6 ┆ 36.0 │
+ # # └─────┴─────┴───────────┘
+ #
+ # @example
+ # df.with_column(Polars.col("a") ** 2).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌──────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ i64 │
+ # # ╞══════╪═════╡
+ # # │ 1.0 ┆ 2 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 9.0 ┆ 4 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 25.0 ┆ 6 │
+ # # └──────┴─────┘
def with_column(column)
  # Single-column convenience: wrap the column in an array and delegate.
  single = [column]
  with_columns(single)
end
- # def drop
- # end
+ # Remove one or multiple columns from a DataFrame.
+ #
+ # @param columns [Object]
+ # - Name of the column that should be removed.
+ # - List of column names.
+ #
+ # @return [LazyFrame]
+ def drop(columns)
+ if columns.is_a?(String)
+ columns = [columns]
+ end
+ _from_rbldf(_ldf.drop_columns(columns))
+ end
+ # Rename column names.
#
+ # @param mapping [Hash]
+ # Key value pairs that map from old name to new name.
+ #
+ # @return [LazyFrame]
def rename(mapping)
  # Split the mapping into parallel old-name / new-name arrays for the native call.
  old_names, new_names = mapping.keys, mapping.values
  _from_rbldf(_ldf.rename(old_names, new_names))
end
- # def reverse
- # end
+ # Reverse the DataFrame.
+ #
+ # @return [LazyFrame]
+ def reverse
+ _from_rbldf(_ldf.reverse)
+ end
- # def shift
- # end
+ # Shift the values by a given period.
+ #
+ # @param periods [Integer]
+ # Number of places to shift (may be negative).
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 3, 5],
+ # "b" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.shift(1).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌──────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞══════╪══════╡
+ # # │ null ┆ null │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 1 ┆ 2 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 3 ┆ 4 │
+ # # └──────┴──────┘
+ #
+ # @example
+ # df.shift(-1).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌──────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞══════╪══════╡
+ # # │ 3 ┆ 4 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 5 ┆ 6 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ null ┆ null │
+ # # └──────┴──────┘
+ def shift(periods)
+ _from_rbldf(_ldf.shift(periods))
+ end
- # def shift_and_fill
- # end
+ # Shift the values by a given period and fill the resulting null values.
+ #
+ # @param periods [Integer]
+ # Number of places to shift (may be negative).
+ # @param fill_value [Object]
+ # Fill `nil` values with the result of this expression.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1, 3, 5],
+ # "b" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.shift_and_fill(1, 0).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 0 ┆ 0 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 1 ┆ 2 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 4 │
+ # # └─────┴─────┘
+ #
+ # @example
+ # df.shift_and_fill(-1, 0).collect
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 3 ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 5 ┆ 6 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 0 ┆ 0 │
+ # # └─────┴─────┘
+ def shift_and_fill(periods, fill_value)
+ if !fill_value.is_a?(Expr)
+ fill_value = Polars.lit(fill_value)
+ end
+ _from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr))
+ end
- # def slice
- # end
+ # Get a slice of this DataFrame.
+ #
+ # @param offset [Integer]
+ # Start index. Negative indexing is supported.
+ # @param length [Integer]
+ # Length of the slice. If set to `nil`, all rows starting at the offset
+ # will be selected.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["x", "y", "z"],
+ # "b" => [1, 3, 5],
+ # "c" => [2, 4, 6]
+ # }
+ # ).lazy
+ # df.slice(1, 2).collect
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ y ┆ 3 ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ z ┆ 5 ┆ 6 │
+ # # └─────┴─────┴─────┘
+ def slice(offset, length = nil)
+ if length && length < 0
+ raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
+ end
+ _from_rbldf(_ldf.slice(offset, length))
+ end
- # def limit
- # end
+ # Get the first `n` rows.
+ #
+ # Alias for {#head}.
+ #
+ # @param n [Integer]
+ # Number of rows to return.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ # Consider using the {#fetch} operation if you only want to test your
+ # query. The {#fetch} operation will load the first `n` rows at the scan
+ # level, whereas the {#head}/{#limit} are applied at the end.
+ def limit(n = 5)
+ head(5)
+ end
- # def head
- # end
+ # Get the first `n` rows.
+ #
+ # @param n [Integer]
+ # Number of rows to return.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ # Consider using the {#fetch} operation if you only want to test your
+ # query. The {#fetch} operation will load the first `n` rows at the scan
+ # level, whereas the {#head}/{#limit} are applied at the end.
+ def head(n = 5)
+ slice(0, n)
+ end
- # def tail
- # end
+ # Get the last `n` rows.
+ #
+ # @param n [Integer]
+ # Number of rows.
+ #
+ # @return [LazyFrame]
+ def tail(n = 5)
+ _from_rbldf(_ldf.tail(n))
+ end
- # def last
- # end
+ # Get the last row of the DataFrame.
+ #
+ # @return [LazyFrame]
+ def last
+ tail(1)
+ end
- # def first
- # end
+ # Get the first row of the DataFrame.
+ #
+ # @return [LazyFrame]
+ def first
+ slice(0, 1)
+ end
# def with_row_count
# end
- # def take_every
- # end
+ # Take every nth row in the LazyFrame and return as a new LazyFrame.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
+ # s.take_every(2).collect
+ # # =>
+ # # shape: (2, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 7 │
+ # # └─────┴─────┘
+ def take_every(n)
+ select(Utils.col("*").take_every(n))
+ end
# def fill_null
# end
+ # Fill floating point NaN values.
#
+ # @param fill_value [Object]
+ # Value to fill the NaN values with.
+ #
+ # @return [LazyFrame]
+ #
+ # @note
+ # Note that floating point NaN (Not a Number) are not missing values!
+ # To replace missing values, use `fill_null` instead.
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [1.5, 2, Float::NAN, 4],
+ # "b" => [0.5, 4, Float::NAN, 13],
+ # }
+ # ).lazy
+ # df.fill_nan(99).collect
+ # # =>
+ # # shape: (4, 2)
+ # # ┌──────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════╪══════╡
+ # # │ 1.5 ┆ 0.5 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2.0 ┆ 4.0 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 99.0 ┆ 99.0 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 4.0 ┆ 13.0 │
+ # # └──────┴──────┘
def fill_nan(fill_value)
  # Plain values are turned into literal expressions; Expr instances pass through.
  fill_value = Utils.lit(fill_value) unless fill_value.is_a?(Expr)
  filled = _ldf.fill_nan(fill_value._rbexpr)
  _from_rbldf(filled)
end
- # def std
- # end
+ # Aggregate the columns in the DataFrame to their standard deviation value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.std.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌──────────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════════╪═════╡
+ # # │ 1.290994 ┆ 0.5 │
+ # # └──────────┴─────┘
+ #
+ # @example
+ # df.std(ddof: 0).collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌──────────┬──────────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════════╪══════════╡
+ # # │ 1.118034 ┆ 0.433013 │
+ # # └──────────┴──────────┘
+ def std(ddof: 1)
+ _from_rbldf(_ldf.std(ddof))
+ end
- # def var
- # end
+ # Aggregate the columns in the DataFrame to their variance value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.var.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌──────────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════════╪══════╡
+ # # │ 1.666667 ┆ 0.25 │
+ # # └──────────┴──────┘
+ #
+ # @example
+ # df.var(ddof: 0).collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌──────┬────────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞══════╪════════╡
+ # # │ 1.25 ┆ 0.1875 │
+ # # └──────┴────────┘
+ def var(ddof: 1)
+ _from_rbldf(_ldf.var(ddof))
+ end
- # def max
- # end
+ # Aggregate the columns in the DataFrame to their maximum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.max.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 4 ┆ 2 │
+ # # └─────┴─────┘
+ def max
+ _from_rbldf(_ldf.max)
+ end
- # def min
- # end
+ # Aggregate the columns in the DataFrame to their minimum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.min.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 1 │
+ # # └─────┴─────┘
+ def min
+ _from_rbldf(_ldf.min)
+ end
- # def sum
- # end
+ # Aggregate the columns in the DataFrame to their sum value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.sum.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 10 ┆ 5 │
+ # # └─────┴─────┘
+ def sum
+ _from_rbldf(_ldf.sum)
+ end
- # def mean
- # end
+ # Aggregate the columns in the DataFrame to their mean value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.mean.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬──────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞═════╪══════╡
+ # # │ 2.5 ┆ 1.25 │
+ # # └─────┴──────┘
+ def mean
+ _from_rbldf(_ldf.mean)
+ end
- # def median
- # end
+ # Aggregate the columns in the DataFrame to their median value.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.median.collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞═════╪═════╡
+ # # │ 2.5 ┆ 1.0 │
+ # # └─────┴─────┘
+ def median
+ _from_rbldf(_ldf.median)
+ end
- # def quantile
- # end
+ # Aggregate the columns in the DataFrame to their quantile value.
+ #
+ # @param quantile [Float]
+ # Quantile between 0.0 and 1.0.
+ # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
+ # Interpolation method.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
+ # df.quantile(0.7).collect
+ # # =>
+ # # shape: (1, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ f64 ┆ f64 │
+ # # ╞═════╪═════╡
+ # # │ 3.0 ┆ 1.0 │
+ # # └─────┴─────┘
+ def quantile(quantile, interpolation: "nearest")
+ _from_rbldf(_ldf.quantile(quantile, interpolation))
+ end
+ # Explode lists to long format.
#
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "letters" => ["a", "a", "b", "c"],
+ # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
+ # }
+ # ).lazy
+ # df.explode("numbers").collect
+ # # =>
+ # # shape: (8, 2)
+ # # ┌─────────┬─────────┐
+ # # │ letters ┆ numbers │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞═════════╪═════════╡
+ # # │ a ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 4 │
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 5 │
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+ # # │ c ┆ 6 │
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+ # # │ c ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
+ # # │ c ┆ 8 │
+ # # └─────────┴─────────┘
def explode(columns)
  # Normalize the selection into a list of native expressions first.
  rbexprs = Utils.selection_to_rbexpr_list(columns)
  exploded = _ldf.explode(rbexprs)
  _from_rbldf(exploded)
end
- # def unique
- # end
+ # Drop duplicate rows from this DataFrame.
+ #
+ # Note that this fails if there is a column of type `List` in the DataFrame or
+ # subset.
+ #
+ # @param maintain_order [Boolean]
+ # Keep the same order as the original DataFrame. This requires more work to
+ # compute.
+ # @param subset [Object]
+ # Subset to use to compare rows.
+ # @param keep ["first", "last"]
+ # Which of the duplicate rows to keep.
+ #
+ # @return [LazyFrame]
+ def unique(maintain_order: true, subset: nil, keep: "first")
+ if !subset.nil? && !subset.is_a?(Array)
+ subset = [subset]
+ end
+ _from_rbldf(_ldf.unique(maintain_order, subset, keep))
+ end
# def drop_nulls
# end
# def melt
# end
# def map
# end
- # def interpolate
- # end
+ # Interpolate intermediate values. The interpolation method is linear.
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, nil, 9, 10],
+ # "bar" => [6, 7, 9, nil],
+ # "baz" => [1, nil, nil, 9]
+ # }
+ # ).lazy
+ # df.interpolate.collect
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────┬──────┬─────┐
+ # # │ foo ┆ bar ┆ baz │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪══════╪═════╡
+ # # │ 1 ┆ 6 ┆ 1 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 5 ┆ 7 ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 9 ┆ 9 ┆ 6 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 10 ┆ null ┆ 9 │
+ # # └─────┴──────┴─────┘
+ def interpolate
+ select(Utils.col("*").interpolate)
+ end
- # def unnest
- # end
+ # Decompose a struct into its fields.
+ #
+ # The fields will be inserted into the `DataFrame` on the location of the
+ # `struct` type.
+ #
+ # @param names [Object]
+ # Names of the struct columns that will be decomposed by its fields
+ #
+ # @return [LazyFrame]
+ #
+ # @example
+ # df = (
+ # Polars::DataFrame.new(
+ # {
+ # "before" => ["foo", "bar"],
+ # "t_a" => [1, 2],
+ # "t_b" => ["a", "b"],
+ # "t_c" => [true, nil],
+ # "t_d" => [[1, 2], [3]],
+ # "after" => ["baz", "womp"]
+ # }
+ # )
+ # .lazy
+ # .select(
+ # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
+ # )
+ # )
+ # df.fetch
+ # # =>
+ # # shape: (2, 3)
+ # # ┌────────┬─────────────────────┬───────┐
+ # # │ before ┆ t_struct ┆ after │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ struct[4] ┆ str │
+ # # ╞════════╪═════════════════════╪═══════╡
+ # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ bar ┆ {2,"b",null,[3]} ┆ womp │
+ # # └────────┴─────────────────────┴───────┘
+ #
+ # @example
+ # df.unnest("t_struct").fetch
+ # # =>
+ # # shape: (2, 6)
+ # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
+ # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
+ # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
+ # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
+ # # └────────┴─────┴─────┴──────┴───────────┴───────┘
+ def unnest(names)
+ if names.is_a?(String)
+ names = [names]
+ end
+ _from_rbldf(_ldf.unnest(names))
+ end
private
def initialize_copy(other)
super