lib/polars/lazy_frame.rb in polars-df-0.1.2 vs lib/polars/lazy_frame.rb in polars-df-0.1.3

- old
+ new

@@ -150,56 +150,228 @@ # end # def self.read_json # end - # def columns - # end + # Get or set column names. + # + # @return [Array] + # + # @example + # df = ( + # Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # .lazy + # .select(["foo", "bar"]) + # ) + # df.columns + # # => ["foo", "bar"] + def columns + _ldf.columns + end - # def dtypes - # end + # Get dtypes of columns in LazyFrame. + # + # @return [Array] + # + # @example + # lf = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ).lazy + # lf.dtypes + # # => [:i64, :f64, :str] + def dtypes + _ldf.dtypes + end - # def schema - # end + # Get the schema. + # + # @return [Hash] + # + # @example + # lf = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ).lazy + # lf.schema + # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str} + def schema + _ldf.schema + end - # def width - # end + # Get the width of the LazyFrame. + # + # @return [Integer] + # + # @example + # lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy + # lf.width + # # => 2 + def width + _ldf.width + end - # def include?(key) - # end + # Check if LazyFrame includes key. + # + # @return [Boolean] + def include?(key) + columns.include?(key) + end # clone handled by initialize_copy # def [](item) # end - # def to_s - # end - # alias_method :inspect, :to_s + # Returns a string representing the LazyFrame. + # + # @return [String] + def to_s + <<~EOS + naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan) + #{describe_plan} + EOS + end + # def write_json # end # def pipe # end - # def describe_plan - # end + # Create a string representation of the unoptimized query plan. 
+ # + # @return [String] + def describe_plan + _ldf.describe_plan + end + # Create a string representation of the optimized query plan. + # + # @return [String] # def describe_optimized_plan # end # def show_graph # end - # def sort - # end + # Sort the DataFrame. + # + # Sorting can be done by: + # + # - A single column name + # - An expression + # - Multiple expressions + # + # @param by [Object] + # Column (expressions) to sort by. + # @param reverse [Boolean] + # Sort in descending order. + # @param nulls_last [Boolean] + # Place null values last. Can only be used if sorted by a single column. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ).lazy + # df.sort("foo", reverse: true).collect + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 3 ┆ 8.0 ┆ c │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 1 ┆ 6.0 ┆ a │ + # # └─────┴─────┴─────┘ + def sort(by, reverse: false, nulls_last: false) + if by.is_a?(String) + _from_rbldf(_ldf.sort(by, reverse, nulls_last)) + end + if Utils.bool?(reverse) + reverse = [reverse] + end + by = Utils.selection_to_rbexpr_list(by) + _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last)) + end + # def profile # end + # Collect into a DataFrame. # + # Note: use {#fetch} if you want to run your query on the first `n` rows + # only. This can be a huge time saver in debugging queries. + # + # @param type_coercion [Boolean] + # Do type coercion optimization. + # @param predicate_pushdown [Boolean] + # Do predicate pushdown optimization. + # @param projection_pushdown [Boolean] + # Do projection pushdown optimization. + # @param simplify_expression [Boolean] + # Run simplify expressions optimization. + # @param string_cache [Boolean] + # This argument is deprecated. 
Please set the string cache globally. + # The argument will be ignored + # @param no_optimization [Boolean] + # Turn off (certain) optimizations. + # @param slice_pushdown [Boolean] + # Slice pushdown optimization. + # @param common_subplan_elimination [Boolean] + # Will try to cache branching subplans that occur on self-joins or unions. + # @param allow_streaming [Boolean] + # Run parts of the query in a streaming fashion (this is in an alpha state) + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => ["a", "b", "a", "b", "b", "c"], + # "b" => [1, 2, 3, 4, 5, 6], + # "c" => [6, 5, 4, 3, 2, 1] + # } + # ).lazy + # df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬─────┐ + # # │ a ┆ b ┆ c │ + # # │ --- ┆ --- ┆ --- │ + # # │ str ┆ i64 ┆ i64 │ + # # ╞═════╪═════╪═════╡ + # # │ a ┆ 4 ┆ 10 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ b ┆ 11 ┆ 10 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ c ┆ 6 ┆ 1 │ + # # └─────┴─────┴─────┘ def collect( type_coercion: true, predicate_pushdown: true, projection_pushdown: true, simplify_expression: true, @@ -230,38 +402,326 @@ allow_streaming ) Utils.wrap_df(ldf.collect) end - # def fetch - # end + # Collect a small number of rows for debugging purposes. + # + # Fetch is like a {#collect} operation, but it overwrites the number of rows + # read by every scan operation. This is a utility that helps debug a query on a + # smaller number of rows. + # + # Note that the fetch does not guarantee the final number of rows in the + # DataFrame. Filter, join operations and a lower number of rows available in the + # scanned file influence the final number of rows. + # + # @param n_rows [Integer] + # Collect n_rows from the data sources. + # @param type_coercion [Boolean] + # Run type coercion optimization. + # @param predicate_pushdown [Boolean] + # Run predicate pushdown optimization. 
+ # @param projection_pushdown [Boolean] + # Run projection pushdown optimization. + # @param simplify_expression [Boolean] + # Run simplify expressions optimization. + # @param string_cache [Boolean] + # This argument is deprecated. Please set the string cache globally. + # The argument will be ignored + # @param no_optimization [Boolean] + # Turn off optimizations. + # @param slice_pushdown [Boolean] + # Slice pushdown optimization + # @param common_subplan_elimination [Boolean] + # Will try to cache branching subplans that occur on self-joins or unions. + # @param allow_streaming [Boolean] + # Run parts of the query in a streaming fashion (this is in an alpha state) + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => ["a", "b", "a", "b", "b", "c"], + # "b" => [1, 2, 3, 4, 5, 6], + # "c" => [6, 5, 4, 3, 2, 1] + # } + # ).lazy + # df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2) + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ a ┆ b ┆ c │ + # # │ --- ┆ --- ┆ --- │ + # # │ str ┆ i64 ┆ i64 │ + # # ╞═════╪═════╪═════╡ + # # │ a ┆ 1 ┆ 6 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ b ┆ 2 ┆ 5 │ + # # └─────┴─────┴─────┘ + def fetch( + n_rows = 500, + type_coercion: true, + predicate_pushdown: true, + projection_pushdown: true, + simplify_expression: true, + string_cache: false, + no_optimization: false, + slice_pushdown: true, + common_subplan_elimination: true, + allow_streaming: false + ) + if no_optimization + predicate_pushdown = false + projection_pushdown = false + slice_pushdown = false + common_subplan_elimination = false + end + ldf = _ldf.optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + common_subplan_elimination, + allow_streaming + ) + Utils.wrap_df(ldf.fetch(n_rows)) + end + + # Return lazy representation, i.e. itself. # + # Useful for writing code that expects either a `DataFrame` or + # `LazyFrame`. 
+ # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [nil, 2, 3, 4], + # "b" => [0.5, nil, 2.5, 13], + # "c" => [true, true, false, nil] + # } + # ) + # df.lazy def lazy self end - # def cache - # end + # Cache the result once the execution of the physical plan hits this node. + # + # @return [LazyFrame] + def cache + _from_rbldf(_ldf.cache) + end - # def cleared - # end + # Create an empty copy of the current LazyFrame. + # + # The copy has an identical schema but no data. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [nil, 2, 3, 4], + # "b" => [0.5, nil, 2.5, 13], + # "c" => [true, true, false, nil], + # } + # ).lazy + # df.cleared.fetch + # # => + # # shape: (0, 3) + # # ┌─────┬─────┬──────┐ + # # │ a ┆ b ┆ c │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ bool │ + # # ╞═════╪═════╪══════╡ + # # └─────┴─────┴──────┘ + def cleared + DataFrame.new(columns: schema).lazy + end + # Filter the rows in the DataFrame based on a predicate expression. # + # @param predicate [Object] + # Expression that evaluates to a boolean Series. 
+ # + # @return [LazyFrame] + # + # @example Filter on one condition: + # lf = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ).lazy + # lf.filter(Polars.col("foo") < 3).collect() + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7 ┆ b │ + # # └─────┴─────┴─────┘ + # + # @example Filter on multiple conditions: + # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # └─────┴─────┴─────┘ def filter(predicate) _from_rbldf( _ldf.filter( Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr ) ) end + # Select columns from this DataFrame. + # + # @param exprs [Object] + # Column or columns to select. 
+ # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"], + # } + # ).lazy + # df.select("foo").collect + # # => + # # shape: (3, 1) + # # ┌─────┐ + # # │ foo │ + # # │ --- │ + # # │ i64 │ + # # ╞═════╡ + # # │ 1 │ + # # ├╌╌╌╌╌┤ + # # │ 2 │ + # # ├╌╌╌╌╌┤ + # # │ 3 │ + # # └─────┘ + # + # @example + # df.select(["foo", "bar"]).collect + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ foo ┆ bar │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 1 ┆ 6 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8 │ + # # └─────┴─────┘ + # + # @example + # df.select(Polars.col("foo") + 1).collect + # # => + # # shape: (3, 1) + # # ┌─────┐ + # # │ foo │ + # # │ --- │ + # # │ i64 │ + # # ╞═════╡ + # # │ 2 │ + # # ├╌╌╌╌╌┤ + # # │ 3 │ + # # ├╌╌╌╌╌┤ + # # │ 4 │ + # # └─────┘ + # + # @example + # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ foo ┆ bar │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 2 ┆ 7 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 4 ┆ 9 │ + # # └─────┴─────┘ + # + # @example + # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect + # # => + # # shape: (3, 1) + # # ┌─────────┐ + # # │ literal │ + # # │ --- │ + # # │ i64 │ + # # ╞═════════╡ + # # │ 0 │ + # # ├╌╌╌╌╌╌╌╌╌┤ + # # │ 0 │ + # # ├╌╌╌╌╌╌╌╌╌┤ + # # │ 10 │ + # # └─────────┘ def select(exprs) exprs = Utils.selection_to_rbexpr_list(exprs) _from_rbldf(_ldf.select(exprs)) end + # Start a groupby operation. + # + # @param by [Object] + # Column(s) to group by. + # @param maintain_order [Boolean] + # Make sure that the order of the groups remain consistent. This is more + # expensive than a default groupby. 
+ # + # @return [LazyGroupBy] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => ["a", "b", "a", "b", "b", "c"], + # "b" => [1, 2, 3, 4, 5, 6], + # "c" => [6, 5, 4, 3, 2, 1] + # } + # ).lazy + # df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ str ┆ i64 │ + # # ╞═════╪═════╡ + # # │ a ┆ 4 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ b ┆ 11 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ c ┆ 6 │ + # # └─────┴─────┘ def groupby(by, maintain_order: false) rbexprs_by = Utils.selection_to_rbexpr_list(by) lgb = _ldf.groupby(rbexprs_by, maintain_order) LazyGroupBy.new(lgb, self.class) end @@ -273,11 +733,120 @@ # end # def join_asof # end + # Add a join operation to the Logical Plan. # + # @param other [LazyFrame] + # Lazy DataFrame to join with. + # @param left_on [Object] + # Join column of the left DataFrame. + # @param right_on [Object] + # Join column of the right DataFrame. + # @param on [Object] + # Join column of both DataFrames. If set, `left_on` and `right_on` should be + # `nil`. + # @param how ["inner", "left", "outer", "semi", "anti", "cross"] + # Join strategy. + # @param suffix [String] + # Suffix to append to columns with a duplicate name. + # @param allow_parallel [Boolean] + # Allow the physical plan to optionally evaluate the computation of both + # DataFrames up to the join in parallel. + # @param force_parallel [Boolean] + # Force the physical plan to evaluate the computation of both DataFrames up to + # the join in parallel. 
+ # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ).lazy + # other_df = Polars::DataFrame.new( + # { + # "apple" => ["x", "y", "z"], + # "ham" => ["a", "b", "d"] + # } + # ).lazy + # df.join(other_df, on: "ham").collect + # # => + # # shape: (2, 4) + # # ┌─────┬─────┬─────┬───────┐ + # # │ foo ┆ bar ┆ ham ┆ apple │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str ┆ str │ + # # ╞═════╪═════╪═════╪═══════╡ + # # │ 1 ┆ 6.0 ┆ a ┆ x │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b ┆ y │ + # # └─────┴─────┴─────┴───────┘ + # + # @example + # df.join(other_df, on: "ham", how: "outer").collect + # # => + # # shape: (4, 4) + # # ┌──────┬──────┬─────┬───────┐ + # # │ foo ┆ bar ┆ ham ┆ apple │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str ┆ str │ + # # ╞══════╪══════╪═════╪═══════╡ + # # │ 1 ┆ 6.0 ┆ a ┆ x │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b ┆ y │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ null ┆ null ┆ d ┆ z │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 8.0 ┆ c ┆ null │ + # # └──────┴──────┴─────┴───────┘ + # + # @example + # df.join(other_df, on: "ham", how: "left").collect + # # => + # # shape: (3, 4) + # # ┌─────┬─────┬─────┬───────┐ + # # │ foo ┆ bar ┆ ham ┆ apple │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str ┆ str │ + # # ╞═════╪═════╪═════╪═══════╡ + # # │ 1 ┆ 6.0 ┆ a ┆ x │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b ┆ y │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 8.0 ┆ c ┆ null │ + # # └─────┴─────┴─────┴───────┘ + # + # @example + # df.join(other_df, on: "ham", how: "semi").collect + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6.0 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b │ + # # └─────┴─────┴─────┘ + # + # @example + # 
df.join(other_df, on: "ham", how: "anti").collect + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 3 ┆ 8.0 ┆ c │ + # # └─────┴─────┴─────┘ def join( other, left_on: nil, right_on: nil, on: nil, @@ -320,10 +889,47 @@ suffix, ) ) end + # Add or overwrite multiple columns in a DataFrame. + # + # @param exprs [Object] + # List of Expressions that evaluate to columns. + # + # @return [LazyFrame] + # + # @example + # ldf = Polars::DataFrame.new( + # { + # "a" => [1, 2, 3, 4], + # "b" => [0.5, 4, 10, 13], + # "c" => [true, true, false, true] + # } + # ).lazy + # ldf.with_columns( + # [ + # (Polars.col("a") ** 2).alias("a^2"), + # (Polars.col("b") / 2).alias("b/2"), + # (Polars.col("c").is_not()).alias("not c") + # ] + # ).collect + # # => + # # shape: (4, 6) + # # ┌─────┬──────┬───────┬──────┬──────┬───────┐ + # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + # # └─────┴──────┴───────┴──────┴──────┴───────┘ def with_columns(exprs) exprs = if exprs.nil? [] elsif exprs.is_a?(Expr) @@ -348,115 +954,703 @@ end # def with_context # end + # Add or overwrite column in a DataFrame. # + # @param column [Object] + # Expression that evaluates to column or a Series to use. 
+ # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 3, 5], + # "b" => [2, 4, 6] + # } + # ).lazy + # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬───────────┐ + # # │ a ┆ b ┆ b_squared │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ f64 │ + # # ╞═════╪═════╪═══════════╡ + # # │ 1 ┆ 2 ┆ 4.0 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 4 ┆ 16.0 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 5 ┆ 6 ┆ 36.0 │ + # # └─────┴─────┴───────────┘ + # + # @example + # df.with_column(Polars.col("a") ** 2).collect + # # => + # # shape: (3, 2) + # # ┌──────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ i64 │ + # # ╞══════╪═════╡ + # # │ 1.0 ┆ 2 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 9.0 ┆ 4 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 25.0 ┆ 6 │ + # # └──────┴─────┘ def with_column(column) with_columns([column]) end - # def drop - # end + # Remove one or multiple columns from a DataFrame. + # + # @param columns [Object] + # - Name of the column that should be removed. + # - List of column names. + # + # @return [LazyFrame] + def drop(columns) + if columns.is_a?(String) + columns = [columns] + end + _from_rbldf(_ldf.drop_columns(columns)) + end + # Rename column names. # + # @param mapping [Hash] + # Key value pairs that map from old name to new name. + # + # @return [LazyFrame] def rename(mapping) existing = mapping.keys _new = mapping.values _from_rbldf(_ldf.rename(existing, _new)) end - # def reverse - # end + # Reverse the DataFrame. + # + # @return [LazyFrame] + def reverse + _from_rbldf(_ldf.reverse) + end - # def shift - # end + # Shift the values by a given period. + # + # @param periods [Integer] + # Number of places to shift (may be negative). 
+ # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 3, 5], + # "b" => [2, 4, 6] + # } + # ).lazy + # df.shift(1).collect + # # => + # # shape: (3, 2) + # # ┌──────┬──────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞══════╪══════╡ + # # │ null ┆ null │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 1 ┆ 2 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 3 ┆ 4 │ + # # └──────┴──────┘ + # + # @example + # df.shift(-1).collect + # # => + # # shape: (3, 2) + # # ┌──────┬──────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞══════╪══════╡ + # # │ 3 ┆ 4 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 5 ┆ 6 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ null ┆ null │ + # # └──────┴──────┘ + def shift(periods) + _from_rbldf(_ldf.shift(periods)) + end - # def shift_and_fill - # end + # Shift the values by a given period and fill the resulting null values. + # + # @param periods [Integer] + # Number of places to shift (may be negative). + # @param fill_value [Object] + # Fill `nil` values with the result of this expression. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 3, 5], + # "b" => [2, 4, 6] + # } + # ).lazy + # df.shift_and_fill(1, 0).collect + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 0 ┆ 0 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 1 ┆ 2 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 4 │ + # # └─────┴─────┘ + # + # @example + # df.shift_and_fill(-1, 0).collect + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 3 ┆ 4 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 5 ┆ 6 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 0 ┆ 0 │ + # # └─────┴─────┘ + def shift_and_fill(periods, fill_value) + if !fill_value.is_a?(Expr) + fill_value = Polars.lit(fill_value) + end + _from_rbldf(_ldf.shift_and_fill(periods, fill_value._rbexpr)) + end - # def slice - # end + # Get a slice of this DataFrame. 
+ # + # @param offset [Integer] + # Start index. Negative indexing is supported. + # @param length [Integer] + # Length of the slice. If set to `nil`, all rows starting at the offset + # will be selected. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => ["x", "y", "z"], + # "b" => [1, 3, 5], + # "c" => [2, 4, 6] + # } + # ).lazy + # df.slice(1, 2).collect + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ a ┆ b ┆ c │ + # # │ --- ┆ --- ┆ --- │ + # # │ str ┆ i64 ┆ i64 │ + # # ╞═════╪═════╪═════╡ + # # │ y ┆ 3 ┆ 4 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ z ┆ 5 ┆ 6 │ + # # └─────┴─────┴─────┘ + def slice(offset, length = nil) + if length && length < 0 + raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame" + end + _from_rbldf(_ldf.slice(offset, length)) + end - # def limit - # end + # Get the first `n` rows. + # + # Alias for {#head}. + # + # @param n [Integer] + # Number of rows to return. + # + # @return [LazyFrame] + # + # @note + # Consider using the {#fetch} operation if you only want to test your + # query. The {#fetch} operation will load the first `n` rows at the scan + # level, whereas the {#head}/{#limit} are applied at the end. + def limit(n = 5) + head(5) + end - # def head - # end + # Get the first `n` rows. + # + # @param n [Integer] + # Number of rows to return. + # + # @return [LazyFrame] + # + # @note + # Consider using the {#fetch} operation if you only want to test your + # query. The {#fetch} operation will load the first `n` rows at the scan + # level, whereas the {#head}/{#limit} are applied at the end. + def head(n = 5) + slice(0, n) + end - # def tail - # end + # Get the last `n` rows. + # + # @param n [Integer] + # Number of rows. + # + # @return [LazyFrame] + def tail(n = 5) + _from_rbldf(_ldf.tail(n)) + end - # def last - # end + # Get the last row of the DataFrame. 
+ # + # @return [LazyFrame] + def last + tail(1) + end - # def first - # end + # Get the first row of the DataFrame. + # + # @return [LazyFrame] + def first + slice(0, 1) + end # def with_row_count # end - # def take_every - # end + # Take every nth row in the LazyFrame and return as a new LazyFrame. + # + # @return [LazyFrame] + # + # @example + # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy + # s.take_every(2).collect + # # => + # # shape: (2, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 1 ┆ 5 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 7 │ + # # └─────┴─────┘ + def take_every(n) + select(Utils.col("*").take_every(n)) + end # def fill_null # end + # Fill floating point NaN values. # + # @param fill_value [Object] + # Value to fill the NaN values with. + # + # @return [LazyFrame] + # + # @note + # Note that floating point NaN (Not a Number) are not missing values! + # To replace missing values, use `fill_null` instead. + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1.5, 2, Float::NAN, 4], + # "b" => [0.5, 4, Float::NAN, 13], + # } + # ).lazy + # df.fill_nan(99).collect + # # => + # # shape: (4, 2) + # # ┌──────┬──────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞══════╪══════╡ + # # │ 1.5 ┆ 0.5 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 2.0 ┆ 4.0 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 99.0 ┆ 99.0 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 4.0 ┆ 13.0 │ + # # └──────┴──────┘ def fill_nan(fill_value) if !fill_value.is_a?(Expr) fill_value = Utils.lit(fill_value) end _from_rbldf(_ldf.fill_nan(fill_value._rbexpr)) end - # def std - # end + # Aggregate the columns in the DataFrame to their standard deviation value. 
+ # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy + # df.std.collect + # # => + # # shape: (1, 2) + # # ┌──────────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞══════════╪═════╡ + # # │ 1.290994 ┆ 0.5 │ + # # └──────────┴─────┘ + # + # @example + # df.std(ddof: 0).collect + # # => + # # shape: (1, 2) + # # ┌──────────┬──────────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞══════════╪══════════╡ + # # │ 1.118034 ┆ 0.433013 │ + # # └──────────┴──────────┘ + def std(ddof: 1) + _from_rbldf(_ldf.std(ddof)) + end - # def var - # end + # Aggregate the columns in the DataFrame to their variance value. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy + # df.var.collect + # # => + # # shape: (1, 2) + # # ┌──────────┬──────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞══════════╪══════╡ + # # │ 1.666667 ┆ 0.25 │ + # # └──────────┴──────┘ + # + # @example + # df.var(ddof: 0).collect + # # => + # # shape: (1, 2) + # # ┌──────┬────────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞══════╪════════╡ + # # │ 1.25 ┆ 0.1875 │ + # # └──────┴────────┘ + def var(ddof: 1) + _from_rbldf(_ldf.var(ddof)) + end - # def max - # end + # Aggregate the columns in the DataFrame to their maximum value. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy + # df.max.collect + # # => + # # shape: (1, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 4 ┆ 2 │ + # # └─────┴─────┘ + def max + _from_rbldf(_ldf.max) + end - # def min - # end + # Aggregate the columns in the DataFrame to their minimum value. 
+ # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy + # df.min.collect + # # => + # # shape: (1, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 1 ┆ 1 │ + # # └─────┴─────┘ + def min + _from_rbldf(_ldf.min) + end - # def sum - # end + # Aggregate the columns in the DataFrame to their sum value. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy + # df.sum.collect + # # => + # # shape: (1, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 10 ┆ 5 │ + # # └─────┴─────┘ + def sum + _from_rbldf(_ldf.sum) + end - # def mean - # end + # Aggregate the columns in the DataFrame to their mean value. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy + # df.mean.collect + # # => + # # shape: (1, 2) + # # ┌─────┬──────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞═════╪══════╡ + # # │ 2.5 ┆ 1.25 │ + # # └─────┴──────┘ + def mean + _from_rbldf(_ldf.mean) + end - # def median - # end + # Aggregate the columns in the DataFrame to their median value. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy + # df.median.collect + # # => + # # shape: (1, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞═════╪═════╡ + # # │ 2.5 ┆ 1.0 │ + # # └─────┴─────┘ + def median + _from_rbldf(_ldf.median) + end - # def quantile - # end + # Aggregate the columns in the DataFrame to their quantile value. + # + # @param quantile [Float] + # Quantile between 0.0 and 1.0. + # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"] + # Interpolation method. 
+ # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy + # df.quantile(0.7).collect + # # => + # # shape: (1, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞═════╪═════╡ + # # │ 3.0 ┆ 1.0 │ + # # └─────┴─────┘ + def quantile(quantile, interpolation: "nearest") + _from_rbldf(_ldf.quantile(quantile, interpolation)) + end + # Explode lists to long format. # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "letters" => ["a", "a", "b", "c"], + # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]], + # } + # ).lazy + # df.explode("numbers").collect + # # => + # # shape: (8, 2) + # # ┌─────────┬─────────┐ + # # │ letters ┆ numbers │ + # # │ --- ┆ --- │ + # # │ str ┆ i64 │ + # # ╞═════════╪═════════╡ + # # │ a ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ a ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ a ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ b ┆ 4 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ b ┆ 5 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ c ┆ 6 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ c ┆ 7 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ c ┆ 8 │ + # # └─────────┴─────────┘ def explode(columns) columns = Utils.selection_to_rbexpr_list(columns) _from_rbldf(_ldf.explode(columns)) end - # def unique - # end + # Drop duplicate rows from this DataFrame. + # + # Note that this fails if there is a column of type `List` in the DataFrame or + # subset. + # + # @param maintain_order [Boolean] + # Keep the same order as the original DataFrame. This requires more work to + # compute. + # @param subset [Object] + # Subset to use to compare rows. + # @param keep ["first", "last"] + # Which of the duplicate rows to keep. + # + # @return [LazyFrame] + def unique(maintain_order: true, subset: nil, keep: "first") + if !subset.nil? 
&& !subset.is_a?(Array) + subset = [subset] + end + _from_rbldf(_ldf.unique(maintain_order, subset, keep)) + end # def drop_nulls # end # def melt # end # def map # end - # def interpolate - # end + # Interpolate intermediate values. The interpolation method is linear. + # + # @return [LazyFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, nil, 9, 10], + # "bar" => [6, 7, 9, nil], + # "baz" => [1, nil, nil, 9] + # } + # ).lazy + # df.interpolate.collect + # # => + # # shape: (4, 3) + # # ┌─────┬──────┬─────┐ + # # │ foo ┆ bar ┆ baz │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ i64 │ + # # ╞═════╪══════╪═════╡ + # # │ 1 ┆ 6 ┆ 1 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 5 ┆ 7 ┆ 3 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 9 ┆ 9 ┆ 6 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 10 ┆ null ┆ 9 │ + # # └─────┴──────┴─────┘ + def interpolate + select(Utils.col("*").interpolate) + end - # def unnest - # end + # Decompose a struct into its fields. + # + # The fields will be inserted into the `DataFrame` on the location of the + # `struct` type. 
+ # + # @param names [Object] + # Names of the struct columns that will be decomposed by its fields + # + # @return [LazyFrame] + # + # @example + # df = ( + # Polars::DataFrame.new( + # { + # "before" => ["foo", "bar"], + # "t_a" => [1, 2], + # "t_b" => ["a", "b"], + # "t_c" => [true, nil], + # "t_d" => [[1, 2], [3]], + # "after" => ["baz", "womp"] + # } + # ) + # .lazy + # .select( + # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"] + # ) + # ) + # df.fetch + # # => + # # shape: (2, 3) + # # ┌────────┬─────────────────────┬───────┐ + # # │ before ┆ t_struct ┆ after │ + # # │ --- ┆ --- ┆ --- │ + # # │ str ┆ struct[4] ┆ str │ + # # ╞════════╪═════════════════════╪═══════╡ + # # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ bar ┆ {2,"b",null,[3]} ┆ womp │ + # # └────────┴─────────────────────┴───────┘ + # + # @example + # df.unnest("t_struct").fetch + # # => + # # shape: (2, 6) + # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + # # └────────┴─────┴─────┴──────┴───────────┴───────┘ + def unnest(names) + if names.is_a?(String) + names = [names] + end + _from_rbldf(_ldf.unnest(names)) + end private def initialize_copy(other) super