lib/polars/data_frame.rb in polars-df-0.1.2 vs lib/polars/data_frame.rb in polars-df-0.1.3

- old
+ new

@@ -153,16 +153,39 @@ ) ) end # @private - def self._read_parquet(file) + def self._read_parquet( + file, + columns: nil, + n_rows: nil, + parallel: "auto", + row_count_name: nil, + row_count_offset: 0, + low_memory: false + ) if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname)) file = Utils.format_path(file) end - _from_rbdf(RbDataFrame.read_parquet(file)) + if file.is_a?(String) && file.include?("*") + raise Todo + end + + projection, columns = Utils.handle_projection_columns(columns) + _from_rbdf( + RbDataFrame.read_parquet( + file, + columns, + projection, + n_rows, + parallel, + Utils._prepare_row_count_args(row_count_name, row_count_offset), + low_memory + ) + ) end # def self._read_avro # end @@ -257,15 +280,17 @@ # Get column names. # # @return [Array] # # @example - # df = Polars::DataFrame.new({ - # "foo" => [1, 2, 3], - # "bar" => [6, 7, 8], - # "ham" => ["a", "b", "c"] - # }) + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) # df.columns # # => ["foo", "bar", "ham"] def columns _df.columns end @@ -277,15 +302,17 @@ # The length of the list should be equal to the width of the DataFrame. # # @return [Object] # # @example - # df = Polars::DataFrame.new({ - # "foo" => [1, 2, 3], - # "bar" => [6, 7, 8], - # "ham" => ["a", "b", "c"] - # }) + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) # df.columns = ["apple", "banana", "orange"] # df # # => # # shape: (3, 3) # # ┌───────┬────────┬────────┐ @@ -306,15 +333,17 @@ # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame. # # @return [Array] # # @example - # df = Polars::DataFrame.new({ - # "foo" => [1, 2, 3], - # "bar" => [6.0, 7.0, 8.0], - # "ham" => ["a", "b", "c"] - # }) + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ) # df.dtypes # # => [:i64, :f64, :str] def dtypes _df.dtypes end @@ -322,60 +351,136 @@ # Get the schema. # # @return [Hash] # # @example - # df = Polars::DataFrame.new({ - # "foo" => [1, 2, 3], - # "bar" => [6.0, 7.0, 8.0], - # "ham" => ["a", "b", "c"] - # }) + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ) # df.schema # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str} def schema columns.zip(dtypes).to_h end - # def ==(other) - # end + # Equal. + # + # @return [DataFrame] + def ==(other) + _comp(other, "eq") + end - # def !=(other) - # end + # Not equal. + # + # @return [DataFrame] + def !=(other) + _comp(other, "neq") + end - # def >(other) - # end + # Greater than. + # + # @return [DataFrame] + def >(other) + _comp(other, "gt") + end - # def <(other) - # end + # Less than. + # + # @return [DataFrame] + def <(other) + _comp(other, "lt") + end - # def >=(other) - # end + # Greater than or equal. + # + # @return [DataFrame] + def >=(other) + _comp(other, "gt_eq") + end - # def <=(other) - # end + # Less than or equal. + # + # @return [DataFrame] + def <=(other) + _comp(other, "lt_eq") + end - # def *(other) - # end + # Performs multiplication. + # + # @return [DataFrame] + def *(other) + if other.is_a?(DataFrame) + return _from_rbdf(_df.mul_df(other._df)) + end - # def /(other) - # end + other = _prepare_other_arg(other) + _from_rbdf(_df.mul(other._s)) + end - # def +(other) - # end + # Performs division. + # + # @return [DataFrame] + def /(other) + if other.is_a?(DataFrame) + return _from_rbdf(_df.div_df(other._df)) + end - # def -(other) - # end + other = _prepare_other_arg(other) + _from_rbdf(_df.div(other._s)) + end - # def %(other) - # end + # Performs addition. + # + # @return [DataFrame] + def +(other) + if other.is_a?(DataFrame) + return _from_rbdf(_df.add_df(other._df)) + end + other = _prepare_other_arg(other) + _from_rbdf(_df.add(other._s)) + end + + # Performs subtraction. # + # @return [DataFrame] + def -(other) + if other.is_a?(DataFrame) + return _from_rbdf(_df.sub_df(other._df)) + end + + other = _prepare_other_arg(other) + _from_rbdf(_df.sub(other._s)) + end + + # Returns the modulo. + # + # @return [DataFrame] + def %(other) + if other.is_a?(DataFrame) + return _from_rbdf(_df.rem_df(other._df)) + end + + other = _prepare_other_arg(other) + _from_rbdf(_df.rem(other._s)) + end + + # Returns a string representing the DataFrame. + # + # @return [String] def to_s _df.to_s end alias_method :inspect, :to_s + # Check if DataFrame includes column. + # + # @return [Boolean] def include?(name) columns.include?(name) end # def each @@ -385,21 +490,92 @@ # end # def _pos_idxs # end + # Returns subset of the DataFrame. # - def [](name) - Utils.wrap_s(_df.column(name)) + # @return [Object] + def [](*args) + if args.size == 2 + row_selection, col_selection = args + + # df[.., unknown] + if row_selection.is_a?(Range) + + # multiple slices + # df[.., ..] + if col_selection.is_a?(Range) + raise Todo + end + end + + # df[2, ..] (select row as df) + if row_selection.is_a?(Integer) + if col_selection.is_a?(Array) + df = self[0.., col_selection] + return df.slice(row_selection, 1) + end + # df[2, "a"] + if col_selection.is_a?(String) + return self[col_selection][row_selection] + end + end + + # column selection can be "a" and ["a", "b"] + if col_selection.is_a?(String) + col_selection = [col_selection] + end + + # df[.., 1] + if col_selection.is_a?(Integer) + series = to_series(col_selection) + return series[row_selection] + end + + if col_selection.is_a?(Array) + # df[.., [1, 2]] + if is_int_sequence(col_selection) + series_list = col_selection.map { |i| to_series(i) } + df = self.class.new(series_list) + return df[row_selection] + end + end + + df = self[col_selection] + return df[row_selection] + elsif args.size == 1 + item = args[0] + + # select single column + # df["foo"] + if item.is_a?(String) + return Utils.wrap_s(_df.column(item)) + end + + # df[idx] + if item.is_a?(Integer) + return slice(_pos_idx(item, dim: 0), 1) + end + + # df[..] + if item.is_a?(Range) + return Slice.new(self).apply(item) + end + end + + raise ArgumentError, "Cannot get item of type: #{item.class.name}" end # def []=(key, value) # end # no to_arrow + # Convert DataFrame to a hash mapping column name to values. # + # @return [Hash] def to_h(as_series: true) if as_series get_columns.to_h { |s| [s.name, s] } else get_columns.to_h { |s| [s.name, s.to_a] } @@ -420,15 +596,17 @@ # Location of selection. # # @return [Series] # # @example - # df = Polars::DataFrame.new({ - # "foo" => [1, 2, 3], - # "bar" => [6, 7, 8], - # "ham" => ["a", "b", "c"] - # }) + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) # df.to_series(1) # # => # # shape: (3,) # # Series: 'bar' [i64] # # [ @@ -517,15 +695,17 @@ # A string representing null values (defaulting to the empty string). # # @return [String, nil] # # @example - # df = Polars::DataFrame.new({ - # "foo" => [1, 2, 3, 4, 5], - # "bar" => [6, 7, 8, 9, 10], - # "ham" => ["a", "b", "c", "d", "e"] - # }) + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3, 4, 5], + # "bar" => [6, 7, 8, 9, 10], + # "ham" => ["a", "b", "c", "d", "e"] + # } + # ) # df.write_csv("file.csv") def write_csv( file = nil, has_header: true, sep: ",", @@ -692,14 +872,16 @@ # Reverse the DataFrame. # # @return [DataFrame] # # @example - # df = Polars::DataFrame.new({ - # "key" => ["a", "b", "c"], - # "val" => [1, 2, 3] - # }) + # df = Polars::DataFrame.new( + # { + # "key" => ["a", "b", "c"], + # "val" => [1, 2, 3] + # } + # ) # df.reverse() # # => # # shape: (3, 2) # # ┌─────┬─────┐ # # │ key ┆ val │ @@ -722,15 +904,17 @@ # Key value pairs that map from old name to new name. # # @return [DataFrame] # # @example - # df = Polars::DataFrame.new({ - # "foo" => [1, 2, 3], - # "bar" => [6, 7, 8], - # "ham" => ["a", "b", "c"] - # }) + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) # df.rename({"foo" => "apple"}) # # => # # shape: (3, 3) # # ┌───────┬─────┬─────┐ # # │ apple ┆ bar ┆ ham │ @@ -773,15 +957,17 @@ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ # # │ 3 ┆ 99 ┆ 6 │ # # └─────┴─────┴─────┘ # # @example - # df = Polars::DataFrame.new({ - # "a" => [1, 2, 3, 4], - # "b" => [0.5, 4, 10, 13], - # "c" => [true, true, false, true] - # }) + # df = Polars::DataFrame.new( + # { + # "a" => [1, 2, 3, 4], + # "b" => [0.5, 4, 10, 13], + # "c" => [true, true, false, true] + # } + # ) # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0]) # df.insert_at_idx(3, s) # # => # # shape: (4, 4) # # ┌─────┬──────┬───────┬──────┐ @@ -803,67 +989,564 @@ end _df.insert_at_idx(index, series._s) self end + # Filter the rows in the DataFrame based on a predicate expression. + # + # @param predicate [Expr] + # Expression that evaluates to a boolean Series. + # + # @return [DataFrame] + # + # @example Filter on one condition: + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.filter(Polars.col("foo") < 3) + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7 ┆ b │ + # # └─────┴─────┴─────┘ + # + # @example Filter on multiple conditions: + # df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")) + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # └─────┴─────┴─────┘ def filter(predicate) lazy.filter(predicate).collect end - # def describe - # end + # Summary statistics for a DataFrame. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1.0, 2.8, 3.0], + # "b" => [4, 5, nil], + # "c" => [true, false, true], + # "d" => [nil, "b", "c"], + # "e" => ["usd", "eur", nil] + # } + # ) + # df.describe + # # => + # # shape: (7, 6) + # # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐ + # # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │ + # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │ + # # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡ + # # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │ + # # └────────────┴──────────┴──────────┴──────┴──────┴──────┘ + def describe + describe_cast = lambda do |stat| + columns = [] + self.columns.each_with_index do |s, i| + if self[s].is_numeric || self[s].is_boolean + columns << stat[0.., i].cast(:f64) + else + # for dates, strings, etc, we cast to string so that all + # statistics can be shown + columns << stat[0.., i].cast(:str) + end + end + self.class.new(columns) + end - # def find_idx_by_name - # end + summary = _from_rbdf( + Polars.concat( + [ + describe_cast.( + self.class.new(columns.to_h { |c| [c, [height]] }) + ), + describe_cast.(null_count), + describe_cast.(mean), + describe_cast.(std), + describe_cast.(min), + describe_cast.(max), + describe_cast.(median) + ] + )._df + ) + summary.insert_at_idx( + 0, + Polars::Series.new( + "describe", + ["count", "null_count", "mean", "std", "min", "max", "median"], + ) + ) + summary + end - # def replace_at_idx - # end + # Find the index of a column by name. + # + # @param name [String] + # Name of the column to find. + # + # @return [Series] + # + # @example + # df = Polars::DataFrame.new( + # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]} + # ) + # df.find_idx_by_name("ham") + # # => 2 + def find_idx_by_name(name) + _df.find_idx_by_name(name) + end + # Replace a column at an index location. # + # @param index [Integer] + # Column index. + # @param series [Series] + # Series that will replace the column. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # s = Polars::Series.new("apple", [10, 20, 30]) + # df.replace_at_idx(0, s) + # # => + # # shape: (3, 3) + # # ┌───────┬─────┬─────┐ + # # │ apple ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═══════╪═════╪═════╡ + # # │ 10 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 20 ┆ 7 ┆ b │ + # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 30 ┆ 8 ┆ c │ + # # └───────┴─────┴─────┘ + def replace_at_idx(index, series) + if index < 0 + index = columns.length + index + end + _df.replace_at_idx(index, series._s) + self + end + + # Sort the DataFrame by column. + # + # @param by [String] + # By which column to sort. + # @param reverse [Boolean] + # Reverse/descending sort. + # @param nulls_last [Boolean] + # Place null values last. Can only be used if sorted by a single column. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.sort("foo", reverse: true) + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 3 ┆ 8.0 ┆ c │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 1 ┆ 6.0 ┆ a │ + # # └─────┴─────┴─────┘ + # + # @example Sort by multiple columns. + # df.sort( + # [Polars.col("foo"), Polars.col("bar")**2], + # reverse: [true, false] + # ) + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 3 ┆ 8.0 ┆ c │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 1 ┆ 6.0 ┆ a │ + # # └─────┴─────┴─────┘ def sort(by, reverse: false, nulls_last: false) - _from_rbdf(_df.sort(by, reverse, nulls_last)) + if by.is_a?(Array) || by.is_a?(Expr) + lazy + .sort(by, reverse: reverse, nulls_last: nulls_last) + .collect(no_optimization: true, string_cache: false) + else + _from_rbdf(_df.sort(by, reverse, nulls_last)) + end end + # Check if DataFrame is equal to other. + # + # @param other [DataFrame] + # DataFrame to compare with. + # @param null_equal [Boolean] + # Consider null values as equal. + # + # @return [Boolean] + # + # @example + # df1 = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ) + # df2 = Polars::DataFrame.new( + # { + # "foo" => [3, 2, 1], + # "bar" => [8.0, 7.0, 6.0], + # "ham" => ["c", "b", "a"] + # } + # ) + # df1.frame_equal(df1) + # # => true + # df1.frame_equal(df2) + # # => false def frame_equal(other, null_equal: true) _df.frame_equal(other._df, null_equal) end - # def replace - # end + # Replace a column by a new Series. + # + # @param column [String] + # Column to replace. + # @param new_col [Series] + # New column to insert. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}) + # s = Polars::Series.new([10, 20, 30]) + # df.replace("foo", s) + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ foo ┆ bar │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 10 ┆ 4 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 20 ┆ 5 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 30 ┆ 6 │ + # # └─────┴─────┘ + def replace(column, new_col) + _df.replace(column, new_col._s) + self + end + # Get a slice of this DataFrame. # + # @param offset [Integer] + # Start index. Negative indexing is supported. + # @param length [Integer, nil] + # Length of the slice. If set to `nil`, all rows starting at the offset + # will be selected. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.slice(1, 2) + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 2 ┆ 7.0 ┆ b │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8.0 ┆ c │ + # # └─────┴─────┴─────┘ def slice(offset, length = nil) if !length.nil? && length < 0 length = height - offset + length end _from_rbdf(_df.slice(offset, length)) end + # Get the first `n` rows. + # + # Alias for {#head}. + # + # @param n [Integer] + # Number of rows to return. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]} + # ) + # df.limit(4) + # # => + # # shape: (4, 2) + # # ┌─────┬─────┐ + # # │ foo ┆ bar │ + # # │ --- ┆ --- │ + # # │ i64 ┆ str │ + # # ╞═════╪═════╡ + # # │ 1 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ b │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ c │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 4 ┆ d │ + # # └─────┴─────┘ def limit(n = 5) head(n) end + # Get the first `n` rows. + # + # @param n [Integer] + # Number of rows to return. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3, 4, 5], + # "bar" => [6, 7, 8, 9, 10], + # "ham" => ["a", "b", "c", "d", "e"] + # } + # ) + # df.head(3) + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7 ┆ b │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8 ┆ c │ + # # └─────┴─────┴─────┘ def head(n = 5) _from_rbdf(_df.head(n)) end + # Get the last `n` rows. + # + # @param n [Integer] + # Number of rows to return. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3, 4, 5], + # "bar" => [6, 7, 8, 9, 10], + # "ham" => ["a", "b", "c", "d", "e"] + # } + # ) + # df.tail(3) + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 3 ┆ 8 ┆ c │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 4 ┆ 9 ┆ d │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 5 ┆ 10 ┆ e │ + # # └─────┴─────┴─────┘ def tail(n = 5) _from_rbdf(_df.tail(n)) end - # def drop_nulls - # end + # Return a new DataFrame where the null values are dropped. + # + # @param subset [Object] + # Subset of column(s) on which `drop_nulls` will be applied. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, nil, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.drop_nulls + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8 ┆ c │ + # # └─────┴─────┴─────┘ + def drop_nulls(subset: nil) + if subset.is_a?(String) + subset = [subset] + end + _from_rbdf(_df.drop_nulls(subset)) + end # def pipe # end - # def with_row_count - # end + # Add a column at index 0 that counts the rows. + # + # @param name [String] + # Name of the column to add. + # @param offset [Integer] + # Start the row count at this offset. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 3, 5], + # "b" => [2, 4, 6] + # } + # ) + # df.with_row_count + # # => + # # shape: (3, 3) + # # ┌────────┬─────┬─────┐ + # # │ row_nr ┆ a ┆ b │ + # # │ --- ┆ --- ┆ --- │ + # # │ u32 ┆ i64 ┆ i64 │ + # # ╞════════╪═════╪═════╡ + # # │ 0 ┆ 1 ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 1 ┆ 3 ┆ 4 │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 5 ┆ 6 │ + # # └────────┴─────┴─────┘ + def with_row_count(name: "row_nr", offset: 0) + _from_rbdf(_df.with_row_count(name, offset)) + end + # Start a groupby operation. # + # @param by [Object] + # Column(s) to group by. + # @param maintain_order [Boolean] + # Make sure that the order of the groups remain consistent. This is more + # expensive than a default groupby. Note that this only works in expression + # aggregations. + # + # @return [GroupBy] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => ["a", "b", "a", "b", "b", "c"], + # "b" => [1, 2, 3, 4, 5, 6], + # "c" => [6, 5, 4, 3, 2, 1] + # } + # ) + # df.groupby("a").agg(Polars.col("b").sum).sort("a") + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ str ┆ i64 │ + # # ╞═════╪═════╡ + # # │ a ┆ 4 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ b ┆ 11 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ c ┆ 6 │ + # # └─────┴─────┘ def groupby(by, maintain_order: false) - lazy.groupby(by, maintain_order: maintain_order) + if !Utils.bool?(maintain_order) + raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}." + end + if by.is_a?(String) + by = [by] + end + GroupBy.new( + _df, + by, + self.class, + maintain_order: maintain_order + ) end # def groupby_rolling # end @@ -874,11 +1557,113 @@ # end # def join_asof # end + # Join in SQL-like fashion. # + # @param other [DataFrame] + # DataFrame to join with. + # @param left_on [Object] + # Name(s) of the left join column(s). + # @param right_on [Object] + # Name(s) of the right join column(s). + # @param on [Object] + # Name(s) of the join columns in both DataFrames. + # @param how ["inner", "left", "outer", "semi", "anti", "cross"] + # Join strategy. + # @param suffix [String] + # Suffix to append to columns with a duplicate name. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ) + # other_df = Polars::DataFrame.new( + # { + # "apple" => ["x", "y", "z"], + # "ham" => ["a", "b", "d"] + # } + # ) + # df.join(other_df, on: "ham") + # # => + # # shape: (2, 4) + # # ┌─────┬─────┬─────┬───────┐ + # # │ foo ┆ bar ┆ ham ┆ apple │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str ┆ str │ + # # ╞═════╪═════╪═════╪═══════╡ + # # │ 1 ┆ 6.0 ┆ a ┆ x │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b ┆ y │ + # # └─────┴─────┴─────┴───────┘ + # + # @example + # df.join(other_df, on: "ham", how: "outer") + # # => + # # shape: (4, 4) + # # ┌──────┬──────┬─────┬───────┐ + # # │ foo ┆ bar ┆ ham ┆ apple │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str ┆ str │ + # # ╞══════╪══════╪═════╪═══════╡ + # # │ 1 ┆ 6.0 ┆ a ┆ x │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b ┆ y │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ null ┆ null ┆ d ┆ z │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 8.0 ┆ c ┆ null │ + # # └──────┴──────┴─────┴───────┘ + # + # @example + # df.join(other_df, on: "ham", how: "left") + # # => + # # shape: (3, 4) + # # ┌─────┬─────┬─────┬───────┐ + # # │ foo ┆ bar ┆ ham ┆ apple │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str ┆ str │ + # # ╞═════╪═════╪═════╪═══════╡ + # # │ 1 ┆ 6.0 ┆ a ┆ x │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b ┆ y │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 8.0 ┆ c ┆ null │ + # # └─────┴─────┴─────┴───────┘ + # + # @example + # df.join(other_df, on: "ham", how: "semi") + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6.0 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 ┆ b │ + # # └─────┴─────┴─────┘ + # + # @example + # df.join(other_df, on: "ham", how: "anti") + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 3 ┆ 8.0 ┆ c │ + # # └─────┴─────┴─────┘ def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right") lazy .join( other.lazy, left_on: left_on, @@ -891,59 +1676,412 @@ end # def apply # end + # Return a new DataFrame with the column added or replaced. # + # @param column [Object] + # Series, where the name of the Series refers to the column in the DataFrame. + # + # @return [DataFrame] + # + # @example Added + # df = Polars::DataFrame.new( + # { + # "a" => [1, 3, 5], + # "b" => [2, 4, 6] + # } + # ) + # df.with_column((Polars.col("b") ** 2).alias("b_squared")) + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬───────────┐ + # # │ a ┆ b ┆ b_squared │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ f64 │ + # # ╞═════╪═════╪═══════════╡ + # # │ 1 ┆ 2 ┆ 4.0 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 4 ┆ 16.0 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 5 ┆ 6 ┆ 36.0 │ + # # └─────┴─────┴───────────┘ + # + # @example Replaced + # df.with_column(Polars.col("a") ** 2) + # # => + # # shape: (3, 2) + # # ┌──────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ i64 │ + # # ╞══════╪═════╡ + # # │ 1.0 ┆ 2 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 9.0 ┆ 4 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 25.0 ┆ 6 │ + # # └──────┴─────┘ def with_column(column) lazy .with_column(column) .collect(no_optimization: true, string_cache: false) end - # def hstack - # end + # Return a new DataFrame grown horizontally by stacking multiple Series to it. + # + # @param columns [Object] + # Series to stack. + # @param in_place [Boolean] + # Modify in place. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # x = Polars::Series.new("apple", [10, 20, 30]) + # df.hstack([x]) + # # => + # # shape: (3, 4) + # # ┌─────┬─────┬─────┬───────┐ + # # │ foo ┆ bar ┆ ham ┆ apple │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str ┆ i64 │ + # # ╞═════╪═════╪═════╪═══════╡ + # # │ 1 ┆ 6 ┆ a ┆ 10 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 7 ┆ b ┆ 20 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 8 ┆ c ┆ 30 │ + # # └─────┴─────┴─────┴───────┘ + def hstack(columns, in_place: false) + if !columns.is_a?(Array) + columns = columns.get_columns + end + if in_place + _df.hstack_mut(columns.map(&:_s)) + self + else + _from_rbdf(_df.hstack(columns.map(&:_s))) + end + end - # def vstack - # end + # Grow this DataFrame vertically by stacking a DataFrame to it. + # + # @param df [DataFrame] + # DataFrame to stack. + # @param in_place [Boolean] + # Modify in place + # + # @return [DataFrame] + # + # @example + # df1 = Polars::DataFrame.new( + # { + # "foo" => [1, 2], + # "bar" => [6, 7], + # "ham" => ["a", "b"] + # } + # ) + # df2 = Polars::DataFrame.new( + # { + # "foo" => [3, 4], + # "bar" => [8, 9], + # "ham" => ["c", "d"] + # } + # ) + # df1.vstack(df2) + # # => + # # shape: (4, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7 ┆ b │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8 ┆ c │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 4 ┆ 9 ┆ d │ + # # └─────┴─────┴─────┘ + def vstack(df, in_place: false) + if in_place + _df.vstack_mut(df._df) + self + else + _from_rbdf(_df.vstack(df._df)) + end + end + # Extend the memory backed by this `DataFrame` with the values from `other`. # + # Different from `vstack` which adds the chunks from `other` to the chunks of this + # `DataFrame` `extend` appends the data from `other` to the underlying memory + # locations and thus may cause a reallocation. + # + # If this does not cause a reallocation, the resulting data structure will not + # have any extra chunks and thus will yield faster queries. + # + # Prefer `extend` over `vstack` when you want to do a query after a single append. + # For instance during online operations where you add `n` rows and rerun a query. + # + # Prefer `vstack` over `extend` when you want to append many times before doing a + # query. For instance when you read in multiple files and when to store them in a + # single `DataFrame`. In the latter case, finish the sequence of `vstack` + # operations with a `rechunk`. + # + # @param other [DataFrame] + # DataFrame to vertically add. + # + # @return [DataFrame] + # + # @example + # df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}) + # df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]}) + # df1.extend(df2) + # # => + # # shape: (6, 2) + # # ┌─────┬─────┐ + # # │ foo ┆ bar │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 1 ┆ 4 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 5 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 6 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 10 ┆ 40 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 20 ┆ 50 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 30 ┆ 60 │ + # # └─────┴─────┘ def extend(other) _df.extend(other._df) self end - # def drop - # end + # Remove column from DataFrame and return as new. + # + # @param columns [Object] + # Column(s) to drop. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6.0, 7.0, 8.0], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.drop("ham") + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ foo ┆ bar │ + # # │ --- ┆ --- │ + # # │ i64 ┆ f64 │ + # # ╞═════╪═════╡ + # # │ 1 ┆ 6.0 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7.0 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8.0 │ + # # └─────┴─────┘ + def drop(columns) + if columns.is_a?(Array) + df = clone + columns.each do |n| + df._df.drop_in_place(n) + end + df + else + _from_rbdf(_df.drop(columns)) + end + end - # def drop_in_place - # end + # Drop in place. + # + # @param name [Object] + # Column to drop. + # + # @return [Series] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.drop_in_place("ham") + # # => + # # shape: (3,) + # # Series: 'ham' [str] + # # [ + # # "a" + # # "b" + # # "c" + # # ] + def drop_in_place(name) + Utils.wrap_s(_df.drop_in_place(name)) + end - # def cleared - # end + # Create an empty copy of the current DataFrame. + # + # Returns a DataFrame with identical schema but no data. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [nil, 2, 3, 4], + # "b" => [0.5, nil, 2.5, 13], + # "c" => [true, true, false, nil] + # } + # ) + # df.cleared + # # => + # # shape: (0, 3) + # # ┌─────┬─────┬──────┐ + # # │ a ┆ b ┆ c │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ bool │ + # # ╞═════╪═════╪══════╡ + # # └─────┴─────┴──────┘ + def cleared + height > 0 ? head(0) : clone + end # clone handled by initialize_copy + # Get the DataFrame as a Array of Series. # + # @return [Array] def get_columns _df.get_columns.map { |s| Utils.wrap_s(s) } end + # Get a single column as Series by name. + # + # @param name [String] + # Name of the column to retrieve. + # + # @return [Series] + # + # @example + # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}) + # df.get_column("foo") + # # => + # # shape: (3,) + # # Series: 'foo' [i64] + # # [ + # # 1 + # # 2 + # # 3 + # # ] def get_column(name) self[name] end # def fill_null # end + # Fill floating point NaN values by an Expression evaluation. # + # @param fill_value [Object] + # Value to fill NaN with. + # + # @return [DataFrame] + # + # @note + # Note that floating point NaNs (Not a Number) are not missing values! + # To replace missing values, use `fill_null`. + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1.5, 2, Float::NAN, 4], + # "b" => [0.5, 4, Float::NAN, 13] + # } + # ) + # df.fill_nan(99) + # # => + # # shape: (4, 2) + # # ┌──────┬──────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ f64 ┆ f64 │ + # # ╞══════╪══════╡ + # # │ 1.5 ┆ 0.5 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 2.0 ┆ 4.0 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 99.0 ┆ 99.0 │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 4.0 ┆ 13.0 │ + # # └──────┴──────┘ def fill_nan(fill_value) lazy.fill_nan(fill_value).collect(no_optimization: true) end - # def explode - # end + # Explode `DataFrame` to long format by exploding a column with Lists. + # + # @param columns [Object] + # Column of LargeList type. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "letters" => ["a", "a", "b", "c"], + # "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]] + # } + # ) + # df.explode("numbers") + # # => + # # shape: (8, 2) + # # ┌─────────┬─────────┐ + # # │ letters ┆ numbers │ + # # │ --- ┆ --- │ + # # │ str ┆ i64 │ + # # ╞═════════╪═════════╡ + # # │ a ┆ 1 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ a ┆ 2 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ a ┆ 3 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ b ┆ 4 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ b ┆ 5 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ c ┆ 6 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ c ┆ 7 │ + # # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + # # │ c ┆ 8 │ + # # └─────────┴─────────┘ + def explode(columns) + lazy.explode(columns).collect(no_optimization: true) + end # def pivot # end # def melt @@ -953,77 +2091,433 @@ # end # def partition_by # end - # def shift - # end + # Shift values by the given period. + # + # @param periods [Integer] + # Number of places to shift (may be negative). + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.shift(1) + # # => + # # shape: (3, 3) + # # ┌──────┬──────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞══════╪══════╪══════╡ + # # │ null ┆ null ┆ null │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 1 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 2 ┆ 7 ┆ b │ + # # └──────┴──────┴──────┘ + # + # @example + # df.shift(-1) + # # => + # # shape: (3, 3) + # # ┌──────┬──────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞══════╪══════╪══════╡ + # # │ 2 ┆ 7 ┆ b │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ 3 ┆ 8 ┆ c │ + # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤ + # # │ null ┆ null ┆ null │ + # # └──────┴──────┴──────┘ + def shift(periods) + _from_rbdf(_df.shift(periods)) + end - # def shift_and_fill - # end + # Shift the values by a given period and fill the resulting null values. + # + # @param periods [Integer] + # Number of places to shift (may be negative). + # @param fill_value [Object] + # fill nil values with this value. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.shift_and_fill(1, 0) + # # => + # # shape: (3, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 0 ┆ 0 ┆ 0 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 1 ┆ 6 ┆ a │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7 ┆ b │ + # # └─────┴─────┴─────┘ + def shift_and_fill(periods, fill_value) + lazy + .shift_and_fill(periods, fill_value) + .collect(no_optimization: true, string_cache: false) + end + # Get a mask of all duplicated rows in this DataFrame. # + # @return [Series] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 2, 3, 1], + # "b" => ["x", "y", "z", "x"], + # } + # ) + # df.is_duplicated + # # => + # # shape: (4,) + # # Series: '' [bool] + # # [ + # # true + # # false + # # false + # # true + # # ] def is_duplicated Utils.wrap_s(_df.is_duplicated) end + # Get a mask of all unique rows in this DataFrame. + # + # @return [Series] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 2, 3, 1], + # "b" => ["x", "y", "z", "x"] + # } + # ) + # df.is_unique + # # => + # # shape: (4,) + # # Series: '' [bool] + # # [ + # # false + # # true + # # true + # # false + # # ] def is_unique Utils.wrap_s(_df.is_unique) end + # Start a lazy query from this point. + # + # @return [LazyFrame] def lazy wrap_ldf(_df.lazy) end + # Select columns from this DataFrame. + # + # @param exprs [Object] + # Column or columns to select. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.select("foo") + # # => + # # shape: (3, 1) + # # ┌─────┐ + # # │ foo │ + # # │ --- │ + # # │ i64 │ + # # ╞═════╡ + # # │ 1 │ + # # ├╌╌╌╌╌┤ + # # │ 2 │ + # # ├╌╌╌╌╌┤ + # # │ 3 │ + # # └─────┘ + # + # @example + # df.select(["foo", "bar"]) + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ foo ┆ bar │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 1 ┆ 6 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8 │ + # # └─────┴─────┘ + # + # @example + # df.select(Polars.col("foo") + 1) + # # => + # # shape: (3, 1) + # # ┌─────┐ + # # │ foo │ + # # │ --- │ + # # │ i64 │ + # # ╞═════╡ + # # │ 2 │ + # # ├╌╌╌╌╌┤ + # # │ 3 │ + # # ├╌╌╌╌╌┤ + # # │ 4 │ + # # └─────┘ + # + # @example + # df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]) + # # => + # # shape: (3, 2) + # # ┌─────┬─────┐ + # # │ foo ┆ bar │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 2 ┆ 7 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 8 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 4 ┆ 9 │ + # # └─────┴─────┘ + # + # @example + # df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)) + # # => + # # shape: (3, 1) + # # ┌─────────┐ + # # │ literal │ + # # │ --- │ + # # │ i64 │ + # # ╞═════════╡ + # # │ 0 │ + # # ├╌╌╌╌╌╌╌╌╌┤ + # # │ 0 │ + # # ├╌╌╌╌╌╌╌╌╌┤ + # # │ 10 │ + # # └─────────┘ def select(exprs) _from_rbdf( lazy .select(exprs) .collect(no_optimization: true, string_cache: false) ._df ) end + # Add or overwrite multiple columns in a DataFrame. + # + # @param exprs [Array] + # Array of Expressions that evaluate to columns. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 2, 3, 4], + # "b" => [0.5, 4, 10, 13], + # "c" => [true, true, false, true] + # } + # ) + # df.with_columns( + # [ + # (Polars.col("a") ** 2).alias("a^2"), + # (Polars.col("b") / 2).alias("b/2"), + # (Polars.col("c").is_not()).alias("not c") + # ] + # ) + # # => + # # shape: (4, 6) + # # ┌─────┬──────┬───────┬──────┬──────┬───────┐ + # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + # # └─────┴──────┴───────┴──────┴──────┴───────┘ def with_columns(exprs) if !exprs.nil? && !exprs.is_a?(Array) exprs = [exprs] end lazy .with_columns(exprs) .collect(no_optimization: true, string_cache: false) end + # Get number of chunks used by the ChunkedArrays of this DataFrame. + # + # @param strategy ["first", "all"] + # Return the number of chunks of the 'first' column, + # or 'all' columns in this DataFrame. + # + # @return [Object] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 2, 3, 4], + # "b" => [0.5, 4, 10, 13], + # "c" => [true, true, false, true] + # } + # ) + # df.n_chunks + # # => 1 + # df.n_chunks(strategy: "all") + # # => [1, 1, 1] def n_chunks(strategy: "first") if strategy == "first" _df.n_chunks elsif strategy == "all" get_columns.map(&:n_chunks) else raise ArgumentError, "Strategy: '{strategy}' not understood. Choose one of {{'first', 'all'}}" end end + # Aggregate the columns of this DataFrame to their maximum value. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.max + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 3 ┆ 8 ┆ c │ + # # └─────┴─────┴─────┘ def max(axis: 0) if axis == 0 _from_rbdf(_df.max) elsif axis == 1 Utils.wrap_s(_df.hmax) else raise ArgumentError, "Axis should be 0 or 1." end end + # Aggregate the columns of this DataFrame to their minimum value. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.min + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 6 ┆ a │ + # # └─────┴─────┴─────┘ def min(axis: 0) if axis == 0 _from_rbdf(_df.min) elsif axis == 1 Utils.wrap_s(_df.hmin) else raise ArgumentError, "Axis should be 0 or 1." end end + # Aggregate the columns of this DataFrame to their sum value. + # + # @param axis [Integer] + # Either 0 or 1. + # @param null_strategy ["ignore", "propagate"] + # This argument is only used if axis == 1. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"], + # } + # ) + # df.sum + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪══════╡ + # # │ 6 ┆ 21 ┆ null │ + # # └─────┴─────┴──────┘ + # + # @example + # df.sum(axis: 1) + # # => + # # shape: (3,) + # # Series: 'foo' [str] + # # [ + # # "16a" + # # "27b" + # # "38c" + # # ] def sum(axis: 0, null_strategy: "ignore") case axis when 0 _from_rbdf(_df.sum) when 1 @@ -1031,10 +2525,37 @@ else raise ArgumentError, "Axis should be 0 or 1." end end + # Aggregate the columns of this DataFrame to their mean value. + # + # @param axis [Integer] + # Either 0 or 1. + # @param null_strategy ["ignore", "propagate"] + # This argument is only used if axis == 1. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.mean + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ f64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪══════╡ + # # │ 2.0 ┆ 7.0 ┆ null │ + # # └─────┴─────┴──────┘ def mean(axis: 0, null_strategy: "ignore") case axis when 0 _from_rbdf(_df.mean) when 1 @@ -1042,81 +2563,637 @@ else raise ArgumentError, "Axis should be 0 or 1." end end + # Aggregate the columns of this DataFrame to their standard deviation value. + # + # @param ddof [Integer] + # Degrees of freedom + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.std + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ f64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪══════╡ + # # │ 1.0 ┆ 1.0 ┆ null │ + # # └─────┴─────┴──────┘ + # + # @example + # df.std(ddof: 0) + # # => + # # shape: (1, 3) + # # ┌──────────┬──────────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ f64 ┆ f64 ┆ str │ + # # ╞══════════╪══════════╪══════╡ + # # │ 0.816497 ┆ 0.816497 ┆ null │ + # # └──────────┴──────────┴──────┘ def std(ddof: 1) _from_rbdf(_df.std(ddof)) end + # Aggregate the columns of this DataFrame to their variance value. + # + # @param ddof [Integer] + # Degrees of freedom + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.var + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ f64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪══════╡ + # # │ 1.0 ┆ 1.0 ┆ null │ + # # └─────┴─────┴──────┘ + # + # @example + # df.var(ddof: 0) + # # => + # # shape: (1, 3) + # # ┌──────────┬──────────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ f64 ┆ f64 ┆ str │ + # # ╞══════════╪══════════╪══════╡ + # # │ 0.666667 ┆ 0.666667 ┆ null │ + # # └──────────┴──────────┴──────┘ def var(ddof: 1) _from_rbdf(_df.var(ddof)) end + # Aggregate the columns of this DataFrame to their median value. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.median + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ f64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪══════╡ + # # │ 2.0 ┆ 7.0 ┆ null │ + # # └─────┴─────┴──────┘ def median _from_rbdf(_df.median) end - # def product - # end + # Aggregate the columns of this DataFrame to their product values. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 2, 3], + # "b" => [0.5, 4, 10], + # "c" => [true, true, false] + # } + # ) + # df.product + # # => + # # shape: (1, 3) + # # ┌─────┬──────┬─────┐ + # # │ a ┆ b ┆ c │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ i64 │ + # # ╞═════╪══════╪═════╡ + # # │ 6 ┆ 20.0 ┆ 0 │ + # # └─────┴──────┴─────┘ + def product + select(Polars.all.product) + end - # def quantile(quantile, interpolation: "nearest") - # end + # Aggregate the columns of this DataFrame to their quantile value. + # + # @param quantile [Float] + # Quantile between 0.0 and 1.0. + # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"] + # Interpolation method. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.quantile(0.5, interpolation: "nearest") + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬──────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ f64 ┆ f64 ┆ str │ + # # ╞═════╪═════╪══════╡ + # # │ 2.0 ┆ 7.0 ┆ null │ + # # └─────┴─────┴──────┘ + def quantile(quantile, interpolation: "nearest") + _from_rbdf(_df.quantile(quantile, interpolation)) + end - # def to_dummies - # end + # Get one hot encoded dummy variables. + # + # @param columns + # A subset of columns to convert to dummy variables. `nil` means + # "all columns". + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2], + # "bar" => [3, 4], + # "ham" => ["a", "b"] + # } + # ) + # df.to_dummies + # # => + # # shape: (2, 6) + # # ┌───────┬───────┬───────┬───────┬───────┬───────┐ + # # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + # # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + # # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + # # └───────┴───────┴───────┴───────┴───────┴───────┘ + def to_dummies(columns: nil) + if columns.is_a?(String) + columns = [columns] + end + _from_rbdf(_df.to_dummies(columns)) + end - # def unique - # end + # Drop duplicate rows from this DataFrame. + # + # @param maintain_order [Boolean] + # Keep the same order as the original DataFrame. This requires more work to + # compute. + # @param subset [Object] + # Subset to use to compare rows. + # @param keep ["first", "last"] + # Which of the duplicate rows to keep (in conjunction with `subset`). + # + # @return [DataFrame] + # + # @note + # Note that this fails if there is a column of type `List` in the DataFrame or + # subset. + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 1, 2, 3, 4, 5], + # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + # "c" => [true, true, true, false, true, true] + # } + # ) + # df.unique + # # => + # # shape: (5, 3) + # # ┌─────┬─────┬───────┐ + # # │ a ┆ b ┆ c │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ f64 ┆ bool │ + # # ╞═════╪═════╪═══════╡ + # # │ 1 ┆ 0.5 ┆ true │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 2 ┆ 1.0 ┆ true │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 3 ┆ 2.0 ┆ false │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 4 ┆ 3.0 ┆ true │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ 5 ┆ 3.0 ┆ true │ + # # └─────┴─────┴───────┘ + def unique(maintain_order: true, subset: nil, keep: "first") + if !subset.nil? + if subset.is_a?(String) + subset = [subset] + elsif !subset.is_a?(Array) + subset = subset.to_a + end + end - # def n_unique - # end + _from_rbdf(_df.unique(maintain_order, subset, keep)) + end + # Return the number of unique rows, or the number of unique row-subsets. # + # @param subset [Object] + # One or more columns/expressions that define what to count; + # omit to return the count of unique rows. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 1, 2, 3, 4, 5], + # "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + # "c" => [true, true, true, false, true, true] + # } + # ) + # df.n_unique + # # => 5 + # + # @example Simple columns subset + # df.n_unique(subset: ["b", "c"]) + # # => 4 + # + # @example Expression subset + # df.n_unique( + # subset: [ + # (Polars.col("a").floordiv(2)), + # (Polars.col("c") | (Polars.col("b") >= 2)) + # ] + # ) + # # => 3 + def n_unique(subset: nil) + if subset.is_a?(StringIO) + subset = [Polars.col(subset)] + elsif subset.is_a?(Expr) + subset = [subset] + end + + if subset.is_a?(Array) && subset.length == 1 + expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false) + else + struct_fields = subset.nil? ? Polars.all : subset + expr = Polars.struct(struct_fields) + end + + df = lazy.select(expr.n_unique).collect + df.is_empty ? 0 : df.row(0)[0] + end + + # Rechunk the data in this DataFrame to a contiguous allocation. + + # This will make sure all subsequent operations have optimal and predictable + # performance. + # + # @return [DataFrame] def rechunk _from_rbdf(_df.rechunk) end + # Create a new DataFrame that shows the null counts per column. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, nil, 3], + # "bar" => [6, 7, nil], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.null_count + # # => + # # shape: (1, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ u32 ┆ u32 ┆ u32 │ + # # ╞═════╪═════╪═════╡ + # # │ 1 ┆ 1 ┆ 0 │ + # # └─────┴─────┴─────┘ def null_count _from_rbdf(_df.null_count) end - # def sample - # end + # Sample from this DataFrame. + # + # @param n [Integer] + # Number of items to return. Cannot be used with `frac`. Defaults to 1 if + # `frac` is nil. + # @param frac [Float] + # Fraction of items to return. Cannot be used with `n`. + # @param with_replacement [Boolean] + # Allow values to be sampled more than once. + # @param shuffle [Boolean] + # Shuffle the order of sampled data points. + # @param seed [Integer] + # Seed for the random number generator. If set to nil (default), a random + # seed is used. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.sample(n: 2, seed: 0) + # # => + # # shape: (2, 3) + # # ┌─────┬─────┬─────┐ + # # │ foo ┆ bar ┆ ham │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ str │ + # # ╞═════╪═════╪═════╡ + # # │ 3 ┆ 8 ┆ c │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 2 ┆ 7 ┆ b │ + # # └─────┴─────┴─────┘ + def sample( + n: nil, + frac: nil, + with_replacement: false, + shuffle: false, + seed: nil + ) + if !n.nil? && !frac.nil? + raise ArgumentError, "cannot specify both `n` and `frac`" + end + if n.nil? && !frac.nil? + _from_rbdf( + _df.sample_frac(frac, with_replacement, shuffle, seed) + ) + end + + if n.nil? + n = 1 + end + _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed)) + end + # def fold # end - # def row - # end + # Get a row as tuple, either by index or by predicate. + # + # @param index [Object] + # Row index. + # @param by_predicate [Object] + # Select the row according to a given expression/predicate. + # + # @return [Object] + # + # @note + # The `index` and `by_predicate` params are mutually exclusive. Additionally, + # to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + # + # When using `by_predicate` it is an error condition if anything other than + # one row is returned; more than one row raises `TooManyRowsReturned`, and + # zero rows will raise `NoRowsReturned` (both inherit from `RowsException`). + # + # @example Return the row at the given index + # df = Polars::DataFrame.new( + # { + # "foo" => [1, 2, 3], + # "bar" => [6, 7, 8], + # "ham" => ["a", "b", "c"] + # } + # ) + # df.row(2) + # # => [3, 8, "c"] + # + # @example Return the row that matches the given predicate + # df.row(by_predicate: Polars.col("ham") == "b") + # # => [2, 7, "b"] + def row(index = nil, by_predicate: nil) + if !index.nil? && !by_predicate.nil? + raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive" + elsif index.is_a?(Expr) + raise TypeError, "Expressions should be passed to the 'by_predicate' param" + elsif index.is_a?(Integer) + _df.row_tuple(index) + elsif by_predicate.is_a?(Expr) + rows = filter(by_predicate).rows + n_rows = rows.length + if n_rows > 1 + raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows" + elsif n_rows == 0 + raise NoRowsReturned, "Predicate <{by_predicate!s}> returned no rows" + end + rows[0] + else + raise ArgumentError, "One of 'index' or 'by_predicate' must be set" + end + end - # def rows - # end + # Convert columnar data to rows as Ruby arrays. + # + # @return [Array] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 3, 5], + # "b" => [2, 4, 6] + # } + # ) + # df.rows + # # => [[1, 2], [3, 4], [5, 6]] + def rows + _df.row_tuples + end - # def shrink_to_fit - # end + # Shrink DataFrame memory usage. + # + # Shrinks to fit the exact capacity needed to hold the data. + # + # @return [DataFrame] + def shrink_to_fit(in_place: false) + if in_place + _df.shrink_to_fit + self + else + df = clone + df._df.shrink_to_fit + df + end + end - # def take_every - # end + # Take every nth row in the DataFrame and return as a new DataFrame. + # + # @return [DataFrame] + # + # @example + # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}) + # s.take_every(2) + # # => + # # shape: (2, 2) + # # ┌─────┬─────┐ + # # │ a ┆ b │ + # # │ --- ┆ --- │ + # # │ i64 ┆ i64 │ + # # ╞═════╪═════╡ + # # │ 1 ┆ 5 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 3 ┆ 7 │ + # # └─────┴─────┘ + def take_every(n) + select(Utils.col("*").take_every(n)) + end # def hash_rows # end - # def interpolate - # end + # Interpolate intermediate values. The interpolation method is linear. + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "foo" => [1, nil, 9, 10], + # "bar" => [6, 7, 9, nil], + # "baz" => [1, nil, nil, 9] + # } + # ) + # df.interpolate + # # => + # # shape: (4, 3) + # # ┌─────┬──────┬─────┐ + # # │ foo ┆ bar ┆ baz │ + # # │ --- ┆ --- ┆ --- │ + # # │ i64 ┆ i64 ┆ i64 │ + # # ╞═════╪══════╪═════╡ + # # │ 1 ┆ 6 ┆ 1 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 5 ┆ 7 ┆ 3 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 9 ┆ 9 ┆ 6 │ + # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤ + # # │ 10 ┆ null ┆ 9 │ + # # └─────┴──────┴─────┘ + def interpolate + select(Utils.col("*").interpolate) + end + # Check if the dataframe is empty. # + # @return [Boolean] + # + # @example + # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}) + # df.is_empty + # # => false + # df.filter(Polars.col("foo") > 99).is_empty + # # => true def is_empty height == 0 end alias_method :empty?, :is_empty - # def to_struct(name) - # end + # Convert a `DataFrame` to a `Series` of type `Struct`. + # + # @param name [String] + # Name for the struct Series + # + # @return [Series] + # + # @example + # df = Polars::DataFrame.new( + # { + # "a" => [1, 2, 3, 4, 5], + # "b" => ["one", "two", "three", "four", "five"] + # } + # ) + # df.to_struct("nums") + # # => + # # shape: (5,) + # # Series: 'nums' [struct[2]] + # # [ + # # {1,"one"} + # # {2,"two"} + # # {3,"three"} + # # {4,"four"} + # # {5,"five"} + # # ] + def to_struct(name) + Utils.wrap_s(_df.to_struct(name)) + end - # def unnest - # end + # Decompose a struct into its fields. + # + # The fields will be inserted into the `DataFrame` on the location of the + # `struct` type. + # + # @param names [Object] + # Names of the struct columns that will be decomposed by its fields + # + # @return [DataFrame] + # + # @example + # df = Polars::DataFrame.new( + # { + # "before" => ["foo", "bar"], + # "t_a" => [1, 2], + # "t_b" => ["a", "b"], + # "t_c" => [true, nil], + # "t_d" => [[1, 2], [3]], + # "after" => ["baz", "womp"] + # } + # ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]) + # df.unnest("t_struct") + # # => + # # shape: (2, 6) + # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + # # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + # # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + # # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤ + # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + # # └────────┴─────┴─────┴──────┴───────────┴───────┘ + def unnest(names) + if names.is_a?(String) + names = [names] + end + _from_rbdf(_df.unnest(names)) + end private def initialize_copy(other) super @@ -1125,11 +3202,11 @@ def hash_to_rbdf(data, columns: nil) if !columns.nil? columns, dtypes = _unpack_columns(columns, lookup_names: data.keys) - if !data && dtypes + if data.empty? && dtypes data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s } else data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s } end data_series = _handle_columns_arg(data_series, columns: columns) @@ -1145,11 +3222,11 @@ def _handle_columns_arg(data, columns: nil) if columns.nil? data else - if !data + if data.empty? columns.map { |c| Series.new(c, nil)._s } elsif data.length == columns.length columns.each_with_index do |c, i| # not in-place? data[i].rename(c) @@ -1179,8 +3256,78 @@ LazyFrame._from_rbldf(ldf) end def _from_rbdf(rb_df) self.class._from_rbdf(rb_df) + end + + def _comp(other, op) + if other.is_a?(DataFrame) + _compare_to_other_df(other, op) + else + _compare_to_non_df(other, op) + end + end + + def _compare_to_other_df(other, op) + if columns != other.columns + raise ArgmentError, "DataFrame columns do not match" + end + if shape != other.shape + raise ArgmentError, "DataFrame dimensions do not match" + end + + suffix = "__POLARS_CMP_OTHER" + other_renamed = other.select(Polars.all.suffix(suffix)) + combined = Polars.concat([self, other_renamed], how: "horizontal") + + expr = case op + when "eq" + columns.map { |n| Polars.col(n) == Polars.col("#{n}#{suffix}") } + when "neq" + columns.map { |n| Polars.col(n) != Polars.col("#{n}#{suffix}") } + when "gt" + columns.map { |n| Polars.col(n) > Polars.col("#{n}#{suffix}") } + when "lt" + columns.map { |n| Polars.col(n) < Polars.col("#{n}#{suffix}") } + when "gt_eq" + columns.map { |n| Polars.col(n) >= Polars.col("#{n}#{suffix}") } + when "lt_eq" + columns.map { |n| Polars.col(n) <= Polars.col("#{n}#{suffix}") } + else + raise ArgumentError, "got unexpected comparison operator: #{op}" + end + + combined.select(expr) + end + + def _compare_to_non_df(other, op) + case op + when "eq" + select(Polars.all == other) + when "neq" + select(Polars.all != other) + when "gt" + select(Polars.all > other) + when "lt" + select(Polars.all < other) + when "gt_eq" + select(Polars.all >= other) + when "lt_eq" + select(Polars.all <= other) + else + raise ArgumentError, "got unexpected comparison operator: #{op}" + end + end + + def _prepare_other_arg(other) + if !other.is_a?(Series) + if other.is_a?(Array) + raise ArgumentError, "Operation not supported." + end + + other = Series.new("", [other]) + end + other end end end