lib/polars/data_frame.rb in polars-df-0.1.4 vs lib/polars/data_frame.rb in polars-df-0.1.5

- old
+ new

@@ -24,18 +24,18 @@
      data[k] = result.rows.map { |r| r[i] }
    end
  end

  if data.nil?
-   self._df = hash_to_rbdf({}, columns: columns)
+   self._df = self.class.hash_to_rbdf({}, columns: columns)
  elsif data.is_a?(Hash)
    data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-   self._df = hash_to_rbdf(data, columns: columns)
+   self._df = self.class.hash_to_rbdf(data, columns: columns)
  elsif data.is_a?(Array)
-   self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
+   self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
  elsif data.is_a?(Series)
-   self._df = series_to_rbdf(data, columns: columns)
+   self._df = self.class.series_to_rbdf(data, columns: columns)
  else
    raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
  end
end
@@ -44,15 +44,20 @@
  df = DataFrame.allocate
  df._df = rb_df
  df
end

- # def self._from_hashes
- # end
+ # @private
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil)
+   rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
+   _from_rbdf(rbdf)
+ end

- # def self._from_hash
- # end
+ # @private
+ def self._from_hash(data, columns: nil)
+   _from_rbdf(hash_to_rbdf(data, columns: columns))
+ end

# def self._from_records
# end

# def self._from_numo
@@ -184,12 +189,18 @@
      low_memory
    )
  )
end

- # def self._read_avro
- # end
+ # @private
+ def self._read_avro(file, columns: nil, n_rows: nil)
+   if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+     file = Utils.format_path(file)
+   end
+   projection, columns = Utils.handle_projection_columns(columns)
+   _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
+ end

# @private
def self._read_ipc(
  file,
  columns: nil,
@@ -484,16 +495,10 @@
end

# def each
# end

- # def _pos_idx
- # end
-
- # def _pos_idxs
- # end
-
# Returns subset of the DataFrame.
#
# @return [Object]
def [](*args)
  if args.size == 2
@@ -552,23 +557,37 @@
      return Utils.wrap_s(_df.column(item))
    end

    # df[idx]
    if item.is_a?(Integer)
-     return slice(_pos_idx(item, dim: 0), 1)
+     return slice(_pos_idx(item, 0), 1)
    end

    # df[..]
    if item.is_a?(Range)
      return Slice.new(self).apply(item)
    end
+
+   if Utils.is_str_sequence(item, allow_str: false)
+     # select multiple columns
+     # df[["foo", "bar"]]
+     return _from_rbdf(_df.select(item))
+   end
  end

  raise ArgumentError, "Cannot get item of type: #{item.class.name}"
end

+ # Set item.
+ #
+ # @return [Object]
# def []=(key, value)
+ #   if key.is_a?(String)
+ #     raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
+ #   end
+
+ #   raise Todo
# end

# no to_arrow

# Convert DataFrame to a hash mapping column name to values.
@@ -580,13 +599,29 @@
  else
    get_columns.to_h { |s| [s.name, s.to_a] }
  end
end

- # def to_hashes / to_a
- # end
+ # Convert every row to a hash of column name to value.
+ #
+ # Note that this is slow.
+ #
+ # @return [Array]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+ #   df.to_hashes
+ #   # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
+ def to_hashes
+   rbdf = _df
+   names = columns
+   height.times.map do |i|
+     names.zip(rbdf.row_tuple(i)).to_h
+   end
+ end

# def to_numo
# end

# no to_pandas
@@ -760,13 +795,29 @@
    null_value,
  )
  nil
end

- # def write_avro
- # end
+ # Write to Apache Avro file.
+ #
+ # @param file [String]
+ #   File path to which the file should be written.
+ # @param compression ["uncompressed", "snappy", "deflate"]
+ #   Compression method. Defaults to "uncompressed".
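+ #
+ # @example A minimal sketch (the output path here is hypothetical):
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3]})
+ #   df.write_avro("/tmp/example.avro")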
+ #
+ # @return [nil]
+ def write_avro(file, compression = "uncompressed")
+   if compression.nil?
+     compression = "uncompressed"
+   end
+   if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+     file = Utils.format_path(file)
+   end
+   _df.write_avro(file, compression)
+ end
+
# Write to Arrow IPC binary stream or Feather file.
#
# @param file [String]
#   File path to which the file should be written.
# @param compression ["uncompressed", "lz4", "zstd"]
@@ -864,13 +915,89 @@
def estimated_size(unit = "b")
  sz = _df.estimated_size
  Utils.scale_bytes(sz, to: unit)
end

- # def transpose
- # end
+ # Transpose a DataFrame over the diagonal.
+ #
+ # @param include_header [Boolean]
+ #   If set, the column names will be added as first column.
+ # @param header_name [String]
+ #   If `include_header` is set, this determines the name of the column that will
+ #   be inserted.
+ # @param column_names [Array]
+ #   Optional generator/iterator that yields column names. Will be used to
+ #   replace the columns in the DataFrame.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ #   This is a very expensive operation. Perhaps you can do it differently.
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
+ #   df.transpose(include_header: true)
+ #   # =>
+ #   # shape: (2, 4)
+ #   # ┌────────┬──────────┬──────────┬──────────┐
+ #   # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
+ #   # │ ---    ┆ ---      ┆ ---      ┆ ---      │
+ #   # │ str    ┆ i64      ┆ i64      ┆ i64      │
+ #   # ╞════════╪══════════╪══════════╪══════════╡
+ #   # │ a      ┆ 1        ┆ 2        ┆ 3        │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ b      ┆ 1        ┆ 2        ┆ 3        │
+ #   # └────────┴──────────┴──────────┴──────────┘
+ #
+ # @example Replace the auto-generated column names with a list
+ #   df.transpose(include_header: false, column_names: ["a", "b", "c"])
+ #   # =>
+ #   # shape: (2, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ a   ┆ b   ┆ c   │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ i64 ┆ i64 ┆ i64 │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ 1   ┆ 2   ┆ 3   │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 1   ┆ 2   ┆ 3   │
+ #   # └─────┴─────┴─────┘
+ #
+ # @example Include the header as a separate column
+ #   df.transpose(
+ #     include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
+ #   )
+ #   # =>
+ #   # shape: (2, 4)
+ #   # ┌─────┬─────┬─────┬─────┐
+ #   # │ foo ┆ a   ┆ b   ┆ c   │
+ #   # │ --- ┆ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ i64 ┆ i64 │
+ #   # ╞═════╪═════╪═════╪═════╡
+ #   # │ a   ┆ 1   ┆ 2   ┆ 3   │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ b   ┆ 1   ┆ 2   ┆ 3   │
+ #   # └─────┴─────┴─────┴─────┘
+ def transpose(include_header: false, header_name: "column", column_names: nil)
+   df = _from_rbdf(_df.transpose(include_header, header_name))
+   if !column_names.nil?
+     names = []
+     n = df.width
+     if include_header
+       names << header_name
+       n -= 1
+     end
+     column_names = column_names.each
+     n.times do
+       names << column_names.next
+     end
+     df.columns = names
+   end
+   df
+ end
+
# Reverse the DataFrame.
#
# @return [DataFrame]
#
# @example
@@ -1460,12 +1587,52 @@
    subset = [subset]
  end
  _from_rbdf(_df.drop_nulls(subset))
end

- # def pipe
- # end
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+ #
+ # @param func [Object]
+ #   Callable; will receive the frame as the first parameter,
+ #   followed by any given args/kwargs.
+ # @param args [Object]
+ #   Arguments to pass to the UDF.
+ # @param kwargs [Object]
+ #   Keyword arguments to pass to the UDF.
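+ # @param block [Proc]
+ #   Optional block; it is forwarded to `func` along with the args/kwargs.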
+ #
+ # @return [Object]
+ #
+ # @note
+ #   It is recommended to use LazyFrame when piping operations, in order
+ #   to fully take advantage of query optimization and parallelization.
+ #   See {#lazy}.
+ #
+ # @example
+ #   cast_str_to_int = lambda do |data, col_name:|
+ #     data.with_column(Polars.col(col_name).cast(:i64))
+ #   end
+ #
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
+ #   df.pipe(cast_str_to_int, col_name: "b")
+ #   # =>
+ #   # shape: (4, 2)
+ #   # ┌─────┬─────┐
+ #   # │ a   ┆ b   │
+ #   # │ --- ┆ --- │
+ #   # │ i64 ┆ i64 │
+ #   # ╞═════╪═════╡
+ #   # │ 1   ┆ 10  │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 2   ┆ 20  │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 3   ┆ 30  │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 4   ┆ 40  │
+ #   # └─────┴─────┘
+ def pipe(func, *args, **kwargs, &block)
+   func.call(self, *args, **kwargs, &block)
+ end

# Add a column at index 0 that counts the rows.
#
# @param name [String]
#   Name of the column to add.
#
@@ -1545,22 +1712,617 @@
    self.class,
    maintain_order: maintain_order
  )
end

- # def groupby_rolling
- # end
+ # Create rolling groups based on a time column.
+ #
+ # Also works for index values of type `:i32` or `:i64`.
+ #
+ # Unlike `groupby_dynamic`, the windows are determined by the individual
+ # values and are not of constant intervals. For constant intervals use
+ # `groupby_dynamic`.
+ #
+ # The `period` and `offset` arguments are created either from a timedelta, or
+ # by using the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
+ #
+ # - "1i" # length 1
+ # - "10i" # length 10
+ #
+ # @param index_column [Object]
+ #   Column used to group based on the time window.
+ #   Often of type Date/Datetime.
+ #   This column must be sorted in ascending order. If not, the output will not
+ #   make sense.
+ #
+ #   In case of a rolling groupby on indices, dtype needs to be one of
+ #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ #   performance matters use an `:i64` column.
+ # @param period [Object]
+ #   Length of the window.
+ # @param offset [Object]
+ #   Offset of the window. Default is -period.
+ # @param closed ["right", "left", "both", "none"]
+ #   Define whether the temporal window interval is closed or not.
+ # @param by [Object]
+ #   Also group by this column/these columns.
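+ #
+ # @example A sketch of rolling groups over a sorted integer index column (result not shown; "2i" is an index-count duration as described above):
+ #   df = Polars::DataFrame.new({"idx" => [1, 2, 3, 4, 5], "a" => [3, 7, 5, 9, 2]})
+ #   df.groupby_rolling(index_column: "idx", period: "2i").agg([Polars.sum("a").alias("sum_a")])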
+ #
+ # @return [RollingGroupBy]
+ #
+ # @example
+ #   dates = [
+ #     "2020-01-01 13:45:48",
+ #     "2020-01-01 16:42:13",
+ #     "2020-01-01 16:45:09",
+ #     "2020-01-02 18:12:48",
+ #     "2020-01-03 19:45:32",
+ #     "2020-01-08 23:16:43"
+ #   ]
+ #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+ #     Polars.col("dt").str.strptime(:datetime)
+ #   )
+ #   df.groupby_rolling(index_column: "dt", period: "2d").agg(
+ #     [
+ #       Polars.sum("a").alias("sum_a"),
+ #       Polars.min("a").alias("min_a"),
+ #       Polars.max("a").alias("max_a")
+ #     ]
+ #   )
+ #   # =>
+ #   # shape: (6, 4)
+ #   # ┌─────────────────────┬───────┬───────┬───────┐
+ #   # │ dt                  ┆ sum_a ┆ min_a ┆ max_a │
+ #   # │ ---                 ┆ ---   ┆ ---   ┆ ---   │
+ #   # │ datetime[μs]        ┆ i64   ┆ i64   ┆ i64   │
+ #   # ╞═════════════════════╪═══════╪═══════╪═══════╡
+ #   # │ 2020-01-01 13:45:48 ┆ 3     ┆ 3     ┆ 3     │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ 2020-01-01 16:42:13 ┆ 10    ┆ 3     ┆ 7     │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ 2020-01-01 16:45:09 ┆ 15    ┆ 3     ┆ 7     │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ 2020-01-02 18:12:48 ┆ 24    ┆ 3     ┆ 9     │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
+ #   # └─────────────────────┴───────┴───────┴───────┘
+ def groupby_rolling(
+   index_column:,
+   period:,
+   offset: nil,
+   closed: "right",
+   by: nil
+ )
+   RollingGroupBy.new(self, index_column, period, offset, closed, by)
+ end

- # def groupby_dynamic
- # end
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
+ #
+ # Time windows are calculated and rows are assigned to windows. Different from a
+ # normal groupby, a row can be a member of multiple groups. The time/index
+ # window could be seen as a rolling window, with a window size determined by
+ # dates/times/values instead of slots in the DataFrame.
+ #
+ # A window is defined by:
+ #
+ # - every: interval of the window
+ # - period: length of the window
+ # - offset: offset of the window
+ #
+ # The `every`, `period` and `offset` arguments are created with
+ # the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
+ #
+ # - "1i" # length 1
+ # - "10i" # length 10
+ #
+ # @param index_column [Object]
+ #   Column used to group based on the time window.
+ #   Often of type Date/Datetime.
+ #   This column must be sorted in ascending order. If not, the output will not
+ #   make sense.
+ #
+ #   In case of a dynamic groupby on indices, dtype needs to be one of
+ #   `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ #   performance matters use an `:i64` column.
+ # @param every [Object]
+ #   Interval of the window.
+ # @param period [Object]
+ #   Length of the window. If nil, it is equal to `every`.
+ # @param offset [Object]
+ #   Offset of the window. If nil and `period` is nil, it will be equal to
+ #   negative `every`.
+ # @param truncate [Boolean]
+ #   Truncate the time value to the window lower bound.
+ # @param include_boundaries [Boolean]
+ #   Add the lower and upper bound of the window to the "_lower_bound" and
+ #   "_upper_bound" columns.
+ #   This will impact performance because it's harder to parallelize.
+ # @param closed ["right", "left", "both", "none"]
+ #   Define whether the temporal window interval is closed or not.
+ # @param by [Object]
+ #   Also group by this column/these columns.
+ #
+ # @return [DynamicGroupBy]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "time" => Polars.date_range(
+ #         DateTime.new(2021, 12, 16),
+ #         DateTime.new(2021, 12, 16, 3),
+ #         "30m"
+ #       ),
+ #       "n" => 0..6
+ #     }
+ #   )
+ #   # =>
+ #   # shape: (7, 2)
+ #   # ┌─────────────────────┬─────┐
+ #   # │ time                ┆ n   │
+ #   # │ ---                 ┆ --- │
+ #   # │ datetime[μs]        ┆ i64 │
+ #   # ╞═════════════════════╪═════╡
+ #   # │ 2021-12-16 00:00:00 ┆ 0   │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 2021-12-16 00:30:00 ┆ 1   │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 2021-12-16 01:00:00 ┆ 2   │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 2021-12-16 01:30:00 ┆ 3   │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 2021-12-16 02:00:00 ┆ 4   │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 2021-12-16 02:30:00 ┆ 5   │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ 2021-12-16 03:00:00 ┆ 6   │
+ #   # └─────────────────────┴─────┘
+ #
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+ #   df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+ #     [
+ #       Polars.col("time").min.alias("time_min"),
+ #       Polars.col("time").max.alias("time_max")
+ #     ]
+ #   )
+ #   # =>
+ #   # shape: (4, 3)
+ #   # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+ #   # │ time                ┆ time_min            ┆ time_max            │
+ #   # │ ---                 ┆ ---                 ┆ ---                 │
+ #   # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        │
+ #   # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+ #   # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+ #   # └─────────────────────┴─────────────────────┴─────────────────────┘
+ #
+ # @example The window boundaries can also be added to the aggregation result.
+ #   df.groupby_dynamic(
+ #     "time", every: "1h", include_boundaries: true, closed: "right"
+ #   ).agg([Polars.col("time").count.alias("time_count")])
+ #   # =>
+ #   # shape: (4, 4)
+ #   # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ #   # │ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+ #   # │ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+ #   # │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+ #   # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ #   # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2          │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+ #   # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example When `closed: "left"`, the right end of the interval is not included.
+ #   df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+ #     [
+ #       Polars.col("time").count.alias("time_count"),
+ #       Polars.col("time").list.alias("time_agg_list")
+ #     ]
+ #   )
+ #   # =>
+ #   # shape: (4, 3)
+ #   # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
+ #   # │ time                ┆ time_count ┆ time_agg_list                       │
+ #   # │ ---                 ┆ ---        ┆ ---                                 │
+ #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                  │
+ #   # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
+ #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16... │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16... │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16... │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]               │
+ #   # └─────────────────────┴────────────┴─────────────────────────────────────┘
+ #
+ # @example When `closed: "both"`, the time values at the window boundaries belong to 2 groups.
+ #   df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+ #     [Polars.col("time").count.alias("time_count")]
+ #   )
+ #   # =>
+ #   # shape: (5, 2)
+ #   # ┌─────────────────────┬────────────┐
+ #   # │ time                ┆ time_count │
+ #   # │ ---                 ┆ ---        │
+ #   # │ datetime[μs]        ┆ u32        │
+ #   # ╞═════════════════════╪════════════╡
+ #   # │ 2021-12-15 23:00:00 ┆ 1          │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 00:00:00 ┆ 3          │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 01:00:00 ┆ 3          │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 02:00:00 ┆ 3          │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-12-16 03:00:00 ┆ 1          │
+ #   # └─────────────────────┴────────────┘
+ #
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "time" => Polars.date_range(
+ #         DateTime.new(2021, 12, 16),
+ #         DateTime.new(2021, 12, 16, 3),
+ #         "30m"
+ #       ),
+ #       "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+ #     }
+ #   )
+ #   df.groupby_dynamic(
+ #     "time",
+ #     every: "1h",
+ #     closed: "both",
+ #     by: "groups",
+ #     include_boundaries: true
+ #   ).agg([Polars.col("time").count.alias("time_count")])
+ #   # =>
+ #   # shape: (7, 5)
+ #   # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ #   # │ groups ┆ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ time_count │
+ #   # │ ---    ┆ ---                 ┆ ---                 ┆ ---                 ┆ ---        │
+ #   # │ str    ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ u32        │
+ #   # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ #   # │ a      ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1          │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ a      ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3          │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ a      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1          │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ a      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2          │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ a      ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1          │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ b      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2          │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1          │
+ #   # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example Dynamic groupby on an index column.
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "idx" => Polars.arange(0, 6, eager: true),
+ #       "A" => ["A", "A", "B", "B", "B", "C"]
+ #     }
+ #   )
+ #   df.groupby_dynamic(
+ #     "idx",
+ #     every: "2i",
+ #     period: "3i",
+ #     include_boundaries: true,
+ #     closed: "right"
+ #   ).agg(Polars.col("A").list.alias("A_agg_list"))
+ #   # =>
+ #   # shape: (3, 4)
+ #   # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+ #   # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list      │
+ #   # │ ---             ┆ ---             ┆ --- ┆ ---             │
+ #   # │ i64             ┆ i64             ┆ i64 ┆ list[str]       │
+ #   # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+ #   # │ 0               ┆ 3               ┆ 0   ┆ ["A", "B", "B"] │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
+ #   # └─────────────────┴─────────────────┴─────┴─────────────────┘
+ def groupby_dynamic(
+   index_column,
+   every:,
+   period: nil,
+   offset: nil,
+   truncate: true,
+   include_boundaries: false,
+   closed: "left",
+   by: nil
+ )
+   DynamicGroupBy.new(
+     self,
+     index_column,
+     every,
+     period,
+     offset,
+     truncate,
+     include_boundaries,
+     closed,
+     by
+   )
+ end

- # def upsample
- # end
+ # Upsample a DataFrame at a regular frequency.
+ #
+ # @param time_column [Object]
+ #   The time column will be used to determine a date_range.
+ #   Note that this column has to be sorted for the output to make sense.
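+ #   Rows inserted by the upsample contain nulls in the remaining columns;
+ #   fill them afterwards (the example below uses `forward_fill`).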
+ # @param every [String]
+ #   Interval will start 'every' duration.
+ # @param offset [String]
+ #   Change the start of the date_range by this offset.
+ # @param by [Object]
+ #   First group by these columns and then upsample for every group.
+ # @param maintain_order [Boolean]
+ #   Keep the ordering predictable. This is slower.
+ #
+ # The `every` and `offset` arguments are created with
+ # the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # @return [DataFrame]
+ #
+ # @example Upsample a DataFrame by a certain interval.
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "time" => [
+ #         DateTime.new(2021, 2, 1),
+ #         DateTime.new(2021, 4, 1),
+ #         DateTime.new(2021, 5, 1),
+ #         DateTime.new(2021, 6, 1)
+ #       ],
+ #       "groups" => ["A", "B", "A", "B"],
+ #       "values" => [0, 1, 2, 3]
+ #     }
+ #   )
+ #   df.upsample(
+ #     time_column: "time", every: "1mo", by: "groups", maintain_order: true
+ #   ).select(Polars.all.forward_fill)
+ #   # =>
+ #   # shape: (7, 3)
+ #   # ┌─────────────────────┬────────┬────────┐
+ #   # │ time                ┆ groups ┆ values │
+ #   # │ ---                 ┆ ---    ┆ ---    │
+ #   # │ datetime[ns]        ┆ str    ┆ i64    │
+ #   # ╞═════════════════════╪════════╪════════╡
+ #   # │ 2021-02-01 00:00:00 ┆ A      ┆ 0      │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-03-01 00:00:00 ┆ A      ┆ 0      │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-04-01 00:00:00 ┆ A      ┆ 0      │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-05-01 00:00:00 ┆ A      ┆ 2      │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-04-01 00:00:00 ┆ B      ┆ 1      │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-05-01 00:00:00 ┆ B      ┆ 1      │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ 2021-06-01 00:00:00 ┆ B      ┆ 3      │
+ #   # └─────────────────────┴────────┴────────┘
+ def upsample(
+   time_column:,
+   every:,
+   offset: nil,
+   by: nil,
+   maintain_order: false
+ )
+   if by.nil?
+     by = []
+   end
+   if by.is_a?(String)
+     by = [by]
+   end
+   if offset.nil?
+     offset = "0ns"
+   end

- # def join_asof
- # end
+   every = Utils._timedelta_to_pl_duration(every)
+   offset = Utils._timedelta_to_pl_duration(offset)
+
+   _from_rbdf(
+     _df.upsample(by, time_column, every, offset, maintain_order)
+   )
+ end
+
+ # Perform an asof join.
+ #
+ # This is similar to a left-join except that we match on nearest key rather than
+ # equal keys.
+ #
+ # Both DataFrames must be sorted by the asof_join key.
+ #
+ # For each row in the left DataFrame:
+ #
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+ #
+ # The default is "backward".
+ #
+ # @param other [DataFrame]
+ #   DataFrame to join with.
+ # @param left_on [String]
+ #   Join column of the left DataFrame.
+ # @param right_on [String]
+ #   Join column of the right DataFrame.
+ # @param on [String]
+ #   Join column of both DataFrames. If set, `left_on` and `right_on` should be nil.
+ # @param by [Object]
+ #   Join on these columns before doing the asof join.
+ # @param by_left [Object]
+ #   Join on these columns before doing the asof join.
+ # @param by_right [Object]
+ #   Join on these columns before doing the asof join.
+ # @param strategy ["backward", "forward"]
+ #   Join strategy.
+ # @param suffix [String]
+ #   Suffix to append to columns with a duplicate name.
+ # @param tolerance [Object]
+ #   Numeric tolerance. By setting this the join will only be done if the near
+ #   keys are within this distance. If an asof join is done on columns of dtype
+ #   "Date", "Datetime", "Duration" or "Time" you can use the following string
+ #   language:
+ #
+ #   - 1ns (1 nanosecond)
+ #   - 1us (1 microsecond)
+ #   - 1ms (1 millisecond)
+ #   - 1s (1 second)
+ #   - 1m (1 minute)
+ #   - 1h (1 hour)
+ #   - 1d (1 day)
+ #   - 1w (1 week)
+ #   - 1mo (1 calendar month)
+ #   - 1y (1 calendar year)
+ #   - 1i (1 index count)
+ #
+ #   Or combine them:
+ #   "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # @param allow_parallel [Boolean]
+ #   Allow the physical plan to optionally evaluate the computation of both
+ #   DataFrames up to the join in parallel.
+ # @param force_parallel [Boolean]
+ #   Force the physical plan to evaluate the computation of both DataFrames up to
+ #   the join in parallel.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ #   gdp = Polars::DataFrame.new(
+ #     {
+ #       "date" => [
+ #         DateTime.new(2016, 1, 1),
+ #         DateTime.new(2017, 1, 1),
+ #         DateTime.new(2018, 1, 1),
+ #         DateTime.new(2019, 1, 1),
+ #       ], # note record date: Jan 1st (sorted!)
+ #       "gdp" => [4164, 4411, 4566, 4696]
+ #     }
+ #   )
+ #   population = Polars::DataFrame.new(
+ #     {
+ #       "date" => [
+ #         DateTime.new(2016, 5, 12),
+ #         DateTime.new(2017, 5, 12),
+ #         DateTime.new(2018, 5, 12),
+ #         DateTime.new(2019, 5, 12),
+ #       ], # note record date: May 12th (sorted!)
+ #       "population" => [82.19, 82.66, 83.12, 83.52]
+ #     }
+ #   )
+ #   population.join_asof(
+ #     gdp, left_on: "date", right_on: "date", strategy: "backward"
+ #   )
+ #   # =>
+ #   # shape: (4, 3)
+ #   # ┌─────────────────────┬────────────┬──────┐
+ #   # │ date                ┆ population ┆ gdp  │
+ #   # │ ---                 ┆ ---        ┆ ---  │
+ #   # │ datetime[ns]        ┆ f64        ┆ i64  │
+ #   # ╞═════════════════════╪════════════╪══════╡
+ #   # │ 2016-05-12 00:00:00 ┆ 82.19      ┆ 4164 │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ 2017-05-12 00:00:00 ┆ 82.66      ┆ 4411 │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ 2018-05-12 00:00:00 ┆ 83.12      ┆ 4566 │
+ #   # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ 2019-05-12 00:00:00 ┆ 83.52      ┆ 4696 │
+ #   # └─────────────────────┴────────────┴──────┘
+ def join_asof(
+   other,
+   left_on: nil,
+   right_on: nil,
+   on: nil,
+   by_left: nil,
+   by_right: nil,
+   by: nil,
+   strategy: "backward",
+   suffix: "_right",
+   tolerance: nil,
+   allow_parallel: true,
+   force_parallel: false
+ )
+   lazy
+     .join_asof(
+       other.lazy,
+       left_on: left_on,
+       right_on: right_on,
+       on: on,
+       by_left: by_left,
+       by_right: by_right,
+       by: by,
+       strategy: strategy,
+       suffix: suffix,
+       tolerance: tolerance,
+       allow_parallel: allow_parallel,
+       force_parallel: force_parallel
+     )
+     .collect(no_optimization: true)
+ end
+
# Join in SQL-like fashion.
#
# @param other [DataFrame]
#   DataFrame to join with.
# @param left_on [Object]
@@ -1673,12 +2435,82 @@
      suffix: suffix,
    )
    .collect(no_optimization: true)
end

- # def apply
- # end
+ # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
+ #
+ # The UDF will receive each row as a tuple of values: `udf(row)`.
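+ # (In this Ruby port a row arrives as an Array, so `row[0]` is the first
+ # column's value, as the examples below show.)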
+ #
+ # Implementing logic using a Ruby function is almost always _significantly_
+ # slower and more memory intensive than implementing the same logic using
+ # the native expression API because:
+ #
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
+ # - Polars-native expressions can be parallelized (UDFs cannot).
+ # - Polars-native expressions can be logically optimized (UDFs cannot).
+ #
+ # Wherever possible you should strongly prefer the native expression API
+ # to achieve the best performance.
+ #
+ # @param return_dtype [Symbol]
+ #   Output type of the operation. If none given, Polars tries to infer the type.
+ # @param inference_size [Integer]
+ #   Only used in the case when the custom function returns rows.
+ #   This uses the first `n` rows to determine the output schema.
+ #
+ # @return [Object]
+ #
+ # @note
+ #   The frame-level `apply` cannot track column names (as the UDF is a black-box
+ #   that may arbitrarily drop, rearrange, transform, or add new columns); if you
+ #   want to apply a UDF such that column names are preserved, you should use the
+ #   expression-level `apply` syntax instead.
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
+ #
+ # @example Return a DataFrame by mapping each row to a tuple:
+ #   df.apply { |t| [t[0] * 2, t[1] * 3] }
+ #   # =>
+ #   # shape: (3, 2)
+ #   # ┌──────────┬──────────┐
+ #   # │ column_0 ┆ column_1 │
+ #   # │ ---      ┆ ---      │
+ #   # │ i64      ┆ i64      │
+ #   # ╞══════════╪══════════╡
+ #   # │ 2        ┆ -3       │
+ #   # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 4        ┆ 15       │
+ #   # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ #   # │ 6        ┆ 24       │
+ #   # └──────────┴──────────┘
+ #
+ # @example Return a single-column DataFrame by mapping each row to a scalar:
+ #   df.apply { |t| t[0] * 2 + t[1] }
+ #   # =>
+ #   # shape: (3, 1)
+ #   # ┌───────┐
+ #   # │ apply │
+ #   # │ ---   │
+ #   # │ i64   │
+ #   # ╞═══════╡
+ #   # │ 1     │
+ #   # ├╌╌╌╌╌╌╌┤
+ #   # │ 9     │
+ #   # ├╌╌╌╌╌╌╌┤
+ #   # │ 14    │
+ #   # └───────┘
+ def apply(return_dtype: nil, inference_size: 256, &f)
+   out, is_df = _df.apply(f, return_dtype, inference_size)
+   if is_df
+     _from_rbdf(out)
+   else
+     _from_rbdf(Utils.wrap_s(out).to_frame._df)
+   end
+ end

# Return a new DataFrame with the column added or replaced.
#
# @param column [Object]
#   Series, where the name of the Series refers to the column in the DataFrame.
@@ -2176,22 +3008,409 @@
# # └─────────┴─────────┘
def explode(columns)
  lazy.explode(columns).collect(no_optimization: true)
end

- # def pivot
- # end
+ # Create a spreadsheet-style pivot table as a DataFrame.
+ #
+ # @param values [Object]
+ #   Column values to aggregate. Can be multiple columns if the *columns*
+ #   argument contains multiple columns as well.
+ # @param index [Object]
+ #   One or multiple keys to group by.
+ # @param columns [Object]
+ #   Columns whose values will be used as the header of the output DataFrame.
+ # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
+ #   A predefined aggregate function (given as a string), or an expression.
+ # @param maintain_order [Boolean]
+ #   Sort the grouped keys so that the output order is predictable.
+ # @param sort_columns [Boolean]
+ #   Sort the transposed columns by name. Default is by order of discovery.
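+ #
+ # @note The string shortcuts for `aggregate_fn` are mapped to expressions
+ #   internally (e.g. "sum" becomes `Polars.element.sum`, "count" becomes
+ #   `Polars.count`), so an expression may also be passed directly.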
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "foo" => ["one", "one", "one", "two", "two", "two"],
+ #       "bar" => ["A", "B", "C", "A", "B", "C"],
+ #       "baz" => [1, 2, 3, 4, 5, 6]
+ #     }
+ #   )
+ #   df.pivot(values: "baz", index: "foo", columns: "bar")
+ #   # =>
+ #   # shape: (2, 4)
+ #   # ┌─────┬─────┬─────┬─────┐
+ #   # │ foo ┆ A   ┆ B   ┆ C   │
+ #   # │ --- ┆ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ i64 ┆ i64 │
+ #   # ╞═════╪═════╪═════╪═════╡
+ #   # │ one ┆ 1   ┆ 2   ┆ 3   │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ two ┆ 4   ┆ 5   ┆ 6   │
+ #   # └─────┴─────┴─────┴─────┘
+ def pivot(
+   values:,
+   index:,
+   columns:,
+   aggregate_fn: "first",
+   maintain_order: true,
+   sort_columns: false
+ )
+   if values.is_a?(String)
+     values = [values]
+   end
+   if index.is_a?(String)
+     index = [index]
+   end
+   if columns.is_a?(String)
+     columns = [columns]
+   end

+   if aggregate_fn.is_a?(String)
+     case aggregate_fn
+     when "first"
+       aggregate_fn = Polars.element.first
+     when "sum"
+       aggregate_fn = Polars.element.sum
+     when "max"
+       aggregate_fn = Polars.element.max
+     when "min"
+       aggregate_fn = Polars.element.min
+     when "mean"
+       aggregate_fn = Polars.element.mean
+     when "median"
+       aggregate_fn = Polars.element.median
+     when "last"
+       aggregate_fn = Polars.element.last
+     when "count"
+       aggregate_fn = Polars.count
+     else
+       raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
+     end
+   end

+   _from_rbdf(
+     _df.pivot_expr(
+       values,
+       index,
+       columns,
+       aggregate_fn._rbexpr,
+       maintain_order,
+       sort_columns
+     )
+   )
+ end

- # def melt
- # end

- # def unstack
- # end

- # def partition_by
- # end
+ # Unpivot a DataFrame from wide to long format.
+ #
+ # Optionally leaves identifiers set.
+ #
+ # This function is useful to massage a DataFrame into a format where one or more
+ # columns are identifier variables (id_vars), while all other columns, considered
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+ # two non-identifier columns, 'variable' and 'value'.
+ #
+ # @param id_vars [Object]
+ #   Columns to use as identifier variables.
+ # @param value_vars [Object]
+ #   Values to use as value variables.
+ #   If `value_vars` is empty all columns that are not in `id_vars` will be used.
+ # @param variable_name [String]
+ #   Name to give to the `variable` column. Defaults to "variable".
+ # @param value_name [String]
+ #   Name to give to the `value` column. Defaults to "value".
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => ["x", "y", "z"],
+ #       "b" => [1, 3, 5],
+ #       "c" => [2, 4, 6]
+ #     }
+ #   )
+ #   df.melt(id_vars: "a", value_vars: ["b", "c"])
+ #   # =>
+ #   # shape: (6, 3)
+ #   # ┌─────┬──────────┬───────┐
+ #   # │ a   ┆ variable ┆ value │
+ #   # │ --- ┆ ---      ┆ ---   │
+ #   # │ str ┆ str      ┆ i64   │
+ #   # ╞═════╪══════════╪═══════╡
+ #   # │ x   ┆ b        ┆ 1     │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ y   ┆ b        ┆ 3     │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ z   ┆ b        ┆ 5     │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ x   ┆ c        ┆ 2     │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ y   ┆ c        ┆ 4     │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ #   # │ z   ┆ c        ┆ 6     │
+ #   # └─────┴──────────┴───────┘
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+   if value_vars.is_a?(String)
+     value_vars = [value_vars]
+   end
+   if id_vars.is_a?(String)
+     id_vars = [id_vars]
+   end
+   if value_vars.nil?
+     value_vars = []
+   end
+   if id_vars.nil?
+     id_vars = []
+   end
+   _from_rbdf(
+     _df.melt(id_vars, value_vars, value_name, variable_name)
+   )
+ end
+
+ # Unstack a long table to a wide form without doing an aggregation.
+ #
+ # This can be much faster than a pivot, because it can skip the grouping phase.
+ #
+ # @note
+ #   This functionality is experimental and may be subject to changes
+ #   without it being considered a breaking change.
+ #
+ # @param step [Integer]
+ #   Number of rows in the unstacked frame.
+ # @param how ["vertical", "horizontal"]
+ #   Direction of the unstack.
+ # @param columns [Object]
+ #   Columns to include in the operation.
+ # @param fill_values [Object]
+ #   Fill values that don't fit the new size with this value.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "col1" => "A".."I",
+ #       "col2" => Polars.arange(0, 9, eager: true)
+ #     }
+ #   )
+ #   # =>
+ #   # shape: (9, 2)
+ #   # ┌──────┬──────┐
+ #   # │ col1 ┆ col2 │
+ #   # │ ---  ┆ ---  │
+ #   # │ str  ┆ i64  │
+ #   # ╞══════╪══════╡
+ #   # │ A    ┆ 0    │
+ #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ B    ┆ 1    │
+ #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ C    ┆ 2    │
+ #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ D    ┆ 3    │
+ #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ ...  ┆ ...  │
+ #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ F    ┆ 5    │
+ #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ G    ┆ 6    │
+ #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ H    ┆ 7    │
+ #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ #   # │ I    ┆ 8    │
+ #   # └──────┴──────┘
+ #
+ # @example
+ #   df.unstack(step: 3, how: "vertical")
+ #   # =>
+ #   # shape: (3, 6)
+ #   # ┌────────┬────────┬────────┬────────┬────────┬────────┐
+ #   # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
+ #   # │ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    │
+ #   # │ str    ┆ str    ┆ str    ┆ i64    ┆ i64    ┆ i64    │
+ #   # ╞════════╪════════╪════════╪════════╪════════╪════════╡
+ #   # │ A      ┆ D      ┆ G      ┆ 0      ┆ 3      ┆ 6      │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ B      ┆ E      ┆ H      ┆ 1      ┆ 4      ┆ 7      │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ C      ┆ F      ┆ I      ┆ 2      ┆ 5      ┆ 8      │
+ #   # └────────┴────────┴────────┴────────┴────────┴────────┘
+ #
+ # @example
+ #   df.unstack(step: 3, how: "horizontal")
+ #   # =>
+ #   # shape: (3, 6)
+ #   # ┌────────┬────────┬────────┬────────┬────────┬────────┐
+ #   # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
+ #   # │ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    │
+ #   # │ str    ┆ str    ┆ str    ┆ i64    ┆ i64    ┆ i64    │
+ #   # ╞════════╪════════╪════════╪════════╪════════╪════════╡
+ #   # │ A      ┆ B      ┆ C      ┆ 0      ┆ 1      ┆ 2      │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ D      ┆ E      ┆ F      ┆ 3      ┆ 4      ┆ 5      │
+ #   # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ #   # │ G      ┆ H      ┆ I      ┆ 6      ┆ 7      ┆ 8      │
+ #   # └────────┴────────┴────────┴────────┴────────┴────────┘
+ def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
+   if !columns.nil?
+     df = select(columns)
+   else
+     df = self
+   end

+   height = df.height
+   if how == "vertical"
+     n_rows = step
+     n_cols = (height / n_rows.to_f).ceil
+   else
+     n_cols = step
+     n_rows = (height / n_cols.to_f).ceil
+   end

+   n_fill = n_cols * n_rows - height

+   if n_fill > 0
+     if !fill_values.is_a?(Array)
+       fill_values = [fill_values] * df.width
+     end

+     df = df.select(
+       df.get_columns.zip(fill_values).map do |s, next_fill|
+         s.extend_constant(next_fill, n_fill)
+       end
+     )
+   end

+   if how == "horizontal"
+     df = (
+       df.with_column(
+         (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
+           "__sort_order"
+         )
+       )
+       .sort("__sort_order")
+       .drop("__sort_order")
+     )
+   end

+   zfill_val = Math.log10(n_cols).floor + 1
+   slices =
+     df.get_columns.flat_map do |s|
+       n_cols.times.map do |slice_nbr|
+         s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
+       end
+     end

+   _from_rbdf(DataFrame.new(slices)._df)
+ end
+
+ # Split into multiple DataFrames partitioned by groups.
+ #
+ # @param groups [Object]
+ #   Groups to partition by.
+ # @param maintain_order [Boolean]
+ #   Keep predictable output order. This is slower as it requires an extra sort
+ #   operation.
+ # @param as_dict [Boolean]
+ #   If true, return the partitions in a hash keyed by the distinct group
+ #   values instead of an array.
+ #
+ # @return [Object]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "foo" => ["A", "A", "B", "B", "C"],
+ #       "N" => [1, 2, 2, 4, 2],
+ #       "bar" => ["k", "l", "m", "m", "l"]
+ #     }
+ #   )
+ #   df.partition_by("foo", maintain_order: true)
+ #   # =>
+ #   # [shape: (2, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ foo ┆ N   ┆ bar │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ str │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ A   ┆ 1   ┆ k   │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ A   ┆ 2   ┆ l   │
+ #   # └─────┴─────┴─────┘, shape: (2, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ foo ┆ N   ┆ bar │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ str │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ B   ┆ 2   ┆ m   │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ B   ┆ 4   ┆ m   │
+ #   # └─────┴─────┴─────┘, shape: (1, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ foo ┆ N   ┆ bar │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ str │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ C   ┆ 2   ┆ l   │
+ #   # └─────┴─────┴─────┘]
+ #
+ # @example
+ #   df.partition_by("foo", maintain_order: true, as_dict: true)
+ #   # =>
+ #   # {"A"=>shape: (2, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ foo ┆ N   ┆ bar │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ str │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ A   ┆ 1   ┆ k   │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ A   ┆ 2   ┆ l   │
+ #   # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ foo ┆ N   ┆ bar │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ str │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ B   ┆ 2   ┆ m   │
+ #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ #   # │ B   ┆ 4   ┆ m   │
+ #   # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
+ #   # ┌─────┬─────┬─────┐
+ #   # │ foo ┆ N   ┆ bar │
+ #   # │ --- ┆ --- ┆ --- │
+ #   # │ str ┆ i64 ┆ str │
+ #   # ╞═════╪═════╪═════╡
+ #   # │ C   ┆ 2   ┆ l   │
+ #   # └─────┴─────┴─────┘}
+ def partition_by(groups, maintain_order: true, as_dict: false)
+   if groups.is_a?(String)
+     groups = [groups]
+   elsif !groups.is_a?(Array)
+     groups = Array(groups)
+   end

+   if as_dict
+     out = {}
+     if groups.length == 1
+       _df.partition_by(groups, maintain_order).each do |df|
+         df = _from_rbdf(df)
+         out[df[groups][0, 0]] = df
+       end
+     else
+       _df.partition_by(groups, maintain_order).each do |df|
+         df = _from_rbdf(df)
+         out[df[groups].row(0)] = df
+       end
+     end
+     out
+   else
+     _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
+   end
+ end
+
# Shift values by the given period.
#
# @param periods [Integer]
#   Number of places to shift (may be negative).
#
@@ -3059,13 +4278,98 @@
    n = 1
  end
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end

- # def fold
- # end
+ # Apply a horizontal reduction on a DataFrame.
+ #
+ # This can be used to effectively determine aggregations on a row level, and can
+ # be applied to any DataType that can be supercast (cast to a similar parent
+ # type).
+ #
+ # Examples of the supercast rules when applying an arithmetic operation on two
+ # DataTypes:
+ #
+ # i8 + str = str
+ # f32 + i64 = f32
+ # f32 + f64 = f64
+ #
+ # @return [Series]
+ #
+ # @example A horizontal sum operation:
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => [2, 1, 3],
+ #       "b" => [1, 2, 3],
+ #       "c" => [1.0, 2.0, 3.0]
+ #     }
+ #   )
+ #   df.fold { |s1, s2| s1 + s2 }
+ #   # =>
+ #   # shape: (3,)
+ #   # Series: 'a' [f64]
+ #   # [
+ #   #   4.0
+ #   #   5.0
+ #   #   9.0
+ #   # ]
+ #
+ # @example A horizontal minimum operation:
+ #   df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
+ #   df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
+ #   # =>
+ #   # shape: (3,)
+ #   # Series: 'a' [f64]
+ #   # [
+ #   #   1.0
+ #   #   1.0
+ #   #   3.0
+ #   # ]
+ #
+ # @example A horizontal string concatenation:
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => ["foo", "bar", 2],
+ #       "b" => [1, 2, 3],
+ #       "c" => [1.0, 2.0, 3.0]
+ #     }
+ #   )
+ #   df.fold { |s1, s2| s1 + s2 }
+ #   # =>
+ #   # shape: (3,)
+ #   # Series: 'a' [str]
+ #   # [
+ #   #   "foo11.0"
+ #   #   "bar22.0"
+ #   #   null
+ #   # ]
+ #
+ # @example A horizontal boolean or, similar to a row-wise .any():
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => [false, false, true],
+ #       "b" => [false, true, false]
+ #     }
+ #   )
+ #   df.fold { |s1, s2| s1 | s2 }
+ #   # =>
+ #   # shape: (3,)
+ #   # Series: 'a' [bool]
+ #   # [
+ #   #   false
+ #   #   true
+ #   #   true
+ #   # ]
+ def fold(&operation)
+   acc = to_series(0)
+   1.upto(width - 1) do |i|
+     acc = operation.call(acc, to_series(i))
+   end
+   acc
+ end
+
# Get a row as tuple, either by index or by predicate.
#
# @param index [Object]
#   Row index.
# @param by_predicate [Object]
@@ -3169,12 +4473,49 @@
# # └─────┴─────┘
def take_every(n)
  select(Utils.col("*").take_every(n))
end

- # def hash_rows
- # end
+ # Hash and combine the rows in this DataFrame.
+ #
+ # The hash value is of type `:u64`.
+ #
+ # @param seed [Integer]
+ #   Random seed parameter. Defaults to 0.
+ # @param seed_1 [Integer]
+ #   Random seed parameter. Defaults to `seed` if not set.
+ # @param seed_2 [Integer]
+ #   Random seed parameter. Defaults to `seed` if not set.
+ # @param seed_3 [Integer]
+ #   Random seed parameter. Defaults to `seed` if not set.
+ #
+ # @return [Series]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "foo" => [1, nil, 3, 4],
+ #       "ham" => ["a", "b", nil, "d"]
+ #     }
+ #   )
+ #   df.hash_rows(seed: 42)
+ #   # =>
+ #   # shape: (4,)
+ #   # Series: '' [u64]
+ #   # [
+ #   #   4238614331852490969
+ #   #   17976148875586754089
+ #   #   4702262519505526977
+ #   #   18144177983981041107
+ #   # ]
+ def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
+   k0 = seed
+   k1 = seed_1.nil? ? seed : seed_1
+   k2 = seed_2.nil? ? seed : seed_2
+   k3 = seed_3.nil? ? seed : seed_3
+   Utils.wrap_s(_df.hash_rows(k0, k1, k2, k3))
+ end

# Interpolate intermediate values. The interpolation method is linear.
#
# @return [DataFrame]
#
@@ -3295,11 +4636,23 @@
def initialize_copy(other)
  super
  self._df = _df._clone
end

- def hash_to_rbdf(data, columns: nil)
+ def _pos_idx(idx, dim)
+   if idx >= 0
+     idx
+   else
+     shape[dim] + idx
+   end
+ end
+
+ # def _pos_idxs
+ # end
+
+ # @private
+ def self.hash_to_rbdf(data, columns: nil)
  if !columns.nil?
    columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
    if data.empty? && dtypes
      data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
@@ -3311,15 +4664,38 @@
  end

  RbDataFrame.read_hash(data)
end

- def _unpack_columns(columns, lookup_names: nil)
-   [columns.keys, columns]
+ # @private
+ def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
+   if columns.is_a?(Hash)
+     columns = columns.to_a
+   end
+   column_names =
+     (columns || []).map.with_index do |col, i|
+       if col.is_a?(String)
+         col || "column_#{i}"
+       else
+         col[0]
+       end
+     end
+   if column_names.empty? && n_expected
+     column_names = n_expected.times.map { |i| "column_#{i}" }
+   end
+   # TODO zip_longest
+   lookup = column_names.zip(lookup_names || []).to_h
+
+   [
+     column_names,
+     (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
+       [lookup[col[0]] || col[0], col[1]]
+     end
+   ]
end

- def _handle_columns_arg(data, columns: nil)
+ def self._handle_columns_arg(data, columns: nil)
  if columns.nil?
    data
  else
    if data.empty?
      columns.map { |c| Series.new(c, nil)._s }
@@ -3333,17 +4709,42 @@
      raise ArgumentError, "Dimensions of columns arg must match data dimensions."
    end
  end
end

- def sequence_to_rbdf(data, columns: nil, orient: nil)
-   if columns || orient
-     raise Todo
+ # @private
+ def self.sequence_to_rbdf(data, columns: nil, orient: nil)
+   if data.length == 0
+     return hash_to_rbdf({}, columns: columns)
    end
-   RbDataFrame.new(data.map(&:_s))
+
+   if data[0].is_a?(Series)
+     # series_names = data.map(&:name)
+     # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
+     data_series = []
+     data.each do |s|
+       data_series << s._s
+     end
+   elsif data[0].is_a?(Array)
+     if orient.nil? && !columns.nil?
+       orient = columns.length == data.length ? "col" : "row"
+     end
+
+     if orient == "row"
+       raise Todo
+     elsif orient == "col" || orient.nil?
+       raise Todo
+     else
+       raise ArgumentError, "orient must be one of {'col', 'row', nil}, got #{orient} instead."
+     end
+   end
+
+   data_series = _handle_columns_arg(data_series, columns: columns)
+   RbDataFrame.new(data_series)
end

- def series_to_rbdf(data, columns: nil)
+ # @private
+ def self.series_to_rbdf(data, columns: nil)
  if columns
    raise Todo
  end
  RbDataFrame.new([data._s])
end
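A quick round trip through the surface added in 0.1.5 — a sketch only: it assumes a writable /tmp path and a public `Polars.read_avro` front-end for the private `_read_avro` shown in this diff.

    require "polars-df"

    df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
    df.to_hashes                        # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
    df.write_avro("/tmp/example.avro")  # compression defaults to "uncompressed"
    Polars.read_avro("/tmp/example.avro").to_hashes == df.to_hashes # => true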