lib/polars/data_frame.rb in polars-df-0.1.4 vs lib/polars/data_frame.rb in polars-df-0.1.5
- old
+ new
@@ -24,18 +24,18 @@
data[k] = result.rows.map { |r| r[i] }
end
end
if data.nil?
- self._df = hash_to_rbdf({}, columns: columns)
+ self._df = self.class.hash_to_rbdf({}, columns: columns)
elsif data.is_a?(Hash)
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
- self._df = hash_to_rbdf(data, columns: columns)
+ self._df = self.class.hash_to_rbdf(data, columns: columns)
elsif data.is_a?(Array)
- self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
+ self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
elsif data.is_a?(Series)
- self._df = series_to_rbdf(data, columns: columns)
+ self._df = self.class.series_to_rbdf(data, columns: columns)
else
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
end
end
@@ -44,15 +44,20 @@
df = DataFrame.allocate
df._df = rb_df
df
end
- # def self._from_hashes
- # end
+ # @private
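+ # @example Illustrative sketch of this private helper:
+ # Polars::DataFrame._from_hashes([{"a" => 1}, {"a" => 2}])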
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil)
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
+ _from_rbdf(rbdf)
+ end
- # def self._from_hash
- # end
+ # @private
+ def self._from_hash(data, columns: nil)
+ _from_rbdf(hash_to_rbdf(data, columns: columns))
+ end
# def self._from_records
# end
# def self._from_numo
@@ -184,12 +189,18 @@
low_memory
)
)
end
- # def self._read_avro
- # end
+ # @private
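+ # @example Illustrative sketch ("data.avro" is a hypothetical path):
+ # Polars::DataFrame._read_avro("data.avro", n_rows: 100)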
+ def self._read_avro(file, columns: nil, n_rows: nil)
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+ projection, columns = Utils.handle_projection_columns(columns)
+ _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
+ end
# @private
def self._read_ipc(
file,
columns: nil,
@@ -484,16 +495,10 @@
end
# def each
# end
- # def _pos_idx
- # end
-
- # def _pos_idxs
- # end
-
# Returns subset of the DataFrame.
#
# @return [Object]
def [](*args)
if args.size == 2
@@ -552,23 +557,37 @@
return Utils.wrap_s(_df.column(item))
end
# df[idx]
if item.is_a?(Integer)
- return slice(_pos_idx(item, dim: 0), 1)
+ return slice(_pos_idx(item, 0), 1)
end
# df[..]
if item.is_a?(Range)
return Slice.new(self).apply(item)
end
+
+ if Utils.is_str_sequence(item, allow_str: false)
+ # select multiple columns
+ # df[["foo", "bar"]]
+ return _from_rbdf(_df.select(item))
+ end
end
raise ArgumentError, "Cannot get item of type: #{item.class.name}"
end
+ # Set item.
+ #
+ # @return [Object]
# def []=(key, value)
+ # if key.is_a?(String)
+ # raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
+ # end
+
+ # raise Todo
# end
# no to_arrow
# Convert DataFrame to a hash mapping column name to values.
@@ -580,13 +599,29 @@
else
get_columns.to_h { |s| [s.name, s.to_a] }
end
end
- # def to_hashes / to_a
- # end
+ # Convert every row to a hash.
+ #
+ # Note that this is slow.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+ # df.to_hashes
+ # # =>
+ # # [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
+ def to_hashes
+ rbdf = _df
+ names = columns
+ height.times.map do |i|
+ names.zip(rbdf.row_tuple(i)).to_h
+ end
+ end
+
# def to_numo
# end
# no to_pandas
@@ -760,13 +795,29 @@
null_value,
)
nil
end
- # def write_avro
- # end
+ # Write to Apache Avro file.
+ #
+ # @param file [String]
+ # File path to which the file should be written.
+ # @param compression ["uncompressed", "snappy", "deflate"]
+ # Compression method. Defaults to "uncompressed".
+ #
+ # @return [nil]
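+ #
+ # @example A minimal sketch ("file.avro" is a hypothetical path):
+ # df.write_avro("file.avro", "deflate")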
+ def write_avro(file, compression = "uncompressed")
+ if compression.nil?
+ compression = "uncompressed"
+ end
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+ _df.write_avro(file, compression)
+ end
+
# Write to Arrow IPC binary stream or Feather file.
#
# @param file [String]
# File path to which the file should be written.
# @param compression ["uncompressed", "lz4", "zstd"]
@@ -864,13 +915,89 @@
def estimated_size(unit = "b")
sz = _df.estimated_size
Utils.scale_bytes(sz, to: unit)
end
- # def transpose
- # end
+ # Transpose a DataFrame over the diagonal.
+ #
+ # @param include_header [Boolean]
+ # If set, the column names will be added as the first column.
+ # @param header_name [String]
+ # If `include_header` is set, this determines the name of the column that will
+ # be inserted.
+ # @param column_names [Array]
+ # Optional generator/iterator that yields column names. Will be used to
+ # replace the columns in the DataFrame.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This is a very expensive operation. Perhaps you can do it differently.
+ #
+ # @example
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
+ # df.transpose(include_header: true)
+ # # =>
+ # # shape: (2, 4)
+ # # ┌────────┬──────────┬──────────┬──────────┐
+ # # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞════════╪══════════╪══════════╪══════════╡
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
+ # # └────────┴──────────┴──────────┴──────────┘
+ #
+ # @example Replace the auto-generated column names with a list
+ # df.transpose(include_header: false, column_names: ["a", "b", "c"])
+ # # =>
+ # # shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 2 ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 1 ┆ 2 ┆ 3 │
+ # # └─────┴─────┴─────┘
+ #
+ # @example Include the header as a separate column
+ # df.transpose(
+ # include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
+ # )
+ # # =>
+ # # shape: (2, 4)
+ # # ┌─────┬─────┬─────┬─────┐
+ # # │ foo ┆ a ┆ b ┆ c │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╪═════╡
+ # # │ a ┆ 1 ┆ 2 ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ b ┆ 1 ┆ 2 ┆ 3 │
+ # # └─────┴─────┴─────┴─────┘
+ def transpose(include_header: false, header_name: "column", column_names: nil)
+ df = _from_rbdf(_df.transpose(include_header, header_name))
+ if !column_names.nil?
+ names = []
+ n = df.width
+ if include_header
+ names << header_name
+ n -= 1
+ end
+ column_names = column_names.each
+ n.times do
+ names << column_names.next
+ end
+ df.columns = names
+ end
+ df
+ end
+
# Reverse the DataFrame.
#
# @return [DataFrame]
#
# @example
@@ -1460,12 +1587,52 @@
subset = [subset]
end
_from_rbdf(_df.drop_nulls(subset))
end
- # def pipe
- # end
+ # Offers a structured way to apply a sequence of user-defined functions (UDFs).
+ #
+ # @param func [Object]
+ # Callable; will receive the frame as the first parameter,
+ # followed by any given args/kwargs.
+ # @param args [Object]
+ # Arguments to pass to the UDF.
+ # @param kwargs [Object]
+ # Keyword arguments to pass to the UDF.
+ #
+ # @return [Object]
+ #
+ # @note
+ # It is recommended to use LazyFrame when piping operations, in order
+ # to fully take advantage of query optimization and parallelization.
+ # See {#lazy}.
+ #
+ # @example
+ # cast_str_to_int = lambda do |data, col_name:|
+ # data.with_column(Polars.col(col_name).cast(:i64))
+ # end
+ #
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
+ # df.pipe(cast_str_to_int, col_name: "b")
+ # # =>
+ # # shape: (4, 2)
+ # # ┌─────┬─────┐
+ # # │ a ┆ b │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ 1 ┆ 10 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 20 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 30 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 4 ┆ 40 │
+ # # └─────┴─────┘
+ def pipe(func, *args, **kwargs, &block)
+ func.call(self, *args, **kwargs, &block)
+ end
# Add a column at index 0 that counts the rows.
#
# @param name [String]
# Name of the column to add.
@@ -1545,22 +1712,617 @@
self.class,
maintain_order: maintain_order
)
end
- # def groupby_rolling
- # end
+ # Create rolling groups based on a time column.
+ #
+ # Also works for index values of type `:i32` or `:i64`.
+ #
+ # Unlike `groupby_dynamic`, the windows are determined by the individual
+ # values rather than by constant intervals. For constant intervals use
+ # `groupby_dynamic`.
+ #
+ # The `period` and `offset` arguments are created either from a timedelta, or
+ # by using the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a groupby_rolling on an integer column, the windows are defined by:
+ #
+ # - **"1i" # length 1**
+ # - **"10i" # length 10**
+ #
+ # @param index_column [Object]
+ # Column used to group based on the time window.
+ # Often of type Date/Datetime.
+ # This column must be sorted in ascending order. If not, the output will not
+ # make sense.
+ #
+ # In case of a rolling groupby on indices, dtype needs to be one of
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ # performance matters use an `:i64` column.
+ # @param period [Object]
+ # Length of the window.
+ # @param offset [Object]
+ # Offset of the window. Default is -period.
+ # @param closed ["right", "left", "both", "none"]
+ # Define whether the temporal window interval is closed or not.
+ # @param by [Object]
+ # Also group by this column/these columns.
+ #
+ # @return [RollingGroupBy]
+ #
+ # @example
+ # dates = [
+ # "2020-01-01 13:45:48",
+ # "2020-01-01 16:42:13",
+ # "2020-01-01 16:45:09",
+ # "2020-01-02 18:12:48",
+ # "2020-01-03 19:45:32",
+ # "2020-01-08 23:16:43"
+ # ]
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
+ # Polars.col("dt").str.strptime(:datetime)
+ # )
+ # df.groupby_rolling(index_column: "dt", period: "2d").agg(
+ # [
+ # Polars.sum("a").alias("sum_a"),
+ # Polars.min("a").alias("min_a"),
+ # Polars.max("a").alias("max_a")
+ # ]
+ # )
+ # # =>
+ # # shape: (6, 4)
+ # # ┌─────────────────────┬───────┬───────┬───────┐
+ # # │ dt ┆ sum_a ┆ min_a ┆ max_a │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞═════════════════════╪═══════╪═══════╪═══════╡
+ # # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
+ # # └─────────────────────┴───────┴───────┴───────┘
+ def groupby_rolling(
+ index_column:,
+ period:,
+ offset: nil,
+ closed: "right",
+ by: nil
+ )
+ RollingGroupBy.new(self, index_column, period, offset, closed, by)
+ end
- # def groupby_dynamic
- # end
+ # Group based on a time value (or index value of type `:i32`, `:i64`).
+ #
+ # Time windows are calculated and rows are assigned to windows. Unlike a
+ # normal groupby, a row can be a member of multiple groups. The time/index
+ # window could be seen as a rolling window, with a window size determined by
+ # dates/times/values instead of slots in the DataFrame.
+ #
+ # A window is defined by:
+ #
+ # - every: interval of the window
+ # - period: length of the window
+ # - offset: offset of the window
+ #
+ # The `every`, `period` and `offset` arguments are created with
+ # the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # In case of a groupby_dynamic on an integer column, the windows are defined by:
+ #
+ # - "1i" # length 1
+ # - "10i" # length 10
+ #
+ # @param index_column [Object]
+ # Column used to group based on the time window.
+ # Often of type Date/Datetime.
+ # This column must be sorted in ascending order. If not, the output will not
+ # make sense.
+ #
+ # In case of a dynamic groupby on indices, dtype needs to be one of
+ # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
+ # performance matters use an `:i64` column.
+ # @param every [Object]
+ # Interval of the window.
+ # @param period [Object]
+ # Length of the window. If nil, it is equal to `every`.
+ # @param offset [Object]
+ # Offset of the window. If nil and `period` is nil, it will be equal to negative
+ # `every`.
+ # @param truncate [Boolean]
+ # Truncate the time value to the window lower bound.
+ # @param include_boundaries [Boolean]
+ # Add the lower and upper bound of the window to the "_lower_boundary" and
+ # "_upper_boundary" columns. This will impact performance because it's harder to
+ # parallelize.
+ # @param closed ["right", "left", "both", "none"]
+ # Define whether the temporal window interval is closed or not.
+ # @param by [Object]
+ # Also group by this column/these columns.
+ #
+ # @return [DynamicGroupBy]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => Polars.date_range(
+ # DateTime.new(2021, 12, 16),
+ # DateTime.new(2021, 12, 16, 3),
+ # "30m"
+ # ),
+ # "n" => 0..6
+ # }
+ # )
+ # # =>
+ # # shape: (7, 2)
+ # # ┌─────────────────────┬─────┐
+ # # │ time ┆ n │
+ # # │ --- ┆ --- │
+ # # │ datetime[μs] ┆ i64 │
+ # # ╞═════════════════════╪═════╡
+ # # │ 2021-12-16 00:00:00 ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 00:30:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 01:30:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 4 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 02:30:00 ┆ 5 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 6 │
+ # # └─────────────────────┴─────┘
+ #
+ # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
+ # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+ # [
+ # Polars.col("time").min.alias("time_min"),
+ # Polars.col("time").max.alias("time_max")
+ # ]
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┐
+ # # │ time ┆ time_min ┆ time_max │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
+ # # └─────────────────────┴─────────────────────┴─────────────────────┘
+ #
+ # @example The window boundaries can also be added to the aggregation result.
+ # df.groupby_dynamic(
+ # "time", every: "1h", include_boundaries: true, closed: "right"
+ # ).agg([Polars.col("time").count.alias("time_count")])
+ # # =>
+ # # shape: (4, 4)
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ # # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example When closed="left", should not include right end of interval.
+ # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+ # [
+ # Polars.col("time").count.alias("time_count"),
+ # Polars.col("time").list.alias("time_agg_list")
+ # ]
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
+ # # │ time ┆ time_count ┆ time_agg_list │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
+ # # └─────────────────────┴────────────┴─────────────────────────────────────┘
+ #
+ # @example When closed="both" the time values at the window boundaries belong to 2 groups.
+ # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+ # [Polars.col("time").count.alias("time_count")]
+ # )
+ # # =>
+ # # shape: (5, 2)
+ # # ┌─────────────────────┬────────────┐
+ # # │ time ┆ time_count │
+ # # │ --- ┆ --- │
+ # # │ datetime[μs] ┆ u32 │
+ # # ╞═════════════════════╪════════════╡
+ # # │ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 00:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 01:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 02:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2021-12-16 03:00:00 ┆ 1 │
+ # # └─────────────────────┴────────────┘
+ #
+ # @example Dynamic groupbys can also be combined with grouping on normal keys.
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => Polars.date_range(
+ # DateTime.new(2021, 12, 16),
+ # DateTime.new(2021, 12, 16, 3),
+ # "30m"
+ # ),
+ # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
+ # }
+ # )
+ # df.groupby_dynamic(
+ # "time",
+ # every: "1h",
+ # closed: "both",
+ # by: "groups",
+ # include_boundaries: true
+ # ).agg([Polars.col("time").count.alias("time_count")])
+ # # =>
+ # # shape: (7, 5)
+ # # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
+ # # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
+ # # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
+ # # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
+ # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
+ #
+ # @example Dynamic groupby on an index column.
+ # df = Polars::DataFrame.new(
+ # {
+ # "idx" => Polars.arange(0, 6, eager: true),
+ # "A" => ["A", "A", "B", "B", "B", "C"]
+ # }
+ # )
+ # df.groupby_dynamic(
+ # "idx",
+ # every: "2i",
+ # period: "3i",
+ # include_boundaries: true,
+ # closed: "right"
+ # ).agg(Polars.col("A").list.alias("A_agg_list"))
+ # # =>
+ # # shape: (3, 4)
+ # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
+ # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
+ # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
+ # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
+ # # └─────────────────┴─────────────────┴─────┴─────────────────┘
+ def groupby_dynamic(
+ index_column,
+ every:,
+ period: nil,
+ offset: nil,
+ truncate: true,
+ include_boundaries: false,
+ closed: "left",
+ by: nil
+ )
+ DynamicGroupBy.new(
+ self,
+ index_column,
+ every,
+ period,
+ offset,
+ truncate,
+ include_boundaries,
+ closed,
+ by
+ )
+ end
- # def upsample
- # end
+ # Upsample a DataFrame at a regular frequency.
+ #
+ # @param time_column [Object]
+ # Time column that will be used to determine a date_range.
+ # Note that this column has to be sorted for the output to make sense.
+ # @param every [String]
+ # Interval at which to upsample; the interval will start `every` duration.
+ # @param offset [String]
+ # Change the start of the date_range by this offset.
+ # @param by [Object]
+ # First group by these columns and then upsample for every group.
+ # @param maintain_order [Boolean]
+ # Keep the ordering predictable. This is slower.
+ #
+ # The `every` and `offset` arguments are created with
+ # the following string language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # @return [DataFrame]
+ #
+ # @example Upsample a DataFrame by a certain interval.
+ # df = Polars::DataFrame.new(
+ # {
+ # "time" => [
+ # DateTime.new(2021, 2, 1),
+ # DateTime.new(2021, 4, 1),
+ # DateTime.new(2021, 5, 1),
+ # DateTime.new(2021, 6, 1)
+ # ],
+ # "groups" => ["A", "B", "A", "B"],
+ # "values" => [0, 1, 2, 3]
+ # }
+ # )
+ # df.upsample(
+ # time_column: "time", every: "1mo", by: "groups", maintain_order: true
+ # ).select(Polars.all.forward_fill)
+ # # =>
+ # # shape: (7, 3)
+ # # ┌─────────────────────┬────────┬────────┐
+ # # │ time ┆ groups ┆ values │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[ns] ┆ str ┆ i64 │
+ # # ╞═════════════════════╪════════╪════════╡
+ # # │ 2021-02-01 00:00:00 ┆ A ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-03-01 00:00:00 ┆ A ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-04-01 00:00:00 ┆ A ┆ 0 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-05-01 00:00:00 ┆ A ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-04-01 00:00:00 ┆ B ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-05-01 00:00:00 ┆ B ┆ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2021-06-01 00:00:00 ┆ B ┆ 3 │
+ # # └─────────────────────┴────────┴────────┘
+ def upsample(
+ time_column:,
+ every:,
+ offset: nil,
+ by: nil,
+ maintain_order: false
+ )
+ if by.nil?
+ by = []
+ end
+ if by.is_a?(String)
+ by = [by]
+ end
+ if offset.nil?
+ offset = "0ns"
+ end
- # def join_asof
- # end
+ every = Utils._timedelta_to_pl_duration(every)
+ offset = Utils._timedelta_to_pl_duration(offset)
+ _from_rbdf(
+ _df.upsample(by, time_column, every, offset, maintain_order)
+ )
+ end
+
+ # Perform an asof join.
+ #
+ # This is similar to a left-join except that we match on nearest key rather than
+ # equal keys.
+ #
+ # Both DataFrames must be sorted by the asof_join key.
+ #
+ # For each row in the left DataFrame:
+ #
+ # - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
+ # - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
+ #
+ # The default is "backward".
+ #
+ # @param other [DataFrame]
+ # DataFrame to join with.
+ # @param left_on [String]
+ # Join column of the left DataFrame.
+ # @param right_on [String]
+ # Join column of the right DataFrame.
+ # @param on [String]
+ # Join column of both DataFrames. If set, `left_on` and `right_on` should be
+ # nil.
+ # @param by [Object]
+ # Join on these columns before doing the asof join.
+ # @param by_left [Object]
+ # Join on these columns before doing the asof join.
+ # @param by_right [Object]
+ # Join on these columns before doing the asof join.
+ # @param strategy ["backward", "forward"]
+ # Join strategy.
+ # @param suffix [String]
+ # Suffix to append to columns with a duplicate name.
+ # @param tolerance [Object]
+ # Numeric tolerance. By setting this the join will only be done if the near
+ # keys are within this distance. If an asof join is done on columns of dtype
+ # "Date", "Datetime", "Duration" or "Time", you can use the following string
+ # language:
+ #
+ # - 1ns (1 nanosecond)
+ # - 1us (1 microsecond)
+ # - 1ms (1 millisecond)
+ # - 1s (1 second)
+ # - 1m (1 minute)
+ # - 1h (1 hour)
+ # - 1d (1 day)
+ # - 1w (1 week)
+ # - 1mo (1 calendar month)
+ # - 1y (1 calendar year)
+ # - 1i (1 index count)
+ #
+ # Or combine them:
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ #
+ # @param allow_parallel [Boolean]
+ # Allow the physical plan to optionally evaluate the computation of both
+ # DataFrames up to the join in parallel.
+ # @param force_parallel [Boolean]
+ # Force the physical plan to evaluate the computation of both DataFrames up to
+ # the join in parallel.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # gdp = Polars::DataFrame.new(
+ # {
+ # "date" => [
+ # DateTime.new(2016, 1, 1),
+ # DateTime.new(2017, 1, 1),
+ # DateTime.new(2018, 1, 1),
+ # DateTime.new(2019, 1, 1),
+ # ], # note record date: Jan 1st (sorted!)
+ # "gdp" => [4164, 4411, 4566, 4696]
+ # }
+ # )
+ # population = Polars::DataFrame.new(
+ # {
+ # "date" => [
+ # DateTime.new(2016, 5, 12),
+ # DateTime.new(2017, 5, 12),
+ # DateTime.new(2018, 5, 12),
+ # DateTime.new(2019, 5, 12),
+ # ], # note record date: May 12th (sorted!)
+ # "population" => [82.19, 82.66, 83.12, 83.52]
+ # }
+ # )
+ # population.join_asof(
+ # gdp, left_on: "date", right_on: "date", strategy: "backward"
+ # )
+ # # =>
+ # # shape: (4, 3)
+ # # ┌─────────────────────┬────────────┬──────┐
+ # # │ date ┆ population ┆ gdp │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ datetime[ns] ┆ f64 ┆ i64 │
+ # # ╞═════════════════════╪════════════╪══════╡
+ # # │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
+ # # └─────────────────────┴────────────┴──────┘
+ def join_asof(
+ other,
+ left_on: nil,
+ right_on: nil,
+ on: nil,
+ by_left: nil,
+ by_right: nil,
+ by: nil,
+ strategy: "backward",
+ suffix: "_right",
+ tolerance: nil,
+ allow_parallel: true,
+ force_parallel: false
+ )
+ lazy
+ .join_asof(
+ other.lazy,
+ left_on: left_on,
+ right_on: right_on,
+ on: on,
+ by_left: by_left,
+ by_right: by_right,
+ by: by,
+ strategy: strategy,
+ suffix: suffix,
+ tolerance: tolerance,
+ allow_parallel: allow_parallel,
+ force_parallel: force_parallel
+ )
+ .collect(no_optimization: true)
+ end
+
# Join in SQL-like fashion.
#
# @param other [DataFrame]
# DataFrame to join with.
# @param left_on [Object]
@@ -1673,12 +2435,82 @@
suffix: suffix,
)
.collect(no_optimization: true)
end
- # def apply
- # end
+ # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
+ #
+ # The UDF will receive each row as an array of values: `udf(row)`.
+ #
+ # Implementing logic using a Ruby function is almost always _significantly_
+ # slower and more memory intensive than implementing the same logic using
+ # the native expression API because:
+ #
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
+ # - Polars-native expressions can be parallelised (UDFs cannot).
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
+ #
+ # Wherever possible you should strongly prefer the native expression API
+ # to achieve the best performance.
+ #
+ # @param return_dtype [Symbol]
+ # Output type of the operation. If none given, Polars tries to infer the type.
+ # @param inference_size [Integer]
+ # Only used in the case when the custom function returns rows.
+ # This uses the first `n` rows to determine the output schema.
+ #
+ # @return [Object]
+ #
+ # @note
+ # The frame-level `apply` cannot track column names (as the UDF is a black-box
+ # that may arbitrarily drop, rearrange, transform, or add new columns); if you
+ # want to apply a UDF such that column names are preserved, you should use the
+ # expression-level `apply` syntax instead.
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
+ #
+ # @example Return a DataFrame by mapping each row to an array:
+ # df.apply { |t| [t[0] * 2, t[1] * 3] }
+ # # =>
+ # # shape: (3, 2)
+ # # ┌──────────┬──────────┐
+ # # │ column_0 ┆ column_1 │
+ # # │ --- ┆ --- │
+ # # │ i64 ┆ i64 │
+ # # ╞══════════╪══════════╡
+ # # │ 2 ┆ -3 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 4 ┆ 15 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 6 ┆ 24 │
+ # # └──────────┴──────────┘
+ #
+ # @example Return a one-column DataFrame by mapping each row to a scalar:
+ # df.apply { |t| t[0] * 2 + t[1] }
+ # # =>
+ # # shape: (3, 1)
+ # # ┌───────┐
+ # # │ apply │
+ # # │ --- │
+ # # │ i64 │
+ # # ╞═══════╡
+ # # │ 1 │
+ # # ├╌╌╌╌╌╌╌┤
+ # # │ 9 │
+ # # ├╌╌╌╌╌╌╌┤
+ # # │ 14 │
+ # # └───────┘
+ def apply(return_dtype: nil, inference_size: 256, &f)
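+ # The native apply reports whether the UDF returned rows (a DataFrame) or scalars (a Series).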
+ out, is_df = _df.apply(f, return_dtype, inference_size)
+ if is_df
+ _from_rbdf(out)
+ else
+ _from_rbdf(Utils.wrap_s(out).to_frame._df)
+ end
+ end
# Return a new DataFrame with the column added or replaced.
#
# @param column [Object]
# Series, where the name of the Series refers to the column in the DataFrame.
@@ -2176,22 +3008,409 @@
# # └─────────┴─────────┘
def explode(columns)
lazy.explode(columns).collect(no_optimization: true)
end
- # def pivot
- # end
+ # Create a spreadsheet-style pivot table as a DataFrame.
+ #
+ # @param values [Object]
+ # Column values to aggregate. Can be multiple columns if the `columns`
+ # argument contains multiple columns as well.
+ # @param index [Object]
+ # One or multiple keys to group by.
+ # @param columns [Object]
+ # Columns whose values will be used as the header of the output DataFrame.
+ # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
+ # A predefined aggregate function (as a string) or an expression.
+ # @param maintain_order [Boolean]
+ # Sort the grouped keys so that the output order is predictable.
+ # @param sort_columns [Boolean]
+ # Sort the transposed columns by name. Default is by order of discovery.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => ["one", "one", "one", "two", "two", "two"],
+ # "bar" => ["A", "B", "C", "A", "B", "C"],
+ # "baz" => [1, 2, 3, 4, 5, 6]
+ # }
+ # )
+ # df.pivot(values: "baz", index: "foo", columns: "bar")
+ # # =>
+ # # shape: (2, 4)
+ # # ┌─────┬─────┬─────┬─────┐
+ # # │ foo ┆ A ┆ B ┆ C │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╪═════╡
+ # # │ one ┆ 1 ┆ 2 ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ two ┆ 4 ┆ 5 ┆ 6 │
+ # # └─────┴─────┴─────┴─────┘
+ def pivot(
+ values:,
+ index:,
+ columns:,
+ aggregate_fn: "first",
+ maintain_order: true,
+ sort_columns: false
+ )
+ if values.is_a?(String)
+ values = [values]
+ end
+ if index.is_a?(String)
+ index = [index]
+ end
+ if columns.is_a?(String)
+ columns = [columns]
+ end
- # def melt
- # end
+ if aggregate_fn.is_a?(String)
+ case aggregate_fn
+ when "first"
+ aggregate_fn = Polars.element.first
+ when "sum"
+ aggregate_fn = Polars.element.sum
+ when "max"
+ aggregate_fn = Polars.element.max
+ when "min"
+ aggregate_fn = Polars.element.min
+ when "mean"
+ aggregate_fn = Polars.element.mean
+ when "median"
+ aggregate_fn = Polars.element.median
+ when "last"
+ aggregate_fn = Polars.element.last
+ when "count"
+ aggregate_fn = Polars.count
+ else
+ raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
+ end
+ end
- # def unstack
- # end
+ _from_rbdf(
+ _df.pivot_expr(
+ values,
+ index,
+ columns,
+ aggregate_fn._rbexpr,
+ maintain_order,
+ sort_columns
+ )
+ )
+ end
- # def partition_by
- # end
+ # Unpivot a DataFrame from wide to long format.
+ #
+ # Optionally leaves identifiers set.
+ #
+ # This function is useful to massage a DataFrame into a format where one or more
+ # columns are identifier variables (id_vars), while all other columns, considered
+ # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+ # two non-identifier columns, 'variable' and 'value'.
+ #
+ # @param id_vars [Object]
+ # Columns to use as identifier variables.
+ # @param value_vars [Object]
+ # Values to use as value variables.
+ # If `value_vars` is empty, all columns that are not in `id_vars` will be used.
+ # @param variable_name [String]
+ # Name to give to the `variable` column. Defaults to "variable".
+ # @param value_name [String]
+ # Name to give to the `value` column. Defaults to "value".
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["x", "y", "z"],
+ # "b" => [1, 3, 5],
+ # "c" => [2, 4, 6]
+ # }
+ # )
+ # df.melt(id_vars: "a", value_vars: ["b", "c"])
+ # # =>
+ # # shape: (6, 3)
+ # # ┌─────┬──────────┬───────┐
+ # # │ a ┆ variable ┆ value │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ str ┆ i64 │
+ # # ╞═════╪══════════╪═══════╡
+ # # │ x ┆ b ┆ 1 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ y ┆ b ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ z ┆ b ┆ 5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ x ┆ c ┆ 2 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ y ┆ c ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+ # # │ z ┆ c ┆ 6 │
+ # # └─────┴──────────┴───────┘
+ def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+ if value_vars.is_a?(String)
+ value_vars = [value_vars]
+ end
+ if id_vars.is_a?(String)
+ id_vars = [id_vars]
+ end
+ if value_vars.nil?
+ value_vars = []
+ end
+ if id_vars.nil?
+ id_vars = []
+ end
+ _from_rbdf(
+ _df.melt(id_vars, value_vars, value_name, variable_name)
+ )
+ end
+ # Unstack a long table to a wide form without doing an aggregation.
+ #
+ # This can be much faster than a pivot, because it can skip the grouping phase.
+ #
+ # @note
+ # This functionality is experimental and may be subject to changes
+ # without it being considered a breaking change.
+ #
+ # @param step [Integer]
+ # Number of rows in the unstacked frame.
+ # @param how ["vertical", "horizontal"]
+ # Direction of the unstack.
+ # @param columns [Object]
+ # Columns to include in the operation.
+ # @param fill_values [Object]
+ # Fill values that don't fit the new size with this value.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "col1" => "A".."I",
+ # "col2" => Polars.arange(0, 9, eager: true)
+ # }
+ # )
+ # # =>
+ # # shape: (9, 2)
+ # # ┌──────┬──────┐
+ # # │ col1 ┆ col2 │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞══════╪══════╡
+ # # │ A ┆ 0 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ B ┆ 1 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ C ┆ 2 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ D ┆ 3 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ ... ┆ ... │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ F ┆ 5 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ G ┆ 6 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ H ┆ 7 │
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ I ┆ 8 │
+ # # └──────┴──────┘
+ #
+ # @example
+ # df.unstack(step: 3, how: "vertical")
+ # # =>
+ # # shape: (3, 6)
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
+ # # │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
+ #
+ # @example
+ # df.unstack(step: 3, how: "horizontal")
+ # # =>
+ # # shape: (3, 6)
+ # # ┌────────┬────────┬────────┬────────┬────────┬────────┐
+ # # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
+ # # ╞════════╪════════╪════════╪════════╪════════╪════════╡
+ # # │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │
+ # # └────────┴────────┴────────┴────────┴────────┴────────┘
+ def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
+ if !columns.nil?
+ df = select(columns)
+ else
+ df = self
+ end
+
+ height = df.height
+ if how == "vertical"
+ n_rows = step
+ n_cols = (height / n_rows.to_f).ceil
+ else
+ n_cols = step
+ n_rows = (height / n_cols.to_f).ceil
+ end
+
+ n_fill = n_cols * n_rows - height
+
+ if n_fill > 0
+ if !fill_values.is_a?(Array)
+ fill_values = [fill_values] * df.width
+ end
+
+ df = df.select(
+ df.get_columns.zip(fill_values).map do |s, next_fill|
+ s.extend_constant(next_fill, n_fill)
+ end
+ )
+ end
+
+ if how == "horizontal"
+ df = (
+ df.with_column(
+ (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
+ "__sort_order"
+ )
+ )
+ .sort("__sort_order")
+ .drop("__sort_order")
+ )
+ end
+
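+ # Width used to zero-pad the slice number in the generated column names.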
+ zfill_val = Math.log10(n_cols).floor + 1
+ slices =
+ df.get_columns.flat_map do |s|
+ n_cols.times.map do |slice_nbr|
+ s.slice(slice_nbr * n_rows, n_rows).alias("%s_%0#{zfill_val}d" % [s.name, slice_nbr])
+ end
+ end
+
+ _from_rbdf(DataFrame.new(slices)._df)
+ end
+
+ # Split into multiple DataFrames partitioned by groups.
+ #
+ # @param groups [Object]
+ # Groups to partition by.
+ # @param maintain_order [Boolean]
+ # Keep predictable output order. This is slower as it requires an extra sort
+ # operation.
+ # @param as_dict [Boolean]
+ # If true, return the partitions in a hash keyed by the distinct group
+ # values instead of an array.
+ #
+ # @return [Object]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => ["A", "A", "B", "B", "C"],
+ # "N" => [1, 2, 2, 4, 2],
+ # "bar" => ["k", "l", "m", "m", "l"]
+ # }
+ # )
+ # df.partition_by("foo", maintain_order: true)
+ # # =>
+ # # [shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ A ┆ 1 ┆ k │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ A ┆ 2 ┆ l │
+ # # └─────┴─────┴─────┘, shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ B ┆ 2 ┆ m │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ B ┆ 4 ┆ m │
+ # # └─────┴─────┴─────┘, shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ C ┆ 2 ┆ l │
+ # # └─────┴─────┴─────┘]
+ #
+ # @example
+ # df.partition_by("foo", maintain_order: true, as_dict: true)
+ # # =>
+ # # {"A"=>shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ A ┆ 1 ┆ k │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ A ┆ 2 ┆ l │
+ # # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ B ┆ 2 ┆ m │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ B ┆ 4 ┆ m │
+ # # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ N ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ str ┆ i64 ┆ str │
+ # # ╞═════╪═════╪═════╡
+ # # │ C ┆ 2 ┆ l │
+ # # └─────┴─────┴─────┘}
+ def partition_by(groups, maintain_order: true, as_dict: false)
+ if groups.is_a?(String)
+ groups = [groups]
+ elsif !groups.is_a?(Array)
+ groups = Array(groups)
+ end
+
+ if as_dict
+ out = {}
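+ # Key by the scalar group value for a single key, by the row tuple otherwise.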
+ if groups.length == 1
+ _df.partition_by(groups, maintain_order).each do |df|
+ df = _from_rbdf(df)
+ out[df[groups][0, 0]] = df
+ end
+ else
+ _df.partition_by(groups, maintain_order).each do |df|
+ df = _from_rbdf(df)
+ out[df[groups].row(0)] = df
+ end
+ end
+ out
+ else
+ _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
+ end
+ end
+
# Shift values by the given period.
#
# @param periods [Integer]
# Number of places to shift (may be negative).
#
@@ -3059,13 +4278,98 @@
n = 1
end
_from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end
- # def fold
- # end
+ # Apply a horizontal reduction on a DataFrame.
+ #
+ # This can be used to effectively determine aggregations on a row level, and can
+ # be applied to any DataType that can be supercasted (cast to a similar parent
+ # type).
+ #
+ # Examples of the supercast rules when applying an arithmetic operation on two
+ # DataTypes:
+ #
+ # i8 + str = str
+ # f32 + i64 = f32
+ # f32 + f64 = f64
+ #
+ # @return [Series]
+ #
+ # @example A horizontal sum operation:
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [2, 1, 3],
+ # "b" => [1, 2, 3],
+ # "c" => [1.0, 2.0, 3.0]
+ # }
+ # )
+ # df.fold { |s1, s2| s1 + s2 }
+ # # =>
+ # # shape: (3,)
+ # # Series: 'a' [f64]
+ # # [
+ # # 4.0
+ # # 5.0
+ # # 9.0
+ # # ]
+ #
+ # @example A horizontal minimum operation:
+ # df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
+ # df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
+ # # =>
+ # # shape: (3,)
+ # # Series: 'a' [f64]
+ # # [
+ # # 1.0
+ # # 1.0
+ # # 3.0
+ # # ]
+ #
+ # @example A horizontal string concatenation:
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => ["foo", "bar", 2],
+ # "b" => [1, 2, 3],
+ # "c" => [1.0, 2.0, 3.0]
+ # }
+ # )
+ # df.fold { |s1, s2| s1 + s2 }
+ # # =>
+ # # shape: (3,)
+ # # Series: 'a' [str]
+ # # [
+ # # "foo11.0"
+ # # "bar22.0"
+ # # null
+ # # ]
+ #
+ # @example A horizontal boolean or, similar to a row-wise .any():
+ # df = Polars::DataFrame.new(
+ # {
+ # "a" => [false, false, true],
+ # "b" => [false, true, false]
+ # }
+ # )
+ # df.fold { |s1, s2| s1 | s2 }
+ # # =>
+ # # shape: (3,)
+ # # Series: 'a' [bool]
+ # # [
+ # # false
+ # # true
+ # # true
+ # # ]
+ def fold(&operation)
+ acc = to_series(0)
+ 1.upto(width - 1) do |i|
+ acc = operation.call(acc, to_series(i))
+ end
+ acc
+ end
+
# Get a row as tuple, either by index or by predicate.
#
# @param index [Object]
# Row index.
# @param by_predicate [Object]
@@ -3169,12 +4473,49 @@
# # └─────┴─────┘
def take_every(n)
select(Utils.col("*").take_every(n))
end
- # def hash_rows
- # end
+ # Hash and combine the rows in this DataFrame.
+ #
+ # The hash value is of type `:u64`.
+ #
+ # @param seed [Integer]
+ # Random seed parameter. Defaults to 0.
+ # @param seed_1 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ # @param seed_2 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ # @param seed_3 [Integer]
+ # Random seed parameter. Defaults to `seed` if not set.
+ #
+ # @return [Series]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, nil, 3, 4],
+ # "ham" => ["a", "b", nil, "d"]
+ # }
+ # )
+ # df.hash_rows(seed: 42)
+ # # =>
+ # # shape: (4,)
+ # # Series: '' [u64]
+ # # [
+ # # 4238614331852490969
+ # # 17976148875586754089
+ # # 4702262519505526977
+ # # 18144177983981041107
+ # # ]
+ def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
+ k0 = seed
+ k1 = seed_1.nil? ? seed : seed_1
+ k2 = seed_2.nil? ? seed : seed_2
+ k3 = seed_3.nil? ? seed : seed_3
+ Utils.wrap_s(_df.hash_rows(k0, k1, k2, k3))
+ end
# Interpolate intermediate values. The interpolation method is linear.
#
# @return [DataFrame]
#
@@ -3295,11 +4636,23 @@
def initialize_copy(other)
super
self._df = _df._clone
end
- def hash_to_rbdf(data, columns: nil)
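+ # Resolve a possibly negative index to a positive position along dimension `dim`.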
+ def _pos_idx(idx, dim)
+ if idx >= 0
+ idx
+ else
+ shape[dim] + idx
+ end
+ end
+
+ # def _pos_idxs
+ # end
+
+ # @private
+ def self.hash_to_rbdf(data, columns: nil)
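+ # Explicit columns may carry dtypes; build typed empty series when no data is given.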
if !columns.nil?
columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
if data.empty? && dtypes
data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
@@ -3311,15 +4664,38 @@
end
RbDataFrame.read_hash(data)
end
- def _unpack_columns(columns, lookup_names: nil)
- [columns.keys, columns]
+ # @private
+ def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
+ if columns.is_a?(Hash)
+ columns = columns.to_a
+ end
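+ # Each entry is either a plain name or a [name, dtype] pair.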
+ column_names =
+ (columns || []).map.with_index do |col, i|
+ if col.is_a?(String)
+ col || "column_#{i}"
+ else
+ col[0]
+ end
+ end
+ if column_names.empty? && n_expected
+ column_names = n_expected.times.map { |i| "column_#{i}" }
+ end
+ # TODO zip_longest
+ lookup = column_names.zip(lookup_names || []).to_h
+
+ [
+ column_names,
+ (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
+ [lookup[col[0]] || col[0], col[1]]
+ end
+ ]
end
- def _handle_columns_arg(data, columns: nil)
+ def self._handle_columns_arg(data, columns: nil)
if columns.nil?
data
else
if data.empty?
columns.map { |c| Series.new(c, nil)._s }
@@ -3333,17 +4709,42 @@
raise ArgumentError, "Dimensions of columns arg must match data dimensions."
end
end
end
- def sequence_to_rbdf(data, columns: nil, orient: nil)
- if columns || orient
- raise Todo
+ # @private
+ def self.sequence_to_rbdf(data, columns: nil, orient: nil)
+ if data.length == 0
+ return hash_to_rbdf({}, columns: columns)
end
- RbDataFrame.new(data.map(&:_s))
+
+ if data[0].is_a?(Series)
+ # series_names = data.map(&:name)
+ # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
+ data_series = []
+ data.each do |s|
+ data_series << s._s
+ end
+ elsif data[0].is_a?(Array)
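+ # Infer orientation when not given: one column name per outer array implies column order.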
+ if orient.nil? && !columns.nil?
+ orient = columns.length == data.length ? "col" : "row"
+ end
+
+ if orient == "row"
+ raise Todo
+ elsif orient == "col" || orient.nil?
+ raise Todo
+ else
+ raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
+ end
+ end
+
+ data_series = _handle_columns_arg(data_series, columns: columns)
+ RbDataFrame.new(data_series)
end
- def series_to_rbdf(data, columns: nil)
+ # @private
+ def self.series_to_rbdf(data, columns: nil)
if columns
raise Todo
end
RbDataFrame.new([data._s])
end