lib/polars/data_frame.rb in polars-df-0.1.1 vs lib/polars/data_frame.rb in polars-df-0.1.2
- old
+ new
@@ -1,94 +1,338 @@
module Polars
+ # Two-dimensional data structure representing data as a table with rows and columns.
class DataFrame
+ # @private
attr_accessor :_df
- def initialize(data = nil)
+ # Create a new DataFrame.
+ #
+ # @param data [Hash, Array, Series, nil]
+ # Two-dimensional data in various forms. Hash must contain Arrays.
+ # Array may contain Series.
+ # @param columns [Array, Hash, nil]
+ # Column labels to use for resulting DataFrame. If specified, overrides any
+ # labels already present in the data. Must match data dimensions.
+ # @param orient ["col", "row", nil]
+ # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
+ # the orientation is inferred by matching the columns and data dimensions. If
+ # this does not yield conclusive results, column orientation is used.
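+ #
+ # @example Constructing from a Hash of Arrays (a minimal sketch; output is indicative):
+ #   df = Polars::DataFrame.new({
+ #     "a" => [1, 2, 3],
+ #     "b" => ["x", "y", "z"]
+ #   })
+ #   df.shape
+ #   # => [3, 2]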
+ def initialize(data = nil, columns: nil, orient: nil)
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
data = {}
result.columns.each_with_index do |k, i|
data[k] = result.rows.map { |r| r[i] }
end
end
if data.nil?
- self._df = hash_to_rbdf({})
+ self._df = hash_to_rbdf({}, columns: columns)
elsif data.is_a?(Hash)
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
- self._df = hash_to_rbdf(data)
+ self._df = hash_to_rbdf(data, columns: columns)
elsif data.is_a?(Array)
- self._df = sequence_to_rbdf(data)
+ self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
elsif data.is_a?(Series)
- self._df = series_to_rbdf(data)
+ self._df = series_to_rbdf(data, columns: columns)
else
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
end
end
+ # @private
def self._from_rbdf(rb_df)
df = DataFrame.allocate
df._df = rb_df
df
end
- def self._read_csv(file, has_header: true)
+ # def self._from_hashes
+ # end
+
+ # def self._from_hash
+ # end
+
+ # def self._from_records
+ # end
+
+ # def self._from_numo
+ # end
+
+ # no self._from_arrow
+
+ # no self._from_pandas
+
+ # @private
+ def self._read_csv(
+ file,
+ has_header: true,
+ columns: nil,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ parse_dates: false,
+ n_threads: nil,
+ infer_schema_length: 100,
+ batch_size: 8192,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ sample_size: 1024,
+ eol_char: "\n"
+ )
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
- file = Utils.format_path(file)
+ path = Utils.format_path(file)
+ else
+ path = nil
+ # if defined?(StringIO) && file.is_a?(StringIO)
+ # file = file.string
+ # end
end
- _from_rbdf(RbDataFrame.read_csv(file, has_header))
+ dtype_list = nil
+ dtype_slice = nil
+ if !dtypes.nil?
+ if dtypes.is_a?(Hash)
+ dtype_list = []
+ dtypes.each do |k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
+ elsif dtypes.is_a?(Array)
+ dtype_slice = dtypes
+ else
+ raise ArgumentError, "dtypes arg should be a Hash or Array"
+ end
+ end
+
+ processed_null_values = Utils._process_null_values(null_values)
+
+ if columns.is_a?(String)
+ columns = [columns]
+ end
+ if file.is_a?(String) && file.include?("*")
+ raise Todo
+ end
+
+ projection, columns = Utils.handle_projection_columns(columns)
+
+ _from_rbdf(
+ RbDataFrame.read_csv(
+ file,
+ infer_schema_length,
+ batch_size,
+ has_header,
+ ignore_errors,
+ n_rows,
+ skip_rows,
+ projection,
+ sep,
+ rechunk,
+ columns,
+ encoding,
+ n_threads,
+ path,
+ dtype_list,
+ dtype_slice,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ parse_dates,
+ skip_rows_after_header,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ sample_size,
+ eol_char
+ )
+ )
end
+ # @private
def self._read_parquet(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
_from_rbdf(RbDataFrame.read_parquet(file))
end
+ # def self._read_avro
+ # end
+
+ # @private
+ def self._read_ipc(
+ file,
+ columns: nil,
+ n_rows: nil,
+ row_count_name: nil,
+ row_count_offset: 0,
+ rechunk: true,
+ memory_map: true
+ )
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+ if columns.is_a?(String)
+ columns = [columns]
+ end
+
+ if file.is_a?(String) && file.include?("*")
+ raise Todo
+ end
+
+ projection, columns = Utils.handle_projection_columns(columns)
+ _from_rbdf(
+ RbDataFrame.read_ipc(
+ file,
+ columns,
+ projection,
+ n_rows,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ memory_map
+ )
+ )
+ end
+
+ # @private
def self._read_json(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
_from_rbdf(RbDataFrame.read_json(file))
end
+ # @private
def self._read_ndjson(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
_from_rbdf(RbDataFrame.read_ndjson(file))
end
+ # Get the shape of the DataFrame.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+ # df.shape
+ # # => [5, 1]
def shape
_df.shape
end
+ # Get the height of the DataFrame.
+ #
+ # @return [Integer]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+ # df.height
+ # # => 5
def height
_df.height
end
+ # Get the width of the DataFrame.
+ #
+ # @return [Integer]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+ # df.width
+ # # => 1
def width
_df.width
end
+ # Get column names.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.columns
+ # # => ["foo", "bar", "ham"]
def columns
_df.columns
end
+ # Change the column names of the DataFrame.
+ #
+ # @param columns [Array]
+ # A list with new names for the DataFrame.
+ # The length of the list should be equal to the width of the DataFrame.
+ #
+ # @return [Object]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.columns = ["apple", "banana", "orange"]
+ # df
+ # # =>
+ # # shape: (3, 3)
+ # # ┌───────┬────────┬────────┐
+ # # │ apple ┆ banana ┆ orange │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═══════╪════════╪════════╡
+ # # │ 1 ┆ 6 ┆ a │
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 2 ┆ 7 ┆ b │
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+ # # │ 3 ┆ 8 ┆ c │
+ # # └───────┴────────┴────────┘
def columns=(columns)
_df.set_column_names(columns)
end
+ # Get the dtypes of the columns of this DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
+ #
+ # @return [Array]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.dtypes
+ # # => [:i64, :f64, :str]
def dtypes
- _df.dtypes.map(&:to_sym)
+ _df.dtypes
end
+ # Get the schema.
+ #
+ # @return [Hash]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6.0, 7.0, 8.0],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.schema
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
def schema
columns.zip(dtypes).to_h
end
# def ==(other)
@@ -122,49 +366,97 @@
# end
# def %(other)
# end
+ #
def to_s
_df.to_s
end
alias_method :inspect, :to_s
def include?(name)
columns.include?(name)
end
+ # def each
+ # end
+
+ # def _pos_idx
+ # end
+
+ # def _pos_idxs
+ # end
+
+ #
def [](name)
Utils.wrap_s(_df.column(name))
end
# def []=(key, value)
# end
+ # no to_arrow
+
+ #
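+ # Convert DataFrame to a Hash mapping column name to values.
+ #
+ # @param as_series [Boolean]
+ #   If `true` (default), values are kept as Series; if `false`, they are
+ #   converted to plain Arrays.
+ #
+ # @return [Hash]
+ #
+ # @example A minimal sketch:
+ #   df = Polars::DataFrame.new({"a" => [1, 2], "b" => ["x", "y"]})
+ #   df.to_h(as_series: false)
+ #   # => {"a"=>[1, 2], "b"=>["x", "y"]}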
def to_h(as_series: true)
if as_series
get_columns.to_h { |s| [s.name, s] }
else
get_columns.to_h { |s| [s.name, s.to_a] }
end
end
- # def to_hs / to_a
+ # def to_hashes / to_a
# end
# def to_numo
# end
# no to_pandas
+ # Select column as Series at index location.
+ #
+ # @param index [Integer]
+ # Location of selection.
+ #
+ # @return [Series]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.to_series(1)
+ # # =>
+ # # shape: (3,)
+ # # Series: 'bar' [i64]
+ # # [
+ # # 6
+ # # 7
+ # # 8
+ # # ]
def to_series(index = 0)
if index < 0
index = columns.length + index
end
Utils.wrap_s(_df.select_at_idx(index))
end
+ # Serialize to JSON representation.
+ #
+ # @param file [String]
+ #   File path to which the result should be written.
+ # @param pretty [Boolean]
+ #   Pretty serialize JSON.
+ # @param row_oriented [Boolean]
+ #   Write to row-oriented JSON. This is slower, but more common.
+ #
+ # @return [nil]
+ #
+ # @see #write_ndjson
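+ #
+ # @example Writing to a hypothetical path (illustrative):
+ #   df = Polars::DataFrame.new({"foo" => [1, 2, 3]})
+ #   df.write_json("file.json")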
def write_json(
file,
pretty: false,
row_oriented: false
)
@@ -174,19 +466,67 @@
_df.write_json(file, pretty, row_oriented)
nil
end
+ # Serialize to newline delimited JSON representation.
+ #
+ # @param file [String]
+ # File path to which the result should be written.
+ #
+ # @return [nil]
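+ #
+ # @example Writing to a hypothetical path (illustrative):
+ #   df = Polars::DataFrame.new({"foo" => [1, 2, 3]})
+ #   df.write_ndjson("file.ndjson")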
def write_ndjson(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
_df.write_ndjson(file)
nil
end
+ # Write to comma-separated values (CSV) file.
+ #
+ # @param file [String, nil]
+ # File path to which the result should be written. If set to `nil`
+ # (default), the output is returned as a string instead.
+ # @param has_header [Boolean]
+ #   Whether to include a header in the CSV output.
+ # @param sep [String]
+ #   Separate CSV fields with this character.
+ # @param quote [String]
+ #   Character to use for quoting.
+ # @param batch_size [Integer]
+ # Number of rows that will be processed per thread.
+ # @param datetime_format [String, nil]
+ #   A format string, with the specifiers defined by the
+ #   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+ #   Rust crate. If no format is specified, the default fractional-second
+ #   precision is inferred from the maximum time unit found in the frame's
+ #   Datetime columns (if any).
+ # @param date_format [String, nil]
+ # A format string, with the specifiers defined by the
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+ # Rust crate.
+ # @param time_format [String, nil]
+ # A format string, with the specifiers defined by the
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+ # Rust crate.
+ # @param float_precision [Integer, nil]
+ # Number of decimal places to write, applied to both `:f32` and
+ # `:f64` datatypes.
+ # @param null_value [String, nil]
+ # A string representing null values (defaulting to the empty string).
+ #
+ # @return [String, nil]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3, 4, 5],
+ # "bar" => [6, 7, 8, 9, 10],
+ # "ham" => ["a", "b", "c", "d", "e"]
+ # })
+ # df.write_csv("file.csv")
def write_csv(
file = nil,
has_header: true,
sep: ",",
quote: '"',
@@ -218,12 +558,11 @@
date_format,
time_format,
float_precision,
null_value
)
- buffer.rewind
- return buffer.read.force_encoding(Encoding::UTF_8)
+ return buffer.string.force_encoding(Encoding::UTF_8)
end
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
@@ -244,13 +583,54 @@
end
# def write_avro
# end
- # def write_ipc
- # end
+ # Write to Arrow IPC binary stream or Feather file.
+ #
+ # @param file [String]
+ # File path to which the file should be written.
+ # @param compression ["uncompressed", "lz4", "zstd"]
+ # Compression method. Defaults to "uncompressed".
+ #
+ # @return [nil]
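+ #
+ # @example Writing a Feather/IPC file to a hypothetical path (illustrative):
+ #   df = Polars::DataFrame.new({"foo" => [1, 2, 3]})
+ #   df.write_ipc("file.arrow", compression: "zstd")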
+ def write_ipc(file, compression: "uncompressed")
+ if compression.nil?
+ compression = "uncompressed"
+ end
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+ _df.write_ipc(file, compression)
+ end
+
+ # Write to Apache Parquet file.
+ #
+ # @param file [String]
+ # File path to which the file should be written.
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+ # Choose "zstd" for good compression performance.
+ # Choose "lz4" for fast compression/decompression.
+ # Choose "snappy" for more backwards compatibility guarantees
+ # when you deal with older parquet readers.
+ # @param compression_level [Integer, nil]
+ # The level of compression to use. Higher compression means smaller files on
+ # disk.
+ #
+ # - "gzip" : min-level: 0, max-level: 10.
+ # - "brotli" : min-level: 0, max-level: 11.
+ # - "zstd" : min-level: 1, max-level: 22.
+ # @param statistics [Boolean]
+ # Write statistics to the parquet headers. This requires extra compute.
+ # @param row_group_size [Integer, nil]
+ # Size of the row groups in number of rows.
+ # If `nil` (default), the chunks of the DataFrame are
+ # used. Writing in smaller chunks may reduce memory pressure and improve
+ # writing speeds.
+ #
+ # @return [nil]
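+ #
+ # @example Writing with the default "zstd" compression (path is illustrative):
+ #   df = Polars::DataFrame.new({"foo" => [1, 2, 3]})
+ #   df.write_parquet("file.parquet")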
def write_parquet(
file,
compression: "zstd",
compression_level: nil,
statistics: false,
@@ -266,26 +646,159 @@
_df.write_parquet(
file, compression, compression_level, statistics, row_group_size
)
end
+ # Return an estimation of the total (heap) allocated size of the DataFrame.
+ #
+ # Estimated size is given in the specified unit (bytes by default).
+ #
+ # This estimation is the sum of the sizes of its buffers and validity bitmaps,
+ # including nested arrays. Multiple arrays may share buffers and bitmaps, so the
+ # size of two arrays is not necessarily the sum of the sizes computed from this
+ # function. In particular, a StructArray's size is an upper bound.
+ #
+ # When an array is sliced, its allocated size remains constant because the buffer
+ # is unchanged. However, this function will yield a smaller number, because it
+ # returns the visible size of the buffer, not its total capacity.
+ #
+ # FFI buffers are included in this estimation.
+ #
+ # @param unit ["b", "kb", "mb", "gb", "tb"]
+ # Scale the returned size to the given unit.
+ #
+ # @return [Numeric]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "x" => 1_000_000.times.to_a.reverse,
+ # "y" => 1_000_000.times.map { |v| v / 1000.0 },
+ # "z" => 1_000_000.times.map(&:to_s)
+ # },
+ # columns: {"x" => :u32, "y" => :f64, "z" => :str}
+ # )
+ # df.estimated_size
+ # # => 25888898
+ # df.estimated_size("mb")
+ # # => 24.689577102661133
def estimated_size(unit = "b")
sz = _df.estimated_size
Utils.scale_bytes(sz, to: unit)
end
# def transpose
# end
+ # Reverse the DataFrame.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "key" => ["a", "b", "c"],
+ # "val" => [1, 2, 3]
+ # })
+ #   df.reverse
+ # # =>
+ # # shape: (3, 2)
+ # # ┌─────┬─────┐
+ # # │ key ┆ val │
+ # # │ --- ┆ --- │
+ # # │ str ┆ i64 │
+ # # ╞═════╪═════╡
+ # # │ c ┆ 3 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ b ┆ 2 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ a ┆ 1 │
+ # # └─────┴─────┘
def reverse
select(Polars.col("*").reverse)
end
+ # Rename column names.
+ #
+ # @param mapping [Hash]
+ # Key value pairs that map from old name to new name.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "foo" => [1, 2, 3],
+ # "bar" => [6, 7, 8],
+ # "ham" => ["a", "b", "c"]
+ # })
+ # df.rename({"foo" => "apple"})
+ # # =>
+ # # shape: (3, 3)
+ # # ┌───────┬─────┬─────┐
+ # # │ apple ┆ bar ┆ ham │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ str │
+ # # ╞═══════╪═════╪═════╡
+ # # │ 1 ┆ 6 ┆ a │
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 7 ┆ b │
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 8 ┆ c │
+ # # └───────┴─────┴─────┘
def rename(mapping)
lazy.rename(mapping).collect(no_optimization: true)
end
+ # Insert a Series at a certain column index. This operation is in place.
+ #
+ # @param index [Integer]
+ #   Index at which to insert the new `Series` column.
+ # @param series [Series]
+ # `Series` to insert.
+ #
+ # @return [DataFrame]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+ # s = Polars::Series.new("baz", [97, 98, 99])
+ # df.insert_at_idx(1, s)
+ # # =>
+ # # shape: (3, 3)
+ # # ┌─────┬─────┬─────┐
+ # # │ foo ┆ baz ┆ bar │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ i64 ┆ i64 │
+ # # ╞═════╪═════╪═════╡
+ # # │ 1 ┆ 97 ┆ 4 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 2 ┆ 98 ┆ 5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+ # # │ 3 ┆ 99 ┆ 6 │
+ # # └─────┴─────┴─────┘
+ #
+ # @example
+ # df = Polars::DataFrame.new({
+ # "a" => [1, 2, 3, 4],
+ # "b" => [0.5, 4, 10, 13],
+ # "c" => [true, true, false, true]
+ # })
+ # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
+ # df.insert_at_idx(3, s)
+ # # =>
+ # # shape: (4, 4)
+ # # ┌─────┬──────┬───────┬──────┐
+ # # │ a ┆ b ┆ c ┆ d │
+ # # │ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
+ # # ╞═════╪══════╪═══════╪══════╡
+ # # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+ # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
+ # # └─────┴──────┴───────┴──────┘
def insert_at_idx(index, series)
if index < 0
index = columns.length + index
end
_df.insert_at_idx(index, series._s)
@@ -303,10 +816,11 @@
# end
# def replace_at_idx
# end
+ #
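+ # Sort the DataFrame by a column.
+ #
+ # @param by [String]
+ #   Column to sort by.
+ # @param reverse [Boolean]
+ #   Sort in descending order.
+ # @param nulls_last [Boolean]
+ #   Place null values last.
+ #
+ # @return [DataFrame]
+ #
+ # @example A minimal sketch:
+ #   df = Polars::DataFrame.new({"foo" => [3, 1, 2]})
+ #   df.sort("foo")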
def sort(by, reverse: false, nulls_last: false)
_from_rbdf(_df.sort(by, reverse, nulls_last))
end
def frame_equal(other, null_equal: true)
@@ -314,10 +828,11 @@
end
# def replace
# end
+ #
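+ # Get a slice of this DataFrame.
+ #
+ # @param offset [Integer]
+ #   Start index of the slice.
+ # @param length [Integer, nil]
+ #   Length of the slice. A negative length stops that many rows short of the
+ #   end; if `nil`, the slice extends to the end of the frame.
+ #
+ # @return [DataFrame]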
def slice(offset, length = nil)
if !length.nil? && length < 0
length = height - offset + length
end
_from_rbdf(_df.slice(offset, length))
@@ -342,10 +857,11 @@
# end
# def with_row_count
# end
+ #
def groupby(by, maintain_order: false)
lazy.groupby(by, maintain_order: maintain_order)
end
# def groupby_rolling
@@ -358,10 +874,11 @@
# end
# def join_asof
# end
+ #
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
lazy
.join(
other.lazy,
left_on: left_on,
@@ -374,10 +891,11 @@
end
# def apply
# end
+ #
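+ # Return a new DataFrame with the column added or replaced.
+ #
+ # @param column [Object]
+ #   Series or expression of the column to add.
+ #
+ # @return [DataFrame]
+ #
+ # @example A minimal sketch (assumes `Expr#alias` and expression arithmetic are available):
+ #   df = Polars::DataFrame.new({"a" => [1, 2, 3]})
+ #   df.with_column((Polars.col("a") * 2).alias("a2"))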
def with_column(column)
lazy
.with_column(column)
.collect(no_optimization: true, string_cache: false)
end
@@ -386,12 +904,15 @@
# end
# def vstack
# end
- # def extend
- # end
+ #
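+ # Extend the memory backed by this DataFrame with the values from `other`,
+ # appending rows in place.
+ #
+ # @param other [DataFrame]
+ #   DataFrame to vertically add.
+ #
+ # @return [DataFrame]
+ #
+ # @example A minimal sketch (resulting shape is indicative):
+ #   df1 = Polars::DataFrame.new({"foo" => [1, 2]})
+ #   df2 = Polars::DataFrame.new({"foo" => [3, 4]})
+ #   df1.extend(df2)
+ #   df1.shape
+ #   # => [4, 1]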
+ def extend(other)
+ _df.extend(other._df)
+ self
+ end
# def drop
# end
# def drop_in_place
@@ -400,10 +921,11 @@
# def cleared
# end
# clone handled by initialize_copy
+ #
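+ # Get the DataFrame as an Array of Series.
+ #
+ # @return [Array]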
def get_columns
_df.get_columns.map { |s| Utils.wrap_s(s) }
end
def get_column(name)
@@ -411,10 +933,11 @@
end
# def fill_null
# end
+ #
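+ # Fill floating point NaN values by a fill value.
+ #
+ # @param fill_value [Object]
+ #   Value with which to replace NaN values.
+ #
+ # @return [DataFrame]
+ #
+ # @example A minimal sketch (assumes NaN literals are accepted at construction):
+ #   df = Polars::DataFrame.new({"a" => [1.5, Float::NAN, 2.5]})
+ #   df.fill_nan(0)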
def fill_nan(fill_value)
lazy.fill_nan(fill_value).collect(no_optimization: true)
end
# def explode
@@ -436,10 +959,11 @@
# end
# def shift_and_fill
# end
+ #
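+ # Get a mask of all duplicated rows in this DataFrame.
+ #
+ # @return [Series]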
def is_duplicated
Utils.wrap_s(_df.is_duplicated)
end
def is_unique
@@ -545,10 +1069,11 @@
# end
# def n_unique
# end
+ #
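+ # Rechunk the data in this DataFrame to a contiguous allocation.
+ #
+ # This will make sure all subsequent operations have optimal and predictable
+ # performance.
+ #
+ # @return [DataFrame]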
def rechunk
_from_rbdf(_df.rechunk)
end
def null_count
@@ -577,10 +1102,11 @@
# end
# def interpolate
# end
+ #
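+ # Check if the DataFrame is empty.
+ #
+ # @return [Boolean]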
def is_empty
height == 0
end
alias_method :empty?, :is_empty
@@ -595,18 +1121,58 @@
def initialize_copy(other)
super
self._df = _df._clone
end
- def hash_to_rbdf(data)
+ def hash_to_rbdf(data, columns: nil)
+ if !columns.nil?
+ columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
+
+ if data.empty? && dtypes
+ data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
+ else
+ data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
+ end
+ data_series = _handle_columns_arg(data_series, columns: columns)
+ return RbDataFrame.new(data_series)
+ end
+
RbDataFrame.read_hash(data)
end
- def sequence_to_rbdf(data)
+ def _unpack_columns(columns, lookup_names: nil)
+ # `columns` may be an Array of names or a Hash mapping name => dtype
+ if columns.is_a?(Hash)
+ [columns.keys, columns]
+ else
+ [columns, {}]
+ end
+ end
+
+ def _handle_columns_arg(data, columns: nil)
+ if columns.nil?
+ data
+ else
+ if data.empty?
+ columns.map { |c| Series.new(c, nil)._s }
+ elsif data.length == columns.length
+ columns.each_with_index do |c, i|
+ # not in-place?
+ data[i].rename(c)
+ end
+ data
+ else
+ raise ArgumentError, "Dimensions of columns arg must match data dimensions."
+ end
+ end
+ end
+
+ def sequence_to_rbdf(data, columns: nil, orient: nil)
+ if columns || orient
+ raise Todo
+ end
RbDataFrame.new(data.map(&:_s))
end
- def series_to_rbdf(data)
+ def series_to_rbdf(data, columns: nil)
+ if columns
+ raise Todo
+ end
RbDataFrame.new([data._s])
end
def wrap_ldf(ldf)
LazyFrame._from_rbldf(ldf)