lib/polars/data_frame.rb in polars-df-0.1.0 vs lib/polars/data_frame.rb in polars-df-0.1.1
- old
+ new
@@ -77,14 +77,55 @@
# Get the column names of the DataFrame (delegates to the Rust frame).
def columns
_df.columns
end
+ # Change the column names of the DataFrame in place.
+ #
+ # @param columns [Array] new names, one entry per existing column.
+ def columns=(columns)
+ _df.set_column_names(columns)
+ end
+
# Get the data type of each column, converted to Ruby symbols.
def dtypes
_df.dtypes.map(&:to_sym)
end
+ # Get the schema as a Hash of column name => dtype symbol.
+ def schema
+ columns.zip(dtypes).to_h
+ end
+
+ # def ==(other)
+ # end
+
+ # def !=(other)
+ # end
+
+ # def >(other)
+ # end
+
+ # def <(other)
+ # end
+
+ # def >=(other)
+ # end
+
+ # def <=(other)
+ # end
+
+ # def *(other)
+ # end
+
+ # def /(other)
+ # end
+
+ # def +(other)
+ # end
+
+ # def -(other)
+ # end
+
+ # def %(other)
+ # end
+
# Render the DataFrame as a string (delegates to the Rust frame).
def to_s
_df.to_s
end
alias_method :inspect, :to_s
@@ -94,10 +135,29 @@
# Get the column with the given name as a Series.
def [](name)
Utils.wrap_s(_df.column(name))
end
+ # def []=(key, value)
+ # end
+
+ # Convert the DataFrame to a Hash keyed by column name.
+ #
+ # @param as_series [Boolean] when true values are Series objects,
+ #   when false they are plain Arrays of the column's values.
+ def to_h(as_series: true)
+ if as_series
+ get_columns.to_h { |s| [s.name, s] }
+ else
+ get_columns.to_h { |s| [s.name, s.to_a] }
+ end
+ end
+
+ # def to_hs / to_a
+ # end
+
+ # def to_numo
+ # end
+
+ # no to_pandas
+
def to_series(index = 0)
if index < 0
index = columns.length + index
end
Utils.wrap_s(_df.select_at_idx(index))
@@ -181,10 +241,16 @@
null_value,
)
nil
end
+ # def write_avro
+ # end
+
+ # def write_ipc
+ # end
+
def write_parquet(
file,
compression: "zstd",
compression_level: nil,
statistics: false,
@@ -200,22 +266,65 @@
_df.write_parquet(
file, compression, compression_level, statistics, row_group_size
)
end
+ # Estimated size of the DataFrame; unit conversion is delegated
+ # to Utils.scale_bytes (default "b" = bytes).
+ def estimated_size(unit = "b")
+ sz = _df.estimated_size
+ Utils.scale_bytes(sz, to: unit)
+ end
+
+ # def transpose
+ # end
+
+ # Reverse the row order of the DataFrame by reversing every column.
+ def reverse
+ select(Polars.col("*").reverse)
+ end
+
+ # Rename columns; mapping is old_name => new_name. Runs through the
+ # lazy API with optimizations disabled so the rename happens as-is.
+ def rename(mapping)
+ lazy.rename(mapping).collect(no_optimization: true)
+ end
+
+ # Insert a Series at the given column position (negative index
+ # counts from the end). Mutates self and returns it.
+ def insert_at_idx(index, series)
+ if index < 0
+ index = columns.length + index
+ end
+ _df.insert_at_idx(index, series._s)
+ self
+ end
+
# Filter rows by a predicate expression (via the lazy API).
def filter(predicate)
lazy.filter(predicate).collect
end
+ # def describe
+ # end
+
+ # def find_idx_by_name
+ # end
+
+ # def replace_at_idx
+ # end
+
# Sort by the given column(s); reverse and nulls_last are passed
# straight through to the engine.
def sort(by, reverse: false, nulls_last: false)
_from_rbdf(_df.sort(by, reverse, nulls_last))
end
# Check whether this DataFrame equals another, optionally treating
# null values as equal to each other.
def frame_equal(other, null_equal: true)
_df.frame_equal(other._df, null_equal)
end
+ # def replace
+ # end
+
+ # Slice rows starting at offset. A nil length takes everything from
+ # offset on; a negative length is converted to height - offset + length
+ # (i.e. stop that many rows before the end).
+ def slice(offset, length = nil)
+ if !length.nil? && length < 0
+ length = height - offset + length
+ end
+ _from_rbdf(_df.slice(offset, length))
+ end
+
# Get the first n rows (shortcut for head(n)).
def limit(n = 5)
head(n)
end
def head(n = 5)
@@ -224,14 +333,35 @@
# Get the last n rows as a new DataFrame.
def tail(n = 5)
_from_rbdf(_df.tail(n))
end
+ # def drop_nulls
+ # end
+
+ # def pipe
+ # end
+
+ # def with_row_count
+ # end
+
# Start a groupby operation; delegates to the lazy API, so the
# result is the lazy groupby object, not a DataFrame.
def groupby(by, maintain_order: false)
lazy.groupby(by, maintain_order: maintain_order)
end
+ # def groupby_rolling
+ # end
+
+ # def groupby_dynamic
+ # end
+
+ # def upsample
+ # end
+
+ # def join_asof
+ # end
+
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
lazy
.join(
other.lazy,
left_on: left_on,
@@ -241,16 +371,83 @@
suffix: suffix,
)
.collect(no_optimization: true)
end
+ # def apply
+ # end
+
# Return a new DataFrame with the given column added or replaced
# (via the lazy API, optimizations and string cache disabled).
def with_column(column)
lazy
.with_column(column)
.collect(no_optimization: true, string_cache: false)
end
+ # def hstack
+ # end
+
+ # def vstack
+ # end
+
+ # def extend
+ # end
+
+ # def drop
+ # end
+
+ # def drop_in_place
+ # end
+
+ # def cleared
+ # end
+
+ # clone handled by initialize_copy
+
+ # Get all columns as an Array of wrapped Series.
+ def get_columns
+ _df.get_columns.map { |s| Utils.wrap_s(s) }
+ end
+
+ # Get a single column by name (same behavior as #[]).
+ def get_column(name)
+ self[name]
+ end
+
+ # def fill_null
+ # end
+
+ # Fill floating-point NaN values with the given fill value
+ # (via the lazy API; NaN is distinct from null).
+ def fill_nan(fill_value)
+ lazy.fill_nan(fill_value).collect(no_optimization: true)
+ end
+
+ # def explode
+ # end
+
+ # def pivot
+ # end
+
+ # def melt
+ # end
+
+ # def unstack
+ # end
+
+ # def partition_by
+ # end
+
+ # def shift
+ # end
+
+ # def shift_and_fill
+ # end
+
+ # Boolean Series marking which rows are duplicated.
+ def is_duplicated
+ Utils.wrap_s(_df.is_duplicated)
+ end
+
+ # Boolean Series marking which rows are unique.
+ def is_unique
+ Utils.wrap_s(_df.is_unique)
+ end
+
# Start a lazy query from this DataFrame.
def lazy
wrap_ldf(_df.lazy)
end
def select(exprs)
@@ -260,10 +457,60 @@
.collect(no_optimization: true, string_cache: false)
._df
)
end
+ # Return a new DataFrame with the given columns added or replaced.
+ # A single (non-Array, non-nil) expression is wrapped in an Array first.
+ def with_columns(exprs)
+ if !exprs.nil? && !exprs.is_a?(Array)
+ exprs = [exprs]
+ end
+ lazy
+ .with_columns(exprs)
+ .collect(no_optimization: true, string_cache: false)
+ end
+
+ # Get the number of chunks backing this DataFrame.
+ #
+ # @param strategy ["first", "all"] "first" returns the chunk count of the
+ #   first column only; "all" returns an Array with one count per column.
+ # @raise [ArgumentError] if strategy is neither "first" nor "all".
+ def n_chunks(strategy: "first")
+ if strategy == "first"
+ _df.n_chunks
+ elsif strategy == "all"
+ get_columns.map(&:n_chunks)
+ else
+ raise ArgumentError, "Strategy: '#{strategy}' not understood. Choose one of {'first', 'all'}"
+ end
+ end
+
+ # Aggregate the columns to their maximum value.
+ #
+ # @param axis [Integer] 0 = per column (returns a DataFrame),
+ #   1 = per row (returns a Series).
+ # @raise [ArgumentError] if axis is not 0 or 1.
+ def max(axis: 0)
+ if axis == 0
+ _from_rbdf(_df.max)
+ elsif axis == 1
+ Utils.wrap_s(_df.hmax)
+ else
+ raise ArgumentError, "Axis should be 0 or 1."
+ end
+ end
+
+ # Aggregate the columns to their minimum value.
+ #
+ # @param axis [Integer] 0 = per column (returns a DataFrame),
+ #   1 = per row (returns a Series).
+ # @raise [ArgumentError] if axis is not 0 or 1.
+ def min(axis: 0)
+ if axis == 0
+ _from_rbdf(_df.min)
+ elsif axis == 1
+ Utils.wrap_s(_df.hmin)
+ else
+ raise ArgumentError, "Axis should be 0 or 1."
+ end
+ end
+
+ # Aggregate the columns to their sum.
+ #
+ # @param axis [Integer] 0 = per column (returns a DataFrame),
+ #   1 = per row (returns a Series).
+ # @param null_strategy [String] passed to the engine for the row-wise
+ #   sum; controls null handling (default "ignore") — see engine docs.
+ # @raise [ArgumentError] if axis is not 0 or 1.
+ def sum(axis: 0, null_strategy: "ignore")
+ case axis
+ when 0
+ _from_rbdf(_df.sum)
+ when 1
+ Utils.wrap_s(_df.hsum(null_strategy))
+ else
+ raise ArgumentError, "Axis should be 0 or 1."
+ end
+ end
+
def mean(axis: 0, null_strategy: "ignore")
case axis
when 0
_from_rbdf(_df.mean)
when 1
@@ -271,27 +518,85 @@
else
raise ArgumentError, "Axis should be 0 or 1."
end
end
- def with_columns(exprs)
- if !exprs.nil? && !exprs.is_a?(Array)
- exprs = [exprs]
- end
- lazy
- .with_columns(exprs)
- .collect(no_optimization: true, string_cache: false)
+ def std(ddof: 1)
+ _from_rbdf(_df.std(ddof))
end
+ # Aggregate the columns to their variance, with ddof delta
+ # degrees of freedom (default 1 = sample variance).
+ def var(ddof: 1)
+ _from_rbdf(_df.var(ddof))
+ end
+
+ # Aggregate the columns to their median value.
+ def median
+ _from_rbdf(_df.median)
+ end
+
+ # def product
+ # end
+
+ # def quantile(quantile, interpolation: "nearest")
+ # end
+
+ # def to_dummies
+ # end
+
+ # def unique
+ # end
+
+ # def n_unique
+ # end
+
# Consolidate the underlying chunks of this DataFrame
# (delegates to the engine; returns a new DataFrame).
def rechunk
_from_rbdf(_df.rechunk)
end
# Null counts as a DataFrame (delegates to the engine).
def null_count
_from_rbdf(_df.null_count)
end
+ # def sample
+ # end
+
+ # def fold
+ # end
+
+ # def row
+ # end
+
+ # def rows
+ # end
+
+ # def shrink_to_fit
+ # end
+
+ # def take_every
+ # end
+
+ # def hash_rows
+ # end
+
+ # def interpolate
+ # end
+
+ # True when the DataFrame has no rows.
+ def is_empty
+ height == 0
+ end
+ alias_method :empty?, :is_empty
+
+ # def to_struct(name)
+ # end
+
+ # def unnest
+ # end
+
private
+
+ # Hook for dup/clone: give the copy its own underlying Rust frame
+ # so mutations on the copy don't affect the original.
+ def initialize_copy(other)
+ super
+ self._df = _df._clone
+ end
# Build the backing RbDataFrame from a Hash of column data.
def hash_to_rbdf(data)
RbDataFrame.read_hash(data)
end