lib/polars/data_frame.rb in polars-df-0.1.0 vs lib/polars/data_frame.rb in polars-df-0.1.1

- old
+ new

@@ -77,14 +77,55 @@ def columns _df.columns end + def columns=(columns) + _df.set_column_names(columns) + end + def dtypes _df.dtypes.map(&:to_sym) end + def schema + columns.zip(dtypes).to_h + end + + # def ==(other) + # end + + # def !=(other) + # end + + # def >(other) + # end + + # def <(other) + # end + + # def >=(other) + # end + + # def <=(other) + # end + + # def *(other) + # end + + # def /(other) + # end + + # def +(other) + # end + + # def -(other) + # end + + # def %(other) + # end + def to_s _df.to_s end alias_method :inspect, :to_s @@ -94,10 +135,29 @@ def [](name) Utils.wrap_s(_df.column(name)) end + # def []=(key, value) + # end + + def to_h(as_series: true) + if as_series + get_columns.to_h { |s| [s.name, s] } + else + get_columns.to_h { |s| [s.name, s.to_a] } + end + end + + # def to_hs / to_a + # end + + # def to_numo + # end + + # no to_pandas + def to_series(index = 0) if index < 0 index = columns.length + index end Utils.wrap_s(_df.select_at_idx(index)) @@ -181,10 +241,16 @@ null_value, ) nil end + # def write_avro + # end + + # def write_ipc + # end + def write_parquet( file, compression: "zstd", compression_level: nil, statistics: false, @@ -200,22 +266,65 @@ _df.write_parquet( file, compression, compression_level, statistics, row_group_size ) end + def estimated_size(unit = "b") + sz = _df.estimated_size + Utils.scale_bytes(sz, to: unit) + end + + # def transpose + # end + + def reverse + select(Polars.col("*").reverse) + end + + def rename(mapping) + lazy.rename(mapping).collect(no_optimization: true) + end + + def insert_at_idx(index, series) + if index < 0 + index = columns.length + index + end + _df.insert_at_idx(index, series._s) + self + end + def filter(predicate) lazy.filter(predicate).collect end + # def describe + # end + + # def find_idx_by_name + # end + + # def replace_at_idx + # end + def sort(by, reverse: false, nulls_last: false) _from_rbdf(_df.sort(by, reverse, nulls_last)) end def frame_equal(other, null_equal: true) 
_df.frame_equal(other._df, null_equal) end + # def replace + # end + + def slice(offset, length = nil) + if !length.nil? && length < 0 + length = height - offset + length + end + _from_rbdf(_df.slice(offset, length)) + end + def limit(n = 5) head(n) end def head(n = 5) @@ -224,14 +333,35 @@ def tail(n = 5) _from_rbdf(_df.tail(n)) end + # def drop_nulls + # end + + # def pipe + # end + + # def with_row_count + # end + def groupby(by, maintain_order: false) lazy.groupby(by, maintain_order: maintain_order) end + # def groupby_rolling + # end + + # def groupby_dynamic + # end + + # def upsample + # end + + # def join_asof + # end + def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right") lazy .join( other.lazy, left_on: left_on, @@ -241,16 +371,83 @@ suffix: suffix, ) .collect(no_optimization: true) end + # def apply + # end + def with_column(column) lazy .with_column(column) .collect(no_optimization: true, string_cache: false) end + # def hstack + # end + + # def vstack + # end + + # def extend + # end + + # def drop + # end + + # def drop_in_place + # end + + # def cleared + # end + + # clone handled by initialize_copy + + def get_columns + _df.get_columns.map { |s| Utils.wrap_s(s) } + end + + def get_column(name) + self[name] + end + + # def fill_null + # end + + def fill_nan(fill_value) + lazy.fill_nan(fill_value).collect(no_optimization: true) + end + + # def explode + # end + + # def pivot + # end + + # def melt + # end + + # def unstack + # end + + # def partition_by + # end + + # def shift + # end + + # def shift_and_fill + # end + + def is_duplicated + Utils.wrap_s(_df.is_duplicated) + end + + def is_unique + Utils.wrap_s(_df.is_unique) + end + def lazy wrap_ldf(_df.lazy) end def select(exprs) @@ -260,10 +457,60 @@ .collect(no_optimization: true, string_cache: false) ._df ) end + def with_columns(exprs) + if !exprs.nil? 
&& !exprs.is_a?(Array) + exprs = [exprs] + end + lazy + .with_columns(exprs) + .collect(no_optimization: true, string_cache: false) + end + + def n_chunks(strategy: "first") + if strategy == "first" + _df.n_chunks + elsif strategy == "all" + get_columns.map(&:n_chunks) + else + raise ArgumentError, "Strategy: '#{strategy}' not understood. Choose one of {'first', 'all'}" + end + end + + def max(axis: 0) + if axis == 0 + _from_rbdf(_df.max) + elsif axis == 1 + Utils.wrap_s(_df.hmax) + else + raise ArgumentError, "Axis should be 0 or 1." + end + end + + def min(axis: 0) + if axis == 0 + _from_rbdf(_df.min) + elsif axis == 1 + Utils.wrap_s(_df.hmin) + else + raise ArgumentError, "Axis should be 0 or 1." + end + end + + def sum(axis: 0, null_strategy: "ignore") + case axis + when 0 + _from_rbdf(_df.sum) + when 1 + Utils.wrap_s(_df.hsum(null_strategy)) + else + raise ArgumentError, "Axis should be 0 or 1." + end + end + def mean(axis: 0, null_strategy: "ignore") case axis when 0 _from_rbdf(_df.mean) when 1 @@ -271,27 +518,85 @@ else raise ArgumentError, "Axis should be 0 or 1." end end - def with_columns(exprs) - if !exprs.nil? 
&& !exprs.is_a?(Array) - exprs = [exprs] - end - lazy - .with_columns(exprs) - .collect(no_optimization: true, string_cache: false) + def std(ddof: 1) + _from_rbdf(_df.std(ddof)) end + def var(ddof: 1) + _from_rbdf(_df.var(ddof)) + end + + def median + _from_rbdf(_df.median) + end + + # def product + # end + + # def quantile(quantile, interpolation: "nearest") + # end + + # def to_dummies + # end + + # def unique + # end + + # def n_unique + # end + def rechunk _from_rbdf(_df.rechunk) end def null_count _from_rbdf(_df.null_count) end + # def sample + # end + + # def fold + # end + + # def row + # end + + # def rows + # end + + # def shrink_to_fit + # end + + # def take_every + # end + + # def hash_rows + # end + + # def interpolate + # end + + def is_empty + height == 0 + end + alias_method :empty?, :is_empty + + # def to_struct(name) + # end + + # def unnest + # end + private + + def initialize_copy(other) + super + self._df = _df._clone + end def hash_to_rbdf(data) RbDataFrame.read_hash(data) end