lib/polars/series.rb in polars-df-0.1.3 vs lib/polars/series.rb in polars-df-0.1.4

- old
+ new

@@ -1,13 +1,10 @@ module Polars # A Series represents a single column in a polars DataFrame. class Series include ExprDispatch - # @private - attr_accessor :_s - # Create a new Series. # # @param name [String, Array, nil] # Name of the series. Will be used as a column name when used in a DataFrame. # When not specified, name is set to an empty string. @@ -728,12 +725,47 @@ # # => 0.8568409950394724 def entropy(base: Math::E, normalize: false) Polars.select(Polars.lit(self).entropy(base: base, normalize: normalize)).to_series[0] end - # def cumulative_eval - # end + # Run an expression over a sliding window that increases `1` slot every iteration. + # + # @param expr [Expr] + # Expression to evaluate + # @param min_periods [Integer] + # Number of valid values there should be in the window before the expression + # is evaluated. valid values = `length - null_count` + # @param parallel [Boolean] + # Run in parallel. Don't do this in a groupby or another operation that + # already has much parallelization. + # + # @return [Series] + # + # @note + # This functionality is experimental and may change without it being considered a + # breaking change. + # + # @note + # This can be really slow as it can have `O(n^2)` complexity. Don't use this + # for operations that visit all elements. + # + # @example + # s = Polars::Series.new("values", [1, 2, 3, 4, 5]) + # s.cumulative_eval(Polars.element.first - Polars.element.last ** 2) + # # => + # # shape: (5,) + # # Series: 'values' [f64] + # # [ + # # 0.0 + # # -3.0 + # # -8.0 + # # -15.0 + # # -24.0 + # # ] + def cumulative_eval(expr, min_periods: 1, parallel: false) + super + end # Return a copy of the Series with a new alias/name. # # @param name [String] # New name. @@ -1434,12 +1466,61 @@ # # ] def is_not_nan super end - # def is_in - # end + # Check if elements of this Series are in the other Series. + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [1, 2, 3]) + # s2 = Polars::Series.new("b", [2, 4]) + # s2.is_in(s) + # # => + # # shape: (2,) + # # Series: 'b' [bool] + # # [ + # # true + # # false + # # ] + # + # @example + # sets = Polars::Series.new("sets", [[1, 2, 3], [1, 2], [9, 10]]) + # # => + # # shape: (3,) + # # Series: 'sets' [list] + # # [ + # # [1, 2, 3] + # # [1, 2] + # # [9, 10] + # # ] + # + # @example + # optional_members = Polars::Series.new("optional_members", [1, 2, 3]) + # # => + # # shape: (3,) + # # Series: 'optional_members' [i64] + # # [ + # # 1 + # # 2 + # # 3 + # # ] + # + # @example + # optional_members.is_in(sets) + # # => + # # shape: (3,) + # # Series: 'optional_members' [bool] + # # [ + # # true + # # true + # # false + # # ] + def is_in(other) + super + end # Get index values where Boolean Series evaluate `true`. # # @return [Series] # @@ -1907,12 +1988,32 @@ # # ] def round(decimals = 0) super end - # def dot - # end + # Compute the dot/inner product between two Series. + # + # @param other [Object] + # Series (or array) to compute dot product with. + # + # @return [Numeric] + # + # @example + # s = Polars::Series.new("a", [1, 2, 3]) + # s2 = Polars::Series.new("b", [4.0, 5.0, 6.0]) + # s.dot(s2) + # # => 32.0 + def dot(other) + if !other.is_a?(Series) + other = Series.new(other) + end + if len != other.len + n, m = len, other.len + raise ArgumentError, "Series length mismatch: expected #{n}, found #{m}" + end + _s.dot(other._s) + end # Compute the most occurring value(s). # # Can return multiple Values. # @@ -2275,43 +2376,514 @@ # # ] def zip_with(mask, other) Utils.wrap_s(_s.zip_with(mask._s, other._s)) end - # def rolling_min - # end + # Apply a rolling min (moving min) over the values in this array. + # + # A window of length `window_size` will traverse the array. The values that fill + # this window will (optionally) be multiplied with the weights given by the + # `weight` vector. The resulting values will be aggregated to their sum. + # + # @param window_size [Integer] + # The length of the window. + # @param weights [Array] + # An optional slice with the same length as the window that will be multiplied + # elementwise with the values in the window. + # @param min_periods [Integer] + # The number of values in the window that should be non-null before computing + # a result. If None, it will be set equal to window size. + # @param center [Boolean] + # Set the labels at the center of the window + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [100, 200, 300, 400, 500]) + # s.rolling_min(3) + # # => + # # shape: (5,) + # # Series: 'a' [i64] + # # [ + # # null + # # null + # # 100 + # # 200 + # # 300 + # # ] + def rolling_min( + window_size, + weights: nil, + min_periods: nil, + center: false + ) + to_frame + .select( + Polars.col(name).rolling_min( + window_size, + weights: weights, + min_periods: min_periods, + center: center + ) + ) + .to_series + end - # def rolling_max - # end + # Apply a rolling max (moving max) over the values in this array. + # + # A window of length `window_size` will traverse the array. The values that fill + # this window will (optionally) be multiplied with the weights given by the + # `weight` vector. The resulting values will be aggregated to their sum. + # + # @param window_size [Integer] + # The length of the window. + # @param weights [Array] + # An optional slice with the same length as the window that will be multiplied + # elementwise with the values in the window. + # @param min_periods [Integer] + # The number of values in the window that should be non-null before computing + # a result. If None, it will be set equal to window size. + # @param center [Boolean] + # Set the labels at the center of the window + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [100, 200, 300, 400, 500]) + # s.rolling_max(2) + # # => + # # shape: (5,) + # # Series: 'a' [i64] + # # [ + # # null + # # 200 + # # 300 + # # 400 + # # 500 + # # ] + def rolling_max( + window_size, + weights: nil, + min_periods: nil, + center: false + ) + to_frame + .select( + Polars.col(name).rolling_max( + window_size, + weights: weights, + min_periods: min_periods, + center: center + ) + ) + .to_series + end - # def rolling_mean - # end + # Apply a rolling mean (moving mean) over the values in this array. + # + # A window of length `window_size` will traverse the array. The values that fill + # this window will (optionally) be multiplied with the weights given by the + # `weight` vector. The resulting values will be aggregated to their sum. + # + # @param window_size [Integer] + # The length of the window. + # @param weights [Array] + # An optional slice with the same length as the window that will be multiplied + # elementwise with the values in the window. + # @param min_periods [Integer] + # The number of values in the window that should be non-null before computing + # a result. If None, it will be set equal to window size. + # @param center [Boolean] + # Set the labels at the center of the window + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [100, 200, 300, 400, 500]) + # s.rolling_mean(2) + # # => + # # shape: (5,) + # # Series: 'a' [f64] + # # [ + # # null + # # 150.0 + # # 250.0 + # # 350.0 + # # 450.0 + # # ] + def rolling_mean( + window_size, + weights: nil, + min_periods: nil, + center: false + ) + to_frame + .select( + Polars.col(name).rolling_mean( + window_size, + weights: weights, + min_periods: min_periods, + center: center + ) + ) + .to_series + end - # def rolling_sum - # end + # Apply a rolling sum (moving sum) over the values in this array. + # + # A window of length `window_size` will traverse the array. The values that fill + # this window will (optionally) be multiplied with the weights given by the + # `weight` vector. The resulting values will be aggregated to their sum. + # + # @param window_size [Integer] + # The length of the window. + # @param weights [Array] + # An optional slice with the same length as the window that will be multiplied + # elementwise with the values in the window. + # @param min_periods [Integer] + # The number of values in the window that should be non-null before computing + # a result. If None, it will be set equal to window size. + # @param center [Boolean] + # Set the labels at the center of the window + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [1, 2, 3, 4, 5]) + # s.rolling_sum(2) + # # => + # # shape: (5,) + # # Series: 'a' [i64] + # # [ + # # null + # # 3 + # # 5 + # # 7 + # # 9 + # # ] + def rolling_sum( + window_size, + weights: nil, + min_periods: nil, + center: false + ) + to_frame + .select( + Polars.col(name).rolling_sum( + window_size, + weights: weights, + min_periods: min_periods, + center: center + ) + ) + .to_series + end - # def rolling_std - # end + # Compute a rolling std dev. + # + # A window of length `window_size` will traverse the array. The values that fill + # this window will (optionally) be multiplied with the weights given by the + # `weight` vector. The resulting values will be aggregated to their sum. + # + # @param window_size [Integer] + # The length of the window. + # @param weights [Array] + # An optional slice with the same length as the window that will be multiplied + # elementwise with the values in the window. + # @param min_periods [Integer] + # The number of values in the window that should be non-null before computing + # a result. If None, it will be set equal to window size. + # @param center [Boolean] + # Set the labels at the center of the window + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + # s.rolling_std(3) + # # => + # # shape: (6,) + # # Series: 'a' [f64] + # # [ + # # null + # # null + # # 1.0 + # # 1.0 + # # 1.527525 + # # 2.0 + # # ] + def rolling_std( + window_size, + weights: nil, + min_periods: nil, + center: false + ) + to_frame + .select( + Polars.col(name).rolling_std( + window_size, + weights: weights, + min_periods: min_periods, + center: center + ) + ) + .to_series + end - # def rolling_var - # end + # Compute a rolling variance. + # + # A window of length `window_size` will traverse the array. The values that fill + # this window will (optionally) be multiplied with the weights given by the + # `weight` vector. The resulting values will be aggregated to their sum. + # + # @param window_size [Integer] + # The length of the window. + # @param weights [Array] + # An optional slice with the same length as the window that will be multiplied + # elementwise with the values in the window. + # @param min_periods [Integer] + # The number of values in the window that should be non-null before computing + # a result. If None, it will be set equal to window size. + # @param center [Boolean] + # Set the labels at the center of the window + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + # s.rolling_var(3) + # # => + # # shape: (6,) + # # Series: 'a' [f64] + # # [ + # # null + # # null + # # 1.0 + # # 1.0 + # # 2.333333 + # # 4.0 + # # ] + def rolling_var( + window_size, + weights: nil, + min_periods: nil, + center: false + ) + to_frame + .select( + Polars.col(name).rolling_var( + window_size, + weights: weights, + min_periods: min_periods, + center: center + ) + ) + .to_series + end # def rolling_apply # end - # def rolling_median - # end + # Compute a rolling median. + # + # @param window_size [Integer] + # The length of the window. + # @param weights [Array] + # An optional slice with the same length as the window that will be multiplied + # elementwise with the values in the window. + # @param min_periods [Integer] + # The number of values in the window that should be non-null before computing + # a result. If None, it will be set equal to window size. + # @param center [Boolean] + # Set the labels at the center of the window + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + # s.rolling_median(3) + # # => + # # shape: (6,) + # # Series: 'a' [f64] + # # [ + # # null + # # null + # # 2.0 + # # 3.0 + # # 4.0 + # # 6.0 + # # ] + def rolling_median( + window_size, + weights: nil, + min_periods: nil, + center: false + ) + if min_periods.nil? + min_periods = window_size + end - # def rolling_quantile - # end + to_frame + .select( + Polars.col(name).rolling_median( + window_size, + weights: weights, + min_periods: min_periods, + center: center + ) + ) + .to_series + end - # def rolling_skew - # end + # Compute a rolling quantile. + # + # @param quantile [Float] + # Quantile between 0.0 and 1.0. + # @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"] + # Interpolation method. + # @param window_size [Integer] + # The length of the window. + # @param weights [Array] + # An optional slice with the same length as the window that will be multiplied + # elementwise with the values in the window. + # @param min_periods [Integer] + # The number of values in the window that should be non-null before computing + # a result. If None, it will be set equal to window size. + # @param center [Boolean] + # Set the labels at the center of the window + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + # s.rolling_quantile(0.33, window_size: 3) + # # => + # # shape: (6,) + # # Series: 'a' [f64] + # # [ + # # null + # # null + # # 1.0 + # # 2.0 + # # 3.0 + # # 4.0 + # # ] + # + # @example + # s.rolling_quantile(0.33, interpolation: "linear", window_size: 3) + # # => + # # shape: (6,) + # # Series: 'a' [f64] + # # [ + # # null + # # null + # # 1.66 + # # 2.66 + # # 3.66 + # # 5.32 + # # ] + def rolling_quantile( + quantile, + interpolation: "nearest", + window_size: 2, + weights: nil, + min_periods: nil, + center: false + ) + if min_periods.nil? + min_periods = window_size + end - # def sample - # end + to_frame + .select( + Polars.col(name).rolling_quantile( + quantile, + interpolation: interpolation, + window_size: window_size, + weights: weights, + min_periods: min_periods, + center: center + ) + ) + .to_series + end + # Compute a rolling skew. + # + # @param window_size [Integer] + # Integer size of the rolling window. + # @param bias [Boolean] + # If false, the calculations are corrected for statistical bias. + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + # s.rolling_skew(3) + # # => + # # shape: (6,) + # # Series: 'a' [f64] + # # [ + # # null + # # null + # # 0.0 + # # 0.0 + # # 0.381802 + # # 0.0 + # # ] + def rolling_skew(window_size, bias: true) + super + end + + # Sample from this Series. + # + # @param n [Integer] + # Number of items to return. Cannot be used with `frac`. Defaults to 1 if + # `frac` is None. + # @param frac [Float] + # Fraction of items to return. Cannot be used with `n`. + # @param with_replacement [Boolean] + # Allow values to be sampled more than once. + # @param shuffle [Boolean] + # Shuffle the order of sampled data points. + # @param seed [Integer] + # Seed for the random number generator. If set to None (default), a random + # seed is used. + # + # @return [Series] + # + # @example + # s = Polars::Series.new("a", [1, 2, 3, 4, 5]) + # s.sample(n: 2, seed: 0) + # # => + # # shape: (2,) + # # Series: 'a' [i64] + # # [ + # # 1 + # # 5 + # # ] + def sample( + n: nil, + frac: nil, + with_replacement: false, + shuffle: false, + seed: nil + ) + if !n.nil? && !frac.nil? + raise ArgumentError, "cannot specify both `n` and `frac`" + end + + if n.nil? && !frac.nil? + return Utils.wrap_s(_s.sample_frac(frac, with_replacement, shuffle, seed)) + end + + if n.nil? + n = 1 + end + Utils.wrap_s(_s.sample_n(n, with_replacement, shuffle, seed)) + end + # Get a boolean mask of the local maximum peaks. # # @return [Series] # # @example @@ -2423,12 +2995,65 @@ # @return [Series] def abs super end - # def rank - # end + # Assign ranks to data, dealing with ties appropriately. + # + # @param method ["average", "min", "max", "dense", "ordinal", "random"] + # The method used to assign ranks to tied elements. + # The following methods are available (default is 'average'): + # + # - 'average' : The average of the ranks that would have been assigned to + # all the tied values is assigned to each value. + # - 'min' : The minimum of the ranks that would have been assigned to all + # the tied values is assigned to each value. (This is also referred to + # as "competition" ranking.) + # - 'max' : The maximum of the ranks that would have been assigned to all + # the tied values is assigned to each value. + # - 'dense' : Like 'min', but the rank of the next highest element is + # assigned the rank immediately after those assigned to the tied + # elements. + # - 'ordinal' : All values are given a distinct rank, corresponding to + # the order that the values occur in the Series. + # - 'random' : Like 'ordinal', but the rank for ties is not dependent + # on the order that the values occur in the Series. + # @param reverse [Boolean] + # Reverse the operation. + # + # @return [Series] + # + # @example The 'average' method: + # s = Polars::Series.new("a", [3, 6, 1, 1, 6]) + # s.rank + # # => + # # shape: (5,) + # # Series: 'a' [f32] + # # [ + # # 3.0 + # # 4.5 + # # 1.5 + # # 1.5 + # # 4.5 + # # ] + # + # @example The 'ordinal' method: + # s = Polars::Series.new("a", [3, 6, 1, 1, 6]) + # s.rank(method: "ordinal") + # # => + # # shape: (5,) + # # Series: 'a' [u32] + # # [ + # # 3 + # # 4 + # # 1 + # # 2 + # # 5 + # # ] + def rank(method: "average", reverse: false) + super + end # Calculate the n-th discrete difference. # # @param n [Integer] # Number of slots to shift. @@ -2438,12 +3063,60 @@ # @return [Series] def diff(n: 1, null_behavior: "ignore") super end - # def pct_change - # end + # Computes percentage change between values. + # + # Percentage change (as fraction) between current element and most-recent + # non-null element at least `n` period(s) before the current element. + # + # Computes the change from the previous row by default. + # + # @param n [Integer] + # periods to shift for forming percent change. + # + # @return [Series] + # + # @example + # Polars::Series.new(0..9).pct_change + # # => + # # shape: (10,) + # # Series: '' [f64] + # # [ + # # null + # # inf + # # 1.0 + # # 0.5 + # # 0.333333 + # # 0.25 + # # 0.2 + # # 0.166667 + # # 0.142857 + # # 0.125 + # # ] + # + # @example + # Polars::Series.new([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(n: 2) + # # => + # # shape: (10,) + # # Series: '' [f64] + # # [ + # # null + # # null + # # 3.0 + # # 3.0 + # # 3.0 + # # 3.0 + # # 3.0 + # # 3.0 + # # 3.0 + # # 3.0 + # # ] + def pct_change(n: 1) + super + end # Compute the sample skewness of a data set. # # For normally distributed data, the skewness should be about zero. For # unimodal continuous distributions, a skewness value greater than zero means @@ -2569,18 +3242,53 @@ # # ] def shuffle(seed: nil) super end - # def ewm_mean - # end + # Exponentially-weighted moving average. + # + # @return [Series] + def ewm_mean( + com: nil, + span: nil, + half_life: nil, + alpha: nil, + adjust: true, + min_periods: 1 + ) + super + end - # def ewm_std - # end + # Exponentially-weighted moving standard deviation. + # + # @return [Series] + def ewm_std( + com: nil, + span: nil, + half_life: nil, + alpha: nil, + adjust: true, + bias: false, + min_periods: 1 + ) + super + end - # def ewm_var - # end + # Exponentially-weighted moving variance. + # + # @return [Series] + def ewm_var( + com: nil, + span: nil, + half_life: nil, + alpha: nil, + adjust: true, + bias: false, + min_periods: 1 + ) + super + end # Extend the Series with given number of values. # # @param value [Object] # The value to extend the Series with. This value may be `nil` to fill with @@ -2643,24 +3351,44 @@ # @return [Series] def shrink_dtype super end - # def arr - # end + # Create an object namespace of all list related methods. + # + # @return [ListNameSpace] + def arr + ListNameSpace.new(self) + end - # def cat - # end + # Create an object namespace of all categorical related methods. + # + # @return [CatNameSpace] + def cat + CatNameSpace.new(self) + end - # def dt - # end + # Create an object namespace of all datetime related methods. + # + # @return [DateTimeNameSpace] + def dt + DateTimeNameSpace.new(self) + end - # def str - # end + # Create an object namespace of all string related methods. + # + # @return [StringNameSpace] + def str + StringNameSpace.new(self) + end - # def struct - # end + # Create an object namespace of all struct related methods. + # + # @return [StructNameSpace] + def struct + StructNameSpace.new(self) + end private def initialize_copy(other) super @@ -2749,10 +3477,14 @@ # dtype = rb_type_to_dtype(dtype) # end if ruby_dtype == Date RbSeries.new_opt_date(name, values, strict) + elsif ruby_dtype == Time + RbSeries.new_opt_datetime(name, values, strict) + elsif ruby_dtype == DateTime + RbSeries.new_opt_datetime(name, values.map(&:to_time), strict) else raise Todo end elsif ruby_dtype == Array if nested_dtype.nil? @@ -2762,11 +3494,30 @@ if nested_dtype == Array raise Todo end - raise Todo + if value.is_a?(Array) + count = 0 + equal_to_inner = true + values.each do |lst| + lst.each do |vl| + equal_to_inner = vl.class == nested_dtype + if !equal_to_inner || count > 50 + break + end + count += 1 + end + end + if equal_to_inner + dtype = Utils.rb_type_to_dtype(nested_dtype) + # TODO rescue and fallback to new_object + return RbSeries.new_list(name, values, dtype) + end + end + + RbSeries.new_object(name, values, strict) else constructor = rb_type_to_constructor(value.class) constructor.call(name, values, strict) end end @@ -2802,11 +3553,10 @@ } def rb_type_to_constructor(dtype) RB_TYPE_TO_CONSTRUCTOR.fetch(dtype) rescue KeyError - # RbSeries.method(:new_object) - raise ArgumentError, "Cannot determine type" + RbSeries.method(:new_object) end def _get_first_non_none(values) values.find { |v| !v.nil? } end