# frozen_string_literal: true # Available functions in Arrow are shown by `Arrow::Function.all.map(&:name)` # reference: https://arrow.apache.org/docs/cpp/compute.html module RedAmber # Representing a series of data. class Vector class << self private # @!macro [attach] define_unary_aggregation # [Unary aggregation function] Returns a scalar. # def define_unary_aggregation(function) define_method(function) do |**options| datum = exec_func_unary(function, options) get_scalar(datum) end end end # Not implemented in red-arrow yet: # Arrow::Indexoptions, Arrow::ModeOptions, Arrow::TDigestOptions # @!macro scalar_aggregate_options # @param skip_nulls [true, false] # If true, nil values are ignored. # Otherwise, if any value is nil, emit nil. # @param min_count [Integer] # if less than this many non-nil values are observed, emit nil. # If skip_nulls is false, this option is not respected. # @!macro count_options # @param mode [:only_valid, :only_null, :all] # control count aggregate kernel behavior. # - only_valid: count only non-nil values. # - only_null: count only nil. # - all: count both. # @!macro variance_options # @param ddof [0, 1] # Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel. # The divisor used in calculations is N - ddof, where N is the number # of elements. By default, ddof is zero, and population variance or stddev # is returned. # @macro scalar_aggregate_options # Test whether all elements in self are evaluated to true. # # @!method all(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [true, false] # `all` result of self. # @example Default. # Vector.new(true, true, nil).all # => true # # @example Skip nils. # Vector.new(true, true, nil).all(skip_nulls: false) # => false # define_unary_aggregation :all alias_method :all?, :all # Test whether any elements in self are evaluated to true. # # @!method any(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [true, false] # `any` result of self. # @example Default. # Vector.new(true, false, nil).any # => true # define_unary_aggregation :any alias_method :any?, :any # Approximate median of a numeric Vector with T-Digest algorithm. # # @!method approximate_median(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [Float] # median of self. # A nil is returned if there is no valid data point. # define_unary_aggregation :approximate_median alias_method :median, :approximate_median # Count the number of nil / non-nil values. # # @!method count(mode: :non_null) # @macro count_options # @return [Integer] count of self. # @example Count only non-nil (default) # Vector.new(1.0, -2.0, Float::NAN, nil).count # => 3 # # @example Count nil only. # Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :only_null) # => 1 # # @example Count both non-nil and nil. # Vector.new(1.0, -2.0, Float::NAN, nil).count(mode: :all) # => 4 # define_unary_aggregation :count # Count the number of unique values. # # @!method count_distinct(mode: :only_valid) # @macro count_options # @return [Integer] # unique count of self. # @example # vector = Vector.new(1, 1.0, nil, nil, Float::NAN, Float::NAN) # vector # # # => # # # [1.0, 1.0, nil, nil, NaN, NaN] # # # Float::NANs are counted as 1. # vector.count_uniq # => 2 # # # nils are counted as 1. # vector.count_uniq(mode: :only_null) # => 1 # # vector.count_uniq(mode: :all) # => 3 # define_unary_aggregation :count_distinct alias_method :count_uniq, :count_distinct # Compute maximum value of self. # # @!method max(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [Numeric] # maximum value of self. # define_unary_aggregation :max # Compute mean value of self. # # @!method mean(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [Numeric] # mean of self. # define_unary_aggregation :mean # Compute minimum value of self. # # @!method min(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [Numeric] # minimum of self. # define_unary_aggregation :min # Compute the min and max value of self. # # @!method min_max(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [Array] # min and max of self in an Array. # define_unary_aggregation :min_max # Compute the 1 most common values and their respective # occurence counts. # # @note Self must be a numeric or a boolean Vector. # @note ModeOptions are not supported in 0.5.0 . # Only one mode value is returned. # @api private # @return [Hash{'mode'=>mode, 'count'=>count}] # mode and count of self in an array. # @since 0.5.0 # def mode datum = find(:mode).execute([data]) datum.value.to_a.first end # Compute product value of self. # # @note Self must be a numeric Vector. # @!method product(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [Numeric] # product of self. # define_unary_aggregation :product # Calculate standard deviation of self. # # @note Self must be a numeric Vector. # @!method stddev(ddof: 0, skip_nulls: true, min_count: 1) # @macro variance_options # @return [Float] # standard deviation of self. Biased (ddof=0) by default. # define_unary_aggregation :stddev # Calculate unbiased standard deviation of self. # # @note Self must be a numeric Vector. # @!method sd(ddof: 1, skip_nulls: true, min_count: 1) # @macro variance_options # @return [Float] # standard deviation of self. Unviased (ddof=1)by default. # def sd stddev(ddof: 1) end alias_method :std, :sd # Compute sum of self. # # @note Self must be a numeric Vector. # @!method sum(skip_nulls: true, min_count: 1) # @macro scalar_aggregate_options # @return [Numeric] # sum of self. # define_unary_aggregation :sum # Calculate variance of self. # # @note Self must be a numeric Vector. # @!method variance(ddof: 0, skip_nulls: true, min_count: 1) # @macro variance_options # # @return [Float] # unviased (ddof=1) standard deviation of self by default. # # @return [Float] # variance of self. Biased (ddof=0) by default. # define_unary_aggregation :variance # Calculate unbiased variance of self. # # @note self must be a numeric Vector. # @!method unbiased_variance(ddof: 1, skip_nulls: true, min_count: 1) # @macro variance_options # @return [Float] # variance of self. Unviased (ddof=1) by default. # def unbiased_variance variance(ddof: 1) end alias_method :var, :unbiased_variance # @!macro quantile_interpolation # @param interpolation [Symbol] # specifies interpolation method to use, # when the quantile lies between the data i and j. # - Default value is :linear, which returns i + (j - i) * fraction. # - lower: returns i. # - higher: returns j. # - nearest: returns i or j, whichever is closer. # - midpoint: returns (i + j) / 2. # Get a non-nil element in self. # # @return [Object, nil] # first non-nil value detected. If all elements are nil, return nil. # @since 0.5.0 # def one each.find { !_1.nil? } end # Returns a quantile value. # - 0.5 quantile (median) is returned by default. # - Or return quantile for specified probability (prob). # - If quantile lies between two data points, interpolated value is # returned based on selected interpolation method. # - Nils and NaNs are ignored. # - Nil is returned if there are no valid data point. # # @param prob [Float] # probability. # @macro quantile_interpolation # @macro scalar_aggregate_options # @return [Float] # quantile of self. # @example # penguins[:bill_depth_mm].quantile # # # => # 17.3 # defaultis prob = 0.5 # def quantile(prob = 0.5, interpolation: :linear, skip_nulls: true, min_count: 0) unless (0..1).cover? prob raise VectorArgumentError, "Invalid: probability #{prob} must be between 0 and 1" end datum = find(:quantile).execute([data], q: prob, interpolation: interpolation, skip_nulls: skip_nulls, min_count: min_count) datum.value.to_a.first end # Return quantiles in a DataFrame # # @param probs [Array] # Array of probabilities. Default probabilities are 0.0, 0.25, 0.5 0.75, 1.0 . # @macro quantile_interpolation # @macro scalar_aggregate_options # @return [DataFrame] # quantiles of self. # @example # penguins[:bill_depth_mm].quantiles([0.05, 0.95]) # # # => # # # probs quantiles # # 0 0.05 13.9 # 1 0.95 20.0 # def quantiles(probs = [0.0, 0.25, 0.5, 0.75, 1.0], interpolation: :linear, skip_nulls: true, min_count: 0) if probs.empty? || !probs.all? { |q| (0..1).cover?(q) } raise VectorArgumentError, "Invarid probavilities #{probs}" end DataFrame.new( probs: probs, quantiles: probs.map do |q| quantile(q, interpolation: interpolation, skip_nulls: skip_nulls, min_count: min_count) end ) end end end