# frozen_string_literal: true module RedAmber # mix-in for the class DataFrame module DataFrameSelectable # select columns: [symbol] or [string] # select rows: [array of index], [range] def [](*args) args.flatten! raise DataFrameArgumentError, 'Empty dataframe' if empty? return remove_all_values if args.empty? || args[0].nil? vector = parse_to_vector(args) if vector.boolean? return filter_by_vector(vector.data) if vector.size == size raise DataFrameArgumentError, "Size is not match in booleans: #{args}" end return take_by_array(vector) if vector.numeric? return select_vars_by_keys(vector.to_a.map(&:to_sym)) if vector.string? || vector.type == :dictionary raise DataFrameArgumentError, "Invalid argument: #{args}" end # slice and select rows to create sub DataFrame def slice(*args, &block) slicer = args if block raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty? slicer = [instance_eval(&block)] end slicer.flatten! raise DataFrameArgumentError, 'Self is an empty dataframe' if empty? return remove_all_values if slicer.empty? || slicer[0].nil? vector = parse_to_vector(slicer) if vector.boolean? return filter_by_vector(vector.data) if vector.size == size raise DataFrameArgumentError, "Size is not match in booleans: #{slicer}" end return take_by_array(vector) if vector.numeric? raise DataFrameArgumentError, "Invalid argument #{slicer}" end def slice_by(key, keep_key: false, &block) raise DataFrameArgumentError, 'Self is an empty dataframe' if empty? raise DataFrameArgumentError, 'No block given' unless block raise DataFrameArgumentError, "#{key} is no a key of self" unless key?(key) return self if key.nil? slicer = instance_eval(&block) return DataFrame.new unless slicer if slicer.is_a?(Range) from = slicer.begin from = if from.is_a?(String) self[key].index(from) elsif from.nil? 0 elsif from < 0 size + from else from end to = slicer.end to = if to.is_a?(String) self[key].index(to) elsif to.nil? size - 1 elsif to < 0 size + to else to end slicer = (from..to).to_a else slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x } end if keep_key take(slicer) else take(slicer).drop(key) end end # remove selected rows to create remainer DataFrame def remove(*args, &block) remover = args if block raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty? remover = [instance_eval(&block)] end remover.flatten! raise DataFrameArgumentError, 'Empty dataframe' if empty? return self if remover.empty? || remover[0].nil? vector = parse_to_vector(remover) if vector.boolean? return filter_by_vector(vector.primitive_invert.data) if vector.size == size raise DataFrameArgumentError, "Size is not match in booleans: #{remover}" end if vector.numeric? raise DataFrameArgumentError, "Index out of range: #{vector.min}" if vector.min <= -size - 1 normalized_indices = (vector < 0).if_else(vector + size, vector) # normalize index from tail if normalized_indices.max >= size raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" end normalized_indices = normalized_indices.floor.to_a.map(&:to_i) # round to integer array return remove_all_values if normalized_indices == indices return self if normalized_indices.empty? index_array = indices - normalized_indices datum = Arrow::Function.find(:take).execute([table, index_array]) return DataFrame.new(datum.value) end raise DataFrameArgumentError, "Invalid argument #{remover}" end def remove_nil func = Arrow::Function.find(:drop_null) DataFrame.new(func.execute([table]).value) end alias_method :drop_nil, :remove_nil # Select a variable by a key in String or Symbol def v(key) unless key.is_a?(Symbol) || key.is_a?(String) raise DataFrameArgumentError, "Key is not a Symbol or String [#{key}]" end raise DataFrameArgumentError, "Key not exist [#{key}]" unless key?(key) variables[key.to_sym] end def head(n_obs = 5) raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative? self[0...[n_obs, size].min] end def tail(n_obs = 5) raise DataFrameArgumentError, "Index is out of range #{n_obs}" if n_obs.negative? self[-[n_obs, size].min..] end def first(n_obs = 1) head(n_obs) end def last(n_obs = 1) tail(n_obs) end # Undocumented # TODO: support for option {boundscheck: true} def take(*indices) indices.flatten! return remove_all_values if indices.empty? indices = indices[0] if indices.one? && !indices[0].is_a?(Numeric) indices = Vector.new(indices) unless indices.is_a?(Vector) take_by_array(indices) end # Undocumented # TODO: support for option {null_selection_behavior: :drop} def filter(*booleans) booleans.flatten! return remove_all_values if booleans.empty? b = booleans[0] case b when Vector raise DataFrameArgumentError, 'Argument is not a boolean.' unless b.boolean? filter_by_vector(b.data) when Arrow::BooleanArray filter_by_vector(b) else raise DataFrameArgumentError, 'Argument is not a boolean.' unless booleans?(booleans) filter_by_vector(Arrow::BooleanArray.new(booleans)) end end private def select_vars_by_keys(keys) if keys.one? key = keys[0].to_sym raise DataFrameArgumentError, "Key does not exist #{keys}" unless key? key variables[key] else DataFrame.new(@table[keys]) end end # Accepts indices by numeric Vector def take_by_array(indices) raise DataFrameArgumentError, "Indices must be a numeric Vector: #{indices}" unless indices.numeric? raise DataFrameArgumentError, "Index out of range: #{indices.min}" if indices.min <= -size - 1 normalized_indices = (indices < 0).if_else(indices + size, indices) # normalize index from tail raise DataFrameArgumentError, "Index out of range: #{normalized_indices.max}" if normalized_indices.max >= size index_array = Arrow::UInt64ArrayBuilder.build(normalized_indices.data) # round to integer array datum = Arrow::Function.find(:take).execute([table, index_array]) DataFrame.new(datum.value) end # Accepts booleans by Arrow::BooleanArray def filter_by_vector(boolean_array) raise DataFrameArgumentError, 'Booleans must be same size as self.' unless boolean_array.length == size datum = Arrow::Function.find(:filter).execute([table, boolean_array]) DataFrame.new(datum.value) end # return a DataFrame with same keys as self without values def remove_all_values filter_by_vector(Arrow::BooleanArray.new([false] * size)) end end end