require 'daru/maths/arithmetic/vector.rb' require 'daru/maths/statistics/vector.rb' require 'daru/plotting/gruff.rb' require 'daru/plotting/nyaplot.rb' require 'daru/accessors/array_wrapper.rb' require 'daru/accessors/nmatrix_wrapper.rb' require 'daru/accessors/gsl_wrapper.rb' require 'daru/category.rb' module Daru class Vector # rubocop:disable Metrics/ClassLength include Enumerable include Daru::Maths::Arithmetic::Vector include Daru::Maths::Statistics::Vector extend Gem::Deprecate class << self # Create a new vector by specifying the size and an optional value # and block to generate values. # # == Description # # The *new_with_size* class method lets you create a Daru::Vector # by specifying the size as the argument. The optional block, if # supplied, is run once for populating each element in the Vector. # # The result of each run of the block is the value that is ultimately # assigned to that position in the Vector. # # == Options # :value # All the rest like .new def new_with_size n, opts={}, &block value = opts.delete :value block ||= ->(_) { value } Daru::Vector.new Array.new(n, &block), opts end # Create a vector using (almost) any object # * Array: flattened # * Range: transformed using to_a # * Daru::Vector # * Numeric and string values # # == Description # # The `Vector.[]` class method creates a vector from almost any # object that has a `#to_a` method defined on it. It is similar # to R's `c` method. # # == Usage # # a = Daru::Vector[1,2,3,4,6..10] # #=> # # # # nil # # 0 1 # # 1 2 # # 2 3 # # 3 4 # # 4 6 # # 5 7 # # 6 8 # # 7 9 # # 8 10 def [](*indexes) values = indexes.map do |a| a.respond_to?(:to_a) ? 
a.to_a : a end.flatten Daru::Vector.new(values) end def _load(data) # :nodoc: h = Marshal.load(data) Daru::Vector.new(h[:data], index: h[:index], name: h[:name], dtype: h[:dtype], missing_values: h[:missing_values]) end def coerce(data, options={}) case data when Daru::Vector data when Array, Hash new(data, options) else raise ArgumentError, "Can't coerce #{data.class} to #{self}" end end end def size @data.size end def each(&block) return to_enum(:each) unless block_given? @data.each(&block) self end def each_index(&block) return to_enum(:each_index) unless block_given? @index.each(&block) self end def each_with_index &block return to_enum(:each_with_index) unless block_given? @data.to_a.zip(@index.to_a).each(&block) self end def map!(&block) return to_enum(:map!) unless block_given? @data.map!(&block) self end def apply_method(method, keys: nil, by_position: true) vect = keys ? get_sub_vector(keys, by_position: by_position) : self case method when Symbol then vect.send(method) when Proc then method.call(vect) else raise end end alias :apply_method_on_sub_vector :apply_method # The name of the Daru::Vector. String. attr_reader :name # The row index. Can be either Daru::Index or Daru::MultiIndex. attr_reader :index # The underlying dtype of the Vector. Can be either :array, :nmatrix or :gsl. attr_reader :dtype # If the dtype is :nmatrix, this attribute represents the data type of the # underlying NMatrix object. See NMatrix docs for more details on NMatrix # data types. attr_reader :nm_dtype # An Array or the positions in the vector that are being treated as 'missing'. attr_reader :missing_positions deprecate :missing_positions, :indexes, 2016, 10 # Store a hash of labels for values. Supplementary only. Recommend using index # for proper usage. attr_accessor :labels # Store vector data in an array attr_reader :data # TODO: Make private. attr_reader :nil_positions, :nan_positions # Create a Vector object. 
#
# == Arguments
#
# @param source[Array,Hash] - Supply elements in the form of an Array or a
#   Hash. If Array, a numeric index will be created if not supplied in the
#   options. Specifying more index elements than actual values in *source*
#   will insert *nil* into the surplus index elements. When a Hash is specified,
#   the keys of the Hash are taken as the index elements and the corresponding
#   values as the values that populate the vector.
#
# == Options
#
# * +:name+  - Name of the vector
#
# * +:index+ - Index of the vector
#
# * +:dtype+ - The underlying data type. Can be :array, :nmatrix or :gsl.
#   Default :array.
#
# * +:nm_dtype+ - For NMatrix, the data type of the numbers. See the NMatrix docs for
#   further information on supported data type.
#
# * +:missing_values+ - An Array of the values that are to be treated as 'missing'.
#   nil is the default missing value.
#
# == Usage
#
#   vecarr = Daru::Vector.new [1,2,3,4], index: [:a, :e, :i, :o]
#   vechsh = Daru::Vector.new({a: 1, e: 2, i: 3, o: 4})
def initialize source, opts={}
  if opts[:type] == :category
    # Initialize category type vector
    extend Daru::Category
    initialize_category source, opts
  else
    # Initialize non-category type vector
    initialize_vector source, opts
  end
end

# attr_reader for :plotting_library
def plotting_library
  init_plotting_library

  @plotting_library
end

# Selects the plotting backend for this vector.
#
# The library name is always recorded. The matching
# Daru::Plotting::Vector::<Name>Library module is mixed into this object
# only when Daru reports the library as available.
#
# @param lib [Symbol] :gruff or :nyaplot
# @raise [ArgumentError] for any other library name
def plotting_library= lib
  case lib
  when :gruff, :nyaplot
    @plotting_library = lib
    if Daru.send("has_#{lib}?".to_sym)
      extend Module.const_get(
        "Daru::Plotting::Vector::#{lib.to_s.capitalize}Library"
      )
    end
  else
    raise ArgumentError, "Plotting library #{lib} not supported. "\
      'Supported libraries are :nyaplot and :gruff'
  end
end

# this method is overwritten: see Daru::Vector#plotting_library=
#
# @raise [RuntimeError] if no plotting module could be mixed in
def plot(*args, **options, &b)
  init_plotting_library

  # When the backend is unavailable nothing was mixed in, so `plot` still
  # resolves to this very method and the call below would recurse forever.
  if method(:plot).owner.equal?(self.class.instance_method(:plot).owner)
    raise "Plotting library #{@plotting_library} is not available, cannot dispatch #plot"
  end

  plot(*args, **options, &b)
end

# Get one or more elements with specified index or a range.
#
# == Usage
#
#   # For vectors employing single layer Index
#
#   v[:one, :two] # => Daru::Vector with indexes :one and :two
#   v[:one]       # => Single element
#   v[:one..:three] # => Daru::Vector with indexes :one, :two and :three
#
#   # For vectors employing hierarchical multi index
#
def [](*input_indexes)
  # Get array of positions indexes
  positions = @index.pos(*input_indexes)

  # If one object is asked return it
  return @data[positions] if positions.is_a? Numeric

  # Form a new Vector using positional indexes
  Daru::Vector.new(
    positions.map { |loc| @data[loc] },
    name: @name,
    index: @index.subset(*input_indexes), dtype: @dtype
  )
end

# Returns vector of values given positional values
# @param positions [Array] positional values
# @return [object] vector
# @example
#   dv = Daru::Vector.new 'a'..'e'
#   dv.at 0, 1, 2
#   # => #<Daru::Vector(3)>
#   #   0   a
#   #   1   b
#   #   2   c
def at *positions
  # to be used to form index
  original_positions = positions
  positions = coerce_positions(*positions)
  validate_positions(*positions)

  if positions.is_a? Integer
    @data[positions]
  else
    values = positions.map { |pos| @data[pos] }
    Daru::Vector.new values, index: @index.at(*original_positions), dtype: dtype
  end
end

# Change value at given positions
# @param positions [Array, Integer] positional values; a single position
#   may be given without wrapping it in an Array
# @param [object] val value to assign
# @example
#   dv = Daru::Vector.new 'a'..'e'
#   dv.set_at [0, 1], 'x'
#   dv
#   # => #<Daru::Vector(5)>
#   #   0   x
#   #   1   x
#   #   2   c
#   #   3   d
#   #   4   e
def set_at positions, val
  validate_positions(*positions)
  # Array() lets a bare position through; `each` because only the
  # assignment side effect is wanted.
  Array(positions).each { |pos| @data[pos] = val }
  update_position_cache
end

# Just like in Hashes, you can specify the index label of the Daru::Vector
# and assign an element at that place in the Daru::Vector.
#
# == Usage
#
#   v = Daru::Vector.new([1,2,3], index: [:a, :b, :c])
#   v[:a] = 999
#   #=>
#   # #<Daru::Vector(3)>
#   #     nil
#   #   a 999
#   #   b   2
#   #   c   3
def []=(*indexes, val)
  # A nil is only stored after casting to :array when the dtype is anything else.
  cast(dtype: :array) if val.nil? && dtype != :array

  guard_type_check(val)

  modify_vector(indexes, val)

  update_position_cache
end

# Two vectors are equal if they have the exact same index values corresponding
# with the exact same elements. Name is ignored.
def == other
  case other
  when Daru::Vector
    @index == other.index && size == other.size &&
      @index.all? { |index| self[index] == other[index] }
  else
    super
  end
end

# !@method eq
#   Uses `==` and returns `true` for each **equal** entry
#   @param [#==, Daru::Vector] If scalar object, compares it with each
#     element in self. If Daru::Vector, compares elements with same indexes.
#   @example (see #where)

# !@method not_eq
#   Uses `!=` and returns `true` for each **unequal** entry
#   @param [#!=, Daru::Vector] If scalar object, compares it with each
#     element in self. If Daru::Vector, compares elements with same indexes.
#   @example (see #where)

# !@method lt
#   Uses `<` and returns `true` for each entry **less than** the supplied object
#   @param [#<, Daru::Vector] If scalar object, compares it with each
#     element in self. If Daru::Vector, compares elements with same indexes.
#   @example (see #where)

# !@method lteq
#   Uses `<=` and returns `true` for each entry **less than or equal to** the supplied object
#   @param [#<=, Daru::Vector] If scalar object, compares it with each
#     element in self. If Daru::Vector, compares elements with same indexes.
#   @example (see #where)

# !@method mt
#   Uses `>` and returns `true` for each entry **more than** the supplied object
#   @param [#>, Daru::Vector] If scalar object, compares it with each
#     element in self. If Daru::Vector, compares elements with same indexes.
#   @example (see #where)

# !@method mteq
#   Uses `>=` and returns `true` for each entry **more than or equal to** the supplied object
#   @param [#>=, Daru::Vector] If scalar object, compares it with each
#     element in self. If Daru::Vector, compares elements with same indexes.
#   @example (see #where)

# Define the comparator methods with metaprogramming.
# See documentation
# written above for functionality of each method. Use these methods with the
# `where` method to obtain the corresponding Vector/DataFrame.
{
  eq: :==,
  not_eq: :!=,
  lt: :<,
  lteq: :<=,
  mt: :>,
  mteq: :>=
}.each do |method, operator|
  define_method(method) do |other|
    mod = Daru::Core::Query
    if other.is_a?(Daru::Vector)
      mod.apply_vector_operator operator, self, other
    else
      mod.apply_scalar_operator operator, @data, other
    end
  end
  # :== and :!= keep their own definitions; the remaining operators
  # become aliases of the named comparators.
  alias_method operator, method if operator != :== && operator != :!=
end
alias :gt :mt
alias :gteq :mteq

# Comparator for checking if any of the elements in *other* exist in self.
#
# @param [Array, Daru::Vector] other A collection which has elements that
#   need to be checked for in self.
# @return [Daru::Core::Query::BoolArray] one boolean per element of self
# @example Usage of `in`.
#   vector = Daru::Vector.new([1,2,3,4,5])
#   vector.where(vector.in([3,5]))
#   #=>
#   # #<Daru::Vector(2)>
#   #     nil
#   #   2   3
#   #   4   5
def in other
  # Hash keys give constant-time membership checks; the 0 values are unused.
  lookup = Hash[other.zip(Array.new(other.size, 0))]
  Daru::Core::Query::BoolArray.new(
    @data.each_with_object([]) { |d, memo| memo << lookup.key?(d) }
  )
end

# Return a new vector based on the contents of a boolean array. Use with the
# comparator methods to obtain meaningful results. See this notebook for
# a good overview of using #where.
#
# @param bool_array [Daru::Core::Query::BoolArray, Array] The
#   collection containing the true or false values. Each element in the Vector
#   corresponding to a `true` in the bool_array will be returned along with its
#   index.
# @example Usage of #where.
#   vector = Daru::Vector.new([2,4,5,51,5,16,2,5,3,2,1,5,2,5,2,1,56,234,6,21])
#
#   # Simple logic statement passed to #where.
#   vector.where(vector.eq(5).or(vector.eq(1)))
#   # =>
#   # #<Daru::Vector(7)>
#   #      nil
#   #    2   5
#   #    4   5
#   #    7   5
#   #   10   1
#   #   11   5
#   #   13   5
#   #   15   1
#
#   # A somewhat more complex logic statement
#   vector.where((vector.eq(5) | vector.lteq(1)) & vector.in([4,5,1]))
#   #=>
#   # #<Daru::Vector(7)>
#   #      nil
#   #    2   5
#   #    4   5
#   #    7   5
#   #   10   1
#   #   11   5
#   #   13   5
#   #   15   1
def where(bool_array)
  Daru::Core::Query.vector_where(self, bool_array)
end

# Return a new vector based on the contents of a boolean array and &block.
#
# @param bool_array [Daru::Core::Query::BoolArray, Array, &block] The
#   collection containing the true or false values. Each element in the Vector
#   corresponding to a `true` in the bool_array will be returned along with its
#   index. The &block may contain manipulative functions for the Vector elements.
#
# @return [Daru::Vector]
#
# @example Usage of #apply_where.
#   dv = Daru::Vector.new ['3 days', '5 weeks', '2 weeks']
#   dv = dv.apply_where(dv.match /weeks/) { |x| "#{x.split.first.to_i * 7} days" }
#   # =>
#   # #<Daru::Vector(3)>
#   #   0   3 days
#   #   1   35 days
#   #   2   14 days
def apply_where(bool_array, &block)
  Daru::Core::Query.vector_apply_where(self, bool_array, &block)
end

# Selects the range 0..(q-1) of the vector (10 entries by default).
def head(q=10)
  upto = q - 1
  self[0..upto]
end

# Selects the trailing +q+ entries (10 by default); the start of the range
# is clamped at 0.
def tail(q=10)
  from = [size - q, 0].max
  upto = size - 1
  self[from..upto]
end

# Same as #tail, defaulting to a single entry.
def last(q=1)
  # The Enumerable mixin does not provide the last method.
  tail(q)
end

# True when the index holds no entries.
def empty?
  @index.empty?
end

# True when #type reports :numeric.
def numeric?
  :numeric == type
end

# True when #type reports :object.
def object?
  :object == type
end

# Reports whether missing data is present in the Vector.
def has_missing_data?
  missing = indexes(*Daru::MISSING_VALUES)
  !missing.empty?
end
alias :flawed? :has_missing_data?
deprecate :has_missing_data?, :include_values?, 2016, 10
deprecate :flawed?, :include_values?, 2016, 10

# Check if any one of mentioned values occur in the vector
# @param values [Array] values to check for
# @return [true, false] returns true if any one of specified values
#   occur in the vector
# @example
#   dv = Daru::Vector.new [1, 2, 3, 4, nil]
#   dv.include_values?
#     nil, Float::NAN
#   # => true
def include_values?(*values)
  values.any? { |v| include_with_nan? @data, v }
end

# @note Do not use it to check for Float::NAN as
#   Float::NAN == Float::NAN is false
# Return vector of booleans with value at ith position is either
# true or false depending upon whether value at position i is equal to
# any of the values passed in the argument or not
# @param values [Array] values to equate with
# @return [Daru::Vector] vector of boolean values
# @example
#   dv = Daru::Vector.new [1, 2, 3, 2, 1]
#   dv.is_values 1, 2
#   # => #<Daru::Vector(5)>
#   #     0  true
#   #     1  true
#   #     2 false
#   #     3  true
#   #     4  true
def is_values(*values)
  # Nothing to compare against: no element matches. Without this guard
  # inject would hand nil to Vector.new.
  return Daru::Vector.new(Array.new(size, false)) if values.empty?

  Daru::Vector.new(values.map { |v| eq(v) }.inject(:|))
end

# Append an element to the vector by specifying the element and index
#
# @raise [IndexError] if +index+ is already present
def concat element, index
  raise IndexError, 'Expected new unique index' if @index.include? index

  @index |= [index]
  @data[@index[index]] = element

  update_position_cache
end
alias :push :concat
alias :<< :concat

# Cast a vector to a new data type.
#
# == Options
#
# * +:dtype+ - :array for Ruby Array. :nmatrix for NMatrix. :gsl is accepted too.
#
# @raise [ArgumentError] for any other dtype
def cast opts={}
  dt = opts[:dtype]
  raise ArgumentError, "Unsupported dtype #{opts[:dtype]}" unless %i[array nmatrix gsl].include?(dt)

  @data = cast_vector_to dt unless @dtype == dt
end

# Delete an element by value
def delete element
  delete_at index_of(element)
end

# Delete element by index
def delete_at index
  @data.delete_at @index[index]
  @index = Daru::Index.new(@index.to_a - [index])

  update_position_cache
end

# The type of data contained in the vector. Can be :object or :numeric. If
# the underlying dtype is an NMatrix, this method will return the data type
# of the NMatrix object.
#
# Running through the data to figure out the kind of data is delayed to the
# last possible moment.
def type
  return @data.nm_dtype if dtype == :nmatrix

  if @type.nil? || @possibly_changed_type
    @type = :numeric
    each do |e|
      next if e.nil? || e.is_a?(Numeric)

      # The first element that is neither nil nor Numeric settles it.
      @type = :object
      break
    end
    @possibly_changed_type = false
  end

  @type
end

# Tells if vector is categorical or not.
# @return [true, false] true if vector is of type category, false otherwise
# @example
#   dv = Daru::Vector.new [1, 2, 3], type: :category
#   dv.category?
#   # => true
def category?
  type == :category
end

# Get index of element
def index_of element
  case dtype
  when :array then @index.key(@data.index { |x| x.eql? element })
  else @index.key @data.index(element)
  end
end

# Keep only unique elements of the vector along with their indexes.
def uniq
  uniq_vector = @data.uniq
  new_index   = uniq_vector.map { |element| index_of(element) }

  Daru::Vector.new uniq_vector, name: @name, index: new_index, dtype: @dtype
end

# Delegates to #any? of the object wrapped by the data store.
def any? &block
  @data.data.any?(&block)
end

# Delegates to #all? of the object wrapped by the data store.
def all? &block
  @data.data.all?(&block)
end

# Sorts a vector according to its values. If a block is specified, the contents
# will be evaluated and data will be swapped whenever the block evaluates
# to *true*. Defaults to ascending order sorting. Any missing values will be
# put at the end of the vector. Preserves indexing. Default sort algorithm is
# quick sort.
#
# == Options
#
# * +:ascending+ - if false, will sort in descending order. Defaults to true.
#
# * +:type+ - Specify the sorting algorithm. Only supports quick_sort for now.
# == Usage
#
#   v = Daru::Vector.new ["My first guitar", "jazz", "guitar"]
#   # Say you want to sort these strings by length.
#   v.sort(ascending: false) { |a,b| a.length <=> b.length }
def sort opts={}, &block
  opts = {ascending: true}.merge(opts)

  vector_index = resort_index(@data.each_with_index, opts, &block)
  vector, index = vector_index.transpose

  index = @index.reorder index

  Daru::Vector.new(vector, index: index, name: @name, dtype: @dtype)
end

# Sorts the vector according to its `Index` values. Defaults to ascending
# order sorting.
#
# @param [Hash] opts the options for sort_by_index method.
# @option opts [Boolean] :ascending false, will sort `index` in
#   descending order.
#
# @return [Vector] new sorted `Vector` according to the index values.
#
# @example
#
#   dv = Daru::Vector.new [11, 13, 12], index: [23, 21, 22]
#   # Say you want to sort index in ascending order
#   dv.sort_by_index(ascending: true)
#   #=> Daru::Vector.new [13, 12, 11], index: [21, 22, 23]
#   # Say you want to sort index in descending order
#   dv.sort_by_index(ascending: false)
#   #=> Daru::Vector.new [11, 12, 13], index: [23, 22, 21]
def sort_by_index opts={}
  opts = {ascending: true}.merge(opts)
  _, new_order = resort_index(@index.each_with_index, opts).transpose

  reorder new_order
end

# Compares two [value, position] pairs. Two nil values are ordered by
# position, a nil value sorts before a non-nil one, everything else is
# compared with <=>.
DEFAULT_SORTER = lambda { |(lv, li), (rv, ri)|
  case
  when lv.nil? && rv.nil?
    li <=> ri
  when lv.nil?
    -1
  when rv.nil?
    1
  else
    lv <=> rv
  end
}

# Just sort the data and get an Array in return using Enumerable#sort.
# Non-destructive.
# :nocov:
def sorted_data &block
  @data.to_a.sort(&block)
end
# :nocov:

# Like map, but returns a Daru::Vector with the returned values.
def recode dt=nil, &block
  return to_enum(:recode) unless block_given?

  dup.recode! dt, &block
end

# Destructive version of recode
def recode! dt=nil, &block
  return to_enum(:recode!) unless block_given?

  @data.map!(&block)
  @data = cast_vector_to(dt || @dtype)
  self
end

# Delete an element if block returns true. Destructive.
def delete_if
  return to_enum(:delete_if) unless block_given?

  keep_e, keep_i = each_with_index.reject { |n, _i| yield(n) }.transpose
  # When every element is rejected, transposing the empty list yields
  # nothing to destructure and both names would be nil.
  keep_e ||= []
  keep_i ||= []

  @data = cast_vector_to @dtype, keep_e
  @index = Daru::Index.new(keep_i)

  update_position_cache

  self
end

# Keep an element if block returns true. Destructive.
def keep_if
  return to_enum(:keep_if) unless block_given?

  delete_if { |val| !yield(val) }
end

# Reports all values that doesn't comply with a condition.
# Returns a hash with the index of data and the invalid data.
def verify
  # Same contract as the other block-taking methods of this class.
  return to_enum(:verify) unless block_given?

  (0...size)
    .map { |i| [i, @data[i]] }
    .reject { |_i, val| yield(val) }
    .to_h
end

# Return an Array with the data splitted by a separator.
#   a=Daru::Vector.new(["a,b","c,d","a,b","d"])
#   a.splitted
#     =>
#   [["a","b"],["c","d"],["a","b"],["d"]]
#
# nil stays nil; a value that does not respond to #split is wrapped in a
# one-element Array.
def splitted sep=','
  @data.map do |s|
    if s.nil?
      nil
    elsif s.respond_to? :split
      s.split sep
    else
      [s]
    end
  end
end

# Returns a hash of Vectors, defined by the different values
# defined on the fields
# Example:
#
#  a=Daru::Vector.new(["a,b","c,d","a,b"])
#  a.split_by_separator
#  =>  {"a"=>#<Daru::Vector ...>,
#       "b"=>#<Daru::Vector ...>,
#       "c"=>#<Daru::Vector ...>}
#
def split_by_separator sep=','
  split_data = splitted sep
  split_data
    .flatten.uniq.compact.map do |key|
    [
      key,
      Daru::Vector.new(split_data.map { |v| split_value(key, v) })
    ]
  end.to_h
end

# Maps every key of #split_by_separator to the sum of its vector's values,
# each converted with to_i.
def split_by_separator_freq(sep=',')
  split_by_separator(sep).map { |k, v|
    [k, v.map(&:to_i).inject(:+)]
  }.to_h
end

# Replaces the index with the positions 0...size. Returns self.
def reset_index!
  @index = Daru::Index.new(Array.new(size) { |i| i })
  self
end

# Replace all nils in the vector with the value passed as an argument. Destructive.
# See #replace_nils for non-destructive version
#
# == Arguments
#
# * +replacement+ - The value which should replace all nils
def replace_nils! replacement
  indexes(*Daru::MISSING_VALUES).each do |idx|
    self[idx] = replacement
  end

  self
end

# Rolling fillna
# replace all Float::NAN and NIL values with the preceding or following value
#
# Missing values met before any valid value has been seen are replaced
# with 0, the initial fill value.
#
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
#
# @example
#  dv = Daru::Vector.new([1, 2, 1, 4, nil, Float::NAN, 3, nil, Float::NAN])
#
#   2.3.3 :068 > dv.rolling_fillna(:forward)
#   => #<Daru::Vector(9)>
#   0   1
#   1   2
#   2   1
#   3   4
#   4   4
#   5   4
#   6   3
#   7   3
#   8   3
#
def rolling_fillna!(direction=:forward)
  enum = direction == :forward ? index : index.reverse_each
  last_valid_value = 0
  enum.each do |idx|
    if valid_value?(self[idx])
      last_valid_value = self[idx]
    else
      self[idx] = last_valid_value
    end
  end
  self
end

# Non-destructive version of rolling_fillna!
def rolling_fillna(direction=:forward)
  dup.rolling_fillna!(direction)
end

# Lags the series by `k` periods.
#
# Lags the series by `k` periods, "shifting" data and inserting `nil`s
# from beginning or end of a vector, while preserving original vector's
# size.
#
# `k` can be positive or negative integer. If `k` is positive, `nil`s
# are inserted at the beginning of the vector, otherwise they are
# inserted at the end.
#
# @param [Integer] k "shift" the series by `k` periods. `k` can be
#   positive or negative. (default = 1)
#
# @return [Daru::Vector] a new vector with "shifted" initial values
#   and `nil` values inserted. The return vector is the same length
#   as the original vector.
#
# @example Lag a vector with different periods `k`
#
#   ts = Daru::Vector.new(1..5)
#               # => [1, 2, 3, 4, 5]
#
#   ts.lag      # => [nil, 1, 2, 3, 4]
#   ts.lag(1)   # => [nil, 1, 2, 3, 4]
#   ts.lag(2)   # => [nil, nil, 1, 2, 3]
#   ts.lag(-1)  # => [2, 3, 4, 5, nil]
#
def lag k=1
  case k
  when 0 then dup
  when 1...size
    copy([nil] * k + data.to_a)
  when -size..-1
    copy(data.to_a[k.abs...size])
  else
    # Shifting by the full size or more leaves nothing but nils.
    copy([])
  end
end

# Builds a DataFrame with an +index+ and a +values+ vector from this vector.
def detach_index
  Daru::DataFrame.new(
    index: @index.to_a,
    values: @data.to_a
  )
end

# Non-destructive version of #replace_nils!
def replace_nils replacement
  dup.replace_nils!(replacement)
end

# number of non-missing elements
def n_valid
  size - indexes(*Daru::MISSING_VALUES).size
end
deprecate :n_valid, :count_values, 2016, 10

# Count the number of values specified
# @param values [Array] values to count for
# @return [Integer] the number of times the values mentioned occurs
# @example
#   dv = Daru::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
#   dv.count_values nil
#   # => 2
def count_values(*values)
  positions(*values).size
end

# Returns *true* if an index exists
def has_index? index
  @index.include? index
end

# @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
# @return [Daru::Vector]
def get_sub_vector(keys, by_position: true)
  return Daru::Vector.new([]) if keys == []

  keys = @index.pos(*keys) unless by_position

  sub_vect = at(*keys)
  # A single key makes #at return a bare element; wrap it so that the
  # result is always a vector.
  sub_vect = Daru::Vector.new([sub_vect]) unless sub_vect.is_a?(Daru::Vector)

  sub_vect
end

# @return [Daru::DataFrame] the vector as a single-vector dataframe
def to_df
  Daru::DataFrame.new({@name => @data}, name: @name, index: @index)
end

# Convert Vector to a horizontal or vertical Ruby Matrix.
#
# == Arguments
#
# * +axis+ - Specify whether you want a *:horizontal* or a *:vertical* matrix.
def to_matrix axis=:horizontal
  if axis == :horizontal
    Matrix[to_a]
  elsif axis == :vertical
    Matrix.columns([to_a])
  else
    raise ArgumentError, "axis should be either :horizontal or :vertical, not #{axis}"
  end
end

# Convert vector to nmatrix object
# @param [Symbol] axis :horizontal or :vertical
# @return [NMatrix] NMatrix object containing all values of the vector
# @raise [ArgumentError] if the vector is not numeric, contains nil, or
#   +axis+ is neither :horizontal nor :vertical
# @example
#   dv = Daru::Vector.new [1, 2, 3]
#   dv.to_nmatrix
#   # =>
#   # [
#   #   [1, 2, 3] ]
def to_nmatrix axis=:horizontal
  unless numeric? && !include?(nil)
    raise ArgumentError, 'Can not convert to nmatrix '\
      'because the vector is not numeric or contains nil'
  end

  case axis
  when :horizontal
    NMatrix.new [1, size], to_a
  when :vertical
    NMatrix.new [size, 1], to_a
  else
    raise ArgumentError, 'Invalid axis specified. '\
      'Valid axis are :horizontal and :vertical'
  end
end

# If dtype != gsl, will convert data to GSL::Vector with to_a. Otherwise returns
# the stored GSL::Vector object.
def to_gsl
  raise NoMethodError, 'Install gsl-nmatrix for access to this functionality.' unless Daru.has_gsl?

  if dtype == :gsl
    @data.data
  else
    GSL::Vector.alloc(reject_values(*Daru::MISSING_VALUES).to_a)
  end
end

# Convert to hash (explicit).
# Hash keys are indexes and values are the corresponding elements
def to_h
  pairs = @index.map { |index| [index, self[index]] }
  pairs.to_h
end

# Return an array
def to_a
  @data.to_a
end

# Convert the hash from to_h to json
def to_json(*)
  to_h.to_json
end

# Convert to html for iruby
def to_html(threshold=30)
  # table_thead and table_tbody stay local variables so that the template
  # can read them through the binding.
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)
  template = index.is_a?(MultiIndex) ? 'vector_mi.html.erb' : 'vector.html.erb'
  path = File.expand_path("../iruby/templates/#{template}", __FILE__)
  ERB.new(File.read(path).strip).result(binding)
end

# Renders the table head template (multi-index variant when needed).
def to_html_thead
  template = index.is_a?(MultiIndex) ? 'vector_mi_thead.html.erb' : 'vector_thead.html.erb'
  table_thead_path = File.expand_path("../iruby/templates/#{template}", __FILE__)
  ERB.new(File.read(table_thead_path).strip).result(binding)
end

# Renders the table body template (multi-index variant when needed).
# +threshold+ is visible to the template through the binding.
def to_html_tbody(threshold=30)
  template = index.is_a?(MultiIndex) ? 'vector_mi_tbody.html.erb' : 'vector_tbody.html.erb'
  table_tbody_path = File.expand_path("../iruby/templates/#{template}", __FILE__)
  ERB.new(File.read(table_tbody_path).strip).result(binding)
end

# Short description: class, optional name, size and a category marker.
def to_s
  label = @name ? ": #{@name}" : ''
  "#<#{self.class}#{label}(#{size})#{':category' if category?}>"
end

# Create a summary of the Vector
# @param indent_level [Integer] indent level
# @return [String] String containing the summary of the Vector
# @example
#   dv = Daru::Vector.new [1, 2, 3]
#   puts dv.summary
#
#   # =
#   #   n :3
#   #   non-missing:3
#   #   median: 2
#   #   mean: 2.0000
#   #   std.dev.: 1.0000
#   #   std.err.: 0.5774
#   #   skew: 0.0000
#   #   kurtosis: -2.3333
def summary(indent_level=0)
  non_missing = size - count_values(*Daru::MISSING_VALUES)
  heading = ' =' * indent_level
  summary = heading + "= #{name}\n n :#{size}\n non-missing:#{non_missing}"
  case type
  when :object then summary << object_summary
  when :numeric then summary << numeric_summary
  end
  # Every line after the first is pushed right by the indent.
  separator = "\n" + ' ' * indent_level
  summary.split("\n").join(separator)
end

# Displays summary for an object type Vector
# @return
#   [String] String containing object vector summary
def object_summary
  nval = count_values(*Daru::MISSING_VALUES)
  header = "\n factors: #{factors.to_a.join(',')}"
  header += "\n mode: #{mode.to_a.join(',')}"
  header += "\n Distribution\n"

  # NOTE(review): the share is computed against the count of missing
  # values (nval) - confirm this is the intended denominator.
  rows = frequencies.sort.each_with_index.map do |v, k|
    share = nval.zero? ? 1 : v.quo(nval)
    [k, v, format('%0.2f%%', share * 100)]
  end

  header + Formatters::Table.format(rows)
end

# Displays summary for an numeric type Vector
# @return [String] String containing numeric vector summary
def numeric_summary
  summary = "\n median: #{median}"
  summary += format("\n mean: %0.4f", mean)

  # Dispersion figures are only reported when sd is truthy.
  if sd
    summary << format("\n std.dev.: %0.4f", sd)
    summary << format("\n std.err.: %0.4f", se)
  end

  # Shape figures are only reported when no value is missing.
  if count_values(*Daru::MISSING_VALUES).zero?
    summary << format("\n skew: %0.4f", skew)
    summary << format("\n kurtosis: %0.4f", kurtosis)
  end

  summary
end

# Over rides original inspect for pretty printing in irb
def inspect(spacing=20, threshold=15)
  row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
  title = "#<#{self.class}(#{size})#{':category' if category?}>\n"
  body = Formatters::Table.format(
    to_a.lazy.map { |v| [v] },
    headers: @name && [@name],
    row_headers: row_headers,
    threshold: threshold,
    spacing: spacing
  )

  title + body
end

# Sets new index for vector. Preserves index->value correspondence.
# Sets nil for new index keys absent from original index.
# @note Unlike #reorder! which takes positions as input it takes
#   index as an input to reorder the vector
# @param [Daru::Index, Daru::MultiIndex] new_index new index to order with
# @return [Daru::Vector] vector reindexed with new index
def reindex!(new_index)
  values = []
  each_with_index do |val, i|
    next unless new_index.include?(i)

    values[new_index[i]] = val
  end
  # Pad with nil up to the size of the new index.
  values.fill(nil, values.size, new_index.size - values.size)

  @data = cast_vector_to(@dtype, values)
  @index = new_index

  update_position_cache

  self
end

# Reorder the vector with given positions
# @note Unlike #reindex!
#   which takes index as input, it takes
#   positions as an input to reorder the vector
# @param [Array] order the order to reorder the vector with
# @return reordered vector
# @example
#   dv = Daru::Vector.new [3, 2, 1], index: ['c', 'b', 'a']
#   dv.reorder! [2, 1, 0]
#   # => #<Daru::Vector(3)>
#   #   a   1
#   #   b   2
#   #   c   3
def reorder! order
  @index = @index.reorder order
  data_array = order.map { |i| @data[i] }
  @data = cast_vector_to @dtype, data_array, @nm_dtype
  update_position_cache
  self
end

# Non-destructive version of #reorder!
def reorder order
  dup.reorder! order
end

# Create a new vector with a different index, and preserve the indexing of
# current elements.
def reindex new_index
  dup.reindex!(new_index)
end

# Replaces the index after coercing +idx+ with Index.coerce.
#
# @raise [ArgumentError] if the sizes differ or the coerced value is not
#   a Daru::Index
def index= idx
  idx = Index.coerce idx

  if idx.size != size
    raise ArgumentError,
      "Size of supplied index #{idx.size} does not match size of Vector"
  end

  unless idx.is_a?(Daru::Index)
    raise ArgumentError, 'Can only assign type Index and its subclasses.'
  end

  @index = idx
  self
end

# Give the vector a new name
#
# @param new_name [Symbol] The new name.
def rename new_name
  @name = new_name
  self
end

# Written with the `alias` keyword like every other alias in this class.
alias :name= :rename

# Duplicate a vector
# @return [Daru::Vector] duplicated vector
def dup
  Daru::Vector.new @data.dup, name: @name, index: @index.dup
end

# == Bootstrap
# Generate +nr+ resamples (with replacement) of size +s+
# from vector, computing each estimate from +estimators+
# over each resample.
# +estimators+ could be
# a) Hash with variable names as keys and lambdas as values
#   a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
# b) Array with names of method to bootstrap
#   a.bootstrap([:mean, :sd],1000)
# c) A single method to bootstrap
#   a.bootstrap(:mean, 1000)
# If s is nil, is set to vector size by default.
#
# Returns a DataFrame where each vector is a vector
# of length +nr+ containing the computed resample estimates.
def bootstrap(estimators, nr, s=nil)
  sample_size = s || size
  h_est, es, bss = prepare_bootstrap(estimators)

  nr.times do
    resample = sample_with_replacement(sample_size)
    es.each { |estimator| bss[estimator].push(h_est[estimator].call(resample)) }
  end

  # Turn each list of collected estimates into a vector.
  es.each { |est| bss[est] = Daru::Vector.new(bss[est]) }

  Daru::DataFrame.new(bss)
end

# == Jacknife
# Returns a dataset with jacknife delete-+k+ +estimators+
# +estimators+ could be:
# a) Hash with variable names as keys and lambdas as values
#   a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)})
# b) Array with method names to jacknife
#   a.jacknife([:mean, :sd])
# c) A single method to jacknife
#   a.jacknife(:mean)
# +k+ represent the block size for block jacknife. By default
# is set to 1, for classic delete-one jacknife.
#
# Returns a dataset where each vector is an vector
# of length +cases+/+k+ containing the computed jacknife estimates.
#
# == Reference:
# * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
def jackknife(estimators, k=1) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  raise "n should be divisible by k:#{k}" unless (size % k).zero?

  nb = (size / k).to_i
  h_est, es, ps = prepare_bootstrap(estimators)

  # Estimate computed on the full vector, one per estimator.
  est_n = es.map { |v| [v, h_est[v].call(self)] }.to_h

  nb.times do |i|
    # Drop the i-th block of k elements and estimate on what remains.
    remaining = @data.dup
    remaining.slice!(i * k, k)
    other = Daru::Vector.new(remaining)

    es.each do |estimator|
      # Add pseudovalue
      full = nb * est_n[estimator]
      partial = (nb - 1) * h_est[estimator].call(other)
      ps[estimator].push(full - partial)
    end
  end

  es.each { |est| ps[est] = Daru::Vector.new(ps[est]) }
  Daru::DataFrame.new(ps)
end

# Returns an array of either none or integer values, indicating the
# +regexp+ matching with the given array.
#
# @param regexp [Regexp] A regular matching expression. For example, +/weeks/+.
#
# @return [Array] Containing +true+ or +false+ for each value, according to the match with the given +regexp+
#
# @example
#   dv = Daru::Vector.new(['3 days', '5 weeks', '2 weeks'])
#   dv.match(/weeks/)
#
#   # => [false, true, true]
def match(regexp)
  @data.map do |value|
    (value =~ regexp) ? true : false
  end
end

# Creates a new vector consisting only of non-nil data
#
# == Arguments
#
# @param as_a [Symbol] Passing :array will return only the elements
#   as an Array. Otherwise will return a Daru::Vector.
#
# @param _duplicate [Symbol] In case no missing data is found in the
#   vector, setting this to false will return the same vector.
#   Otherwise, a duplicate will be returned irrespective of
#   presence of missing data.
def only_valid(as_a=:vector, _duplicate=true)
  # FIXME: Now duplicate is just ignored.
  #   There are no spec that fail on this case, so I'll leave it
  #   this way for now - zverok, 2016-05-07

  new_index = @index.to_a - indexes(*Daru::MISSING_VALUES)
  new_vector = new_index.map { |idx| self[idx] }

  return new_vector unless as_a == :vector

  Daru::Vector.new(new_vector, index: new_index, name: @name, dtype: dtype)
end
deprecate :only_valid, :reject_values, 2016, 10

# Return a vector with specified values removed
# @param values [Array] values to reject from resultant vector
# @return [Daru::Vector] vector with specified values removed
# @example
#   dv = Daru::Vector.new [1, 2, nil, Float::NAN]
#   dv.reject_values nil, Float::NAN
#   # => #<Daru::Vector(2)>
#   #   0   1
#   #   1   2
def reject_values(*values)
  resultant_pos = size.times.to_a - positions(*values)
  dv = at(*resultant_pos)
  return dv if dv.is_a?(Daru::Vector)

  # Handle the case when number of positions is 1
  # and hence #at doesn't return a vector
  pos = resultant_pos.first
  at(pos..pos)
end

# Return indexes of values specified
# @param values [Array] values to find indexes for
# @return [Array] array of indexes of values specified
# @example
#   dv = Daru::Vector.new [1, 2, nil, Float::NAN], index: 11..14
#   dv.indexes nil, Float::NAN
#   # => [13, 14]
def indexes(*values)
  labels = index.to_a
  labels.values_at(*positions(*values))
end

# Replaces specified values with a new value
# @param [Array] old_values array of values to replace
# @param [object] new_value new value to replace with
# @note It performs the replace in place.
# @return [Daru::Vector] Same vector itself with values
#   replaced with new value
# @example
#   dv = Daru::Vector.new [1, 2, :a, :b]
#   dv.replace_values [:a, :b], nil
#   dv
#   # =>
#   # #<Daru::Vector(4)>
#   #     nil
#   #   0   1
#   #   1   2
#   #   2 nil
#   #   3 nil
def replace_values(old_values, new_value)
  # A single value is treated as a one-element list.
  old_values = [old_values] unless old_values.is_a? Array
  size.times do |pos|
    next unless include_with_nan? old_values, at(pos)

    set_at([pos], new_value)
  end
  self
end

# Returns a Vector containing only missing data (preserves indexes).
def only_missing(as_a=:vector)
  case as_a
  when :vector then self[*indexes(*Daru::MISSING_VALUES)]
  when :array  then self[*indexes(*Daru::MISSING_VALUES)].to_a
  end
end
deprecate :only_missing, nil, 2016, 10

# Returns a Vector with only numerical data. Missing data is included
# but non-Numeric objects are excluded. Preserves index.
def only_numerics
  numeric_pairs = each_with_index.select { |v, _i| v.is_a?(Numeric) || v.nil? }
  numeric_indexes = numeric_pairs.map(&:last)

  self[*numeric_indexes]
end

DATE_REGEXP = /^(\d{2}-\d{2}-\d{4}|\d{4}-\d{2}-\d{2})$/

# Returns the database type for the vector, according to its content
def db_type
  # first, detect any character not number
  if @data.any? { |v| v.to_s =~ DATE_REGEXP }
    'DATE'
  elsif @data.any? { |v| v.to_s =~ /[^0-9e.-]/ }
    'VARCHAR (255)'
  elsif @data.any? { |v| v.to_s =~ /\./ }
    'DOUBLE'
  else
    'INTEGER'
  end
end

# Copies the structure of the vector (i.e the index, size, etc.) and fills
# all values with nils.
def clone_structure
  # Same size, same name, and a duplicate of the index; every value is nil.
  Daru::Vector.new(([nil]*size), name: @name, index: @index.dup)
end

# Save the vector to a file
#
# == Arguments
#
# * filename - Path of file where the vector is to be saved
def save filename
  Daru::IO.save self, filename
end

# Serializes the vector's data (as a plain Array), dtype, name and index
# with Marshal.
def _dump(*) # :nodoc:
  Marshal.dump(
    data:  @data.to_a,
    dtype: @dtype,
    name:  @name,
    index: @index
  )
end

# :nocov:
# Returns the vector itself; any arguments are ignored.
def daru_vector(*)
  self
end
# :nocov:

alias :dv :daru_vector

# Converts a non category type vector to category type vector.
# @param [Hash] opts options to convert to category
# @option opts [true, false] :ordered Specify if vector is ordered or not.
#   If it is ordered, it can be sorted and min, max like functions would work
# @option opts [Array] :categories set categories in the specified order
# @return [Daru::Vector] vector with type category
def to_category opts={}
  dv = Daru::Vector.new to_a, type: :category, name: @name, index: @index
  dv.ordered = opts[:ordered] || false
  dv.categories = opts[:categories] if opts[:categories]
  dv
end

# Lets index labels be used as method names: `vector.foo = x` assigns to
# index :foo, and `vector.foo` reads index `foo` when such an index exists.
# Anything else is forwarded to the default implementation.
def method_missing(name, *args, &block)
  # FIXME: it is shamefully fragile. Should be either made stronger
  # (string/symbol dychotomy, informative errors) or removed totally. - zverok

  # Anchored on both ends so that only names *ending* in "=" are treated
  # as setters, in agreement with respond_to_missing? below.
  setter = /\A(.+)=\z/.match(name.to_s)

  if setter
    self[setter[1].to_sym] = args[0]
  elsif has_index?(name)
    self[name]
  else
    super
  end
end

# Mirrors method_missing: claims setter-style names and names that are
# present in the index.
def respond_to_missing?(name, include_private=false)
  name.to_s.end_with?('=') || has_index?(name) || super
end

# Partition a numeric variable into categories.
# @param [Array] partitions an array whose consecutive elements # provide intervals for categories # @param [Hash] opts options to cut the partition # @option opts [:left, :right] :close_at specifies whether the interval closes at # the right side of left side # @option opts [Array] :labels names of the categories # @return [Daru::Vector] numeric variable converted to categorical variable # @example # heights = Daru::Vector.new [30, 35, 32, 50, 42, 51] # height_cat = heights.cut [30, 40, 50, 60], labels=['low', 'medium', 'high'] # # => # # # 0 low # # 1 low # # 2 low # # 3 high # # 4 medium # # 5 high def cut partitions, opts={} close_at, labels = opts[:close_at] || :right, opts[:labels] partitions = partitions.to_a values = to_a.map { |val| cut_find_category partitions, val, close_at } cats = cut_categories(partitions, close_at) dv = Daru::Vector.new values, index: @index, type: :category, categories: cats # Rename categories if new labels provided if labels dv.rename_categories Hash[cats.zip(labels)] else dv end end def positions(*values) case values when [nil] nil_positions when [Float::NAN] nan_positions when [nil, Float::NAN], [Float::NAN, nil] nil_positions + nan_positions else size.times.select { |i| include_with_nan? values, @data[i] } end end def group_by(*args) to_df.group_by(*args) end private # Will lazily load the plotting library being used for this vector def init_plotting_library self.plotting_library = Daru.plotting_library end def copy(values) # Make sure values is right-justified to the size of the vector values.concat([nil] * (size-values.size)) if values.size < size Daru::Vector.new(values[0...size], index: @index, name: @name) end def nil_positions @nil_positions || @nil_positions = size.times.select { |i| @data[i].nil? } end def nan_positions @nan_positions || @nan_positions = size.times.select do |i| @data[i].respond_to?(:nan?) && @data[i].nan? 
end end # Helper method returning validity of arbitrary value def valid_value?(v) v.respond_to?(:nan?) && v.nan? || v.nil? ? false : true end def initialize_vector source, opts index, source = parse_source(source, opts) set_name opts[:name] @data = cast_vector_to(opts[:dtype] || :array, source, opts[:nm_dtype]) @index = Index.coerce(index || @data.size) guard_sizes! @possibly_changed_type = true end def parse_source source, opts if source.is_a?(Hash) [source.keys, source.values] else [opts[:index], source || []] end end def guard_sizes! if @index.size > @data.size cast(dtype: :array) # NM with nils seg faults @data.fill(nil, @data.size...@index.size) elsif @index.size < @data.size raise IndexError, "Expected index size >= vector size. Index size : #{@index.size}, vector size : #{@data.size}" end end def guard_type_check value @possibly_changed_type = true \ if object? && (value.nil? || value.is_a?(Numeric)) || numeric? && !value.is_a?(Numeric) && !value.nil? end def split_value key, v case when v.nil? then nil when v.include?(key) then 1 else 0 end end # For an array or hash of estimators methods, returns # an array with three elements # 1.- A hash with estimators names as keys and lambdas as values # 2.- An array with estimators names # 3.- A Hash with estimators names as keys and empty arrays as values def prepare_bootstrap(estimators) h_est = estimators h_est = [h_est] unless h_est.is_a?(Array) || h_est.is_a?(Hash) if h_est.is_a? Array h_est = h_est.map do |est| [est, ->(v) { Daru::Vector.new(v).send(est) }] end.to_h end bss = h_est.keys.map { |v| [v, []] }.to_h [h_est, h_est.keys, bss] end # Note: To maintain sanity, this _MUST_ be the _ONLY_ place in daru where the # @param dtype [db_type] variable is set and the underlying data type of vector changed. def cast_vector_to dtype, source=nil, nm_dtype=nil source = @data.to_a if source.nil? 
new_vector = case dtype when :array then Daru::Accessors::ArrayWrapper.new(source, self) when :nmatrix then Daru::Accessors::NMatrixWrapper.new(source, self, nm_dtype) when :gsl then Daru::Accessors::GSLWrapper.new(source, self) when :mdarray then raise NotImplementedError, 'MDArray not yet supported.' else raise ArgumentError, "Unknown dtype #{dtype}" end @dtype = dtype new_vector end def set_name name # rubocop:disable Style/AccessorMethodName @name = if name.is_a?(Numeric) then name elsif name.is_a?(Array) then name.join # in case of MultiIndex tuple elsif name then name # anything but Numeric or nil else nil end end # Raises IndexError when one of the positions is an invalid position def validate_positions *positions positions.each do |pos| raise IndexError, "#{pos} is not a valid position." if pos >= size end end # coerce ranges, integers and array in appropriate ways def coerce_positions *positions if positions.size == 1 case positions.first when Integer positions.first when Range size.times.to_a[positions.first] else raise ArgumentError, 'Unkown position type.' end else positions end end # Helper method for []=. # Assigs existing index to another value def modify_vector(indexes, val) positions = @index.pos(*indexes) if positions.is_a? Numeric @data[positions] = val else positions.each { |pos| @data[pos] = val } end end # Helper method for []=. # Add a new index and assign it value def insert_vector(indexes, val) new_index = @index.add(*indexes) # May be create += (new_index.size - @index.size).times { @data << val } @index = new_index end # Works similar to #[]= but also insert the vector in case index is not valid # It is there only to be accessed by Daru::DataFrame and not meant for user. def set indexes, val cast(dtype: :array) if val.nil? 
&& dtype != :array guard_type_check(val) if @index.valid?(*indexes) modify_vector(indexes, val) else insert_vector(indexes, val) end update_position_cache end def cut_find_category partitions, val, close_at case close_at when :right right_index = partitions.index { |i| i > val } raise ArgumentError, 'Invalid partition' if right_index.nil? left_index = right_index - 1 "#{partitions[left_index]}-#{partitions[right_index]-1}" when :left right_index = partitions.index { |i| i >= val } raise ArgumentError, 'Invalid partition' if right_index.nil? left_index = right_index - 1 "#{partitions[left_index]+1}-#{partitions[right_index]}" else raise ArgumentError, "Invalid parameter #{close_at} to close_at." end end def cut_categories partitions, close_at case close_at when :right Array.new(partitions.size-1) do |left_index| "#{partitions[left_index]}-#{partitions[left_index+1]-1}" end when :left Array.new(partitions.size-1) do |left_index| "#{partitions[left_index]+1}-#{partitions[left_index+1]}" end end end def include_with_nan? array, value # Returns true if value is included in array. # Similar to include? but also works if value is Float::NAN if value.respond_to?(:nan?) && value.nan? array.any? { |i| i.respond_to?(:nan?) && i.nan? } else array.include? value end end def update_position_cache @nil_positions = nil @nan_positions = nil end def resort_index vector_index, opts if block_given? vector_index.sort { |(lv, _li), (rv, _ri)| yield(lv, rv) } else vector_index.sort(&DEFAULT_SORTER) end .tap { |res| res.reverse! unless opts[:ascending] } end end end