lib/daru/dataframe.rb in daru-0.1.2 vs lib/daru/dataframe.rb in daru-0.1.3

- old
+ new

@@ -1,16 +1,13 @@ -$:.unshift File.dirname(__FILE__) +require 'daru/accessors/dataframe_by_row.rb' +require 'daru/maths/arithmetic/dataframe.rb' +require 'daru/maths/statistics/dataframe.rb' +require 'daru/plotting/dataframe.rb' +require 'daru/io/io.rb' -require 'accessors/dataframe_by_row.rb' -require 'maths/arithmetic/dataframe.rb' -require 'maths/statistics/dataframe.rb' -require 'plotting/dataframe.rb' -require 'io/io.rb' - module Daru class DataFrame - include Daru::Maths::Arithmetic::DataFrame include Daru::Maths::Statistics::DataFrame include Daru::Plotting::DataFrame if Daru.has_nyaplot? class << self @@ -113,35 +110,34 @@ end # Create DataFrame by specifying rows as an Array of Arrays or Array of # Daru::Vector objects. def rows source, opts={} - df = nil - if source.all? { |v| v.size == source[0].size } - first = source[0] - index = [] - opts[:order] ||= - if first.is_a?(Daru::Vector) # assume that all are Vectors - source.each { |vec| index << vec.name } + first = source.first + + raise SizeError, 'All vectors must have same length' \ + unless source.all? { |v| v.size == first.size } + + index = [] + opts[:order] ||= + case first + when Daru::Vector # assume that all are Vectors + index = source.map(&:name) first.index.to_a - elsif first.is_a?(Array) - Array.new(first.size) { |i| i.to_s } + when Array + Array.new(first.size, &:to_s) end - if source.all? { |s| s.is_a?(Array) } - df = Daru::DataFrame.new(source.transpose, opts) - else # array of Daru::Vectors - df = Daru::DataFrame.new({}, opts) + if source.all? { |s| s.is_a?(Array) } + Daru::DataFrame.new(source.transpose, opts) + else # array of Daru::Vectors + Daru::DataFrame.new({}, opts).tap do |df| source.each_with_index do |row, idx| - df[(index[idx] || idx), :row] = row + df[index[idx] || idx, :row] = row end end - else - raise SizeError, "All vectors must have same length" end - - df end # Generates a new dataset, using three vectors # - Rows # - Columns @@ -160,22 +156,20 @@ # a 0 1 # b 1 0 # # Useful to process outputs from databases def crosstab_by_assignation rows, columns, values - raise "Three vectors should be equal size" if - rows.size != columns.size or rows.size!=values.size + raise 'Three vectors should be equal size' if + rows.size != columns.size || rows.size!=values.size cols_values = columns.factors cols_n = cols_values.size - h_rows = rows.factors.inject({}) do |a,v| - a[v] = cols_values.inject({}) do |a1,v1| + h_rows = rows.factors.each_with_object({}) do |v, a| + a[v] = cols_values.each_with_object({}) do |v1, a1| a1[v1]=nil - a1 end - a end values.each_index do |i| h_rows[rows[i]][columns[i]] = values[i] end @@ -248,11 +242,11 @@ index = opts[:index] clone = opts[:clone] == false ? false : true @data = [] temp_name = opts[:name] - @name = temp_name || SecureRandom.uuid + @name = temp_name || SecureRandom.uuid if source.empty? @vectors = try_create_index vectors @index = try_create_index index create_empty_vectors @@ -264,44 +258,44 @@ equal order size (#{source.size})" if source.size != vectors.size @index = try_create_index(index || source[0].size) @vectors = try_create_index(vectors) - @vectors.each_with_index do |vec,idx| + @vectors.each_with_index do |_vec,idx| @data << Daru::Vector.new(source[idx], index: @index) end elsif source.all? { |s| s.is_a?(Daru::Vector) } hsh = {} vectors.each_with_index do |name, idx| hsh[name] = source[idx] end initialize(hsh, index: index, order: vectors, name: @name, clone: clone) else # array of hashes - if vectors.nil? - @vectors = Daru::Index.new source[0].keys - else - @vectors = Daru::Index.new( - (vectors + (source[0].keys - vectors)).uniq) - end + @vectors = + if vectors.nil? + Daru::Index.new source[0].keys + else + Daru::Index.new((vectors + (source[0].keys - vectors)).uniq) + end @index = Daru::Index.new(index || source.size) @vectors.each do |name| v = [] - source.each do |hsh| - v << (hsh[name] || hsh[name.to_s]) + source.each do |h| + v << (h[name] || h[name.to_s]) end @data << Daru::Vector.new(v, name: set_name(name), index: @index) end end when Hash create_vectors_index_with vectors, source if all_daru_vectors_in_source? source + vectors_have_same_index = all_vectors_have_equal_indexes?(source) if !index.nil? @index = try_create_index index - elsif all_vectors_have_equal_indexes?(source) - vectors_have_same_index = true + elsif vectors_have_same_index @index = source.values[0].index.dup else all_indexes = [] source.each_value do |vector| all_indexes << vector.index.to_a @@ -318,18 +312,14 @@ # avoids matching indexes of vectors if all the supplied vectors # have the same index. if vectors_have_same_index v = source[vector].dup else - v = Daru::Vector.new([], name: vector, index: @index) + v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index) @index.each do |idx| - if source[vector].index.include? idx - v[idx] = source[vector][idx] - else - v[idx] = nil - end + v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil end end @data << v end else @@ -337,42 +327,42 @@ end else @index = try_create_index(index || source.values[0].size) @vectors.each do |name| - @data << Daru::Vector.new(source[name].dup, name: set_name(name), index: @index) + meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {} + @data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index) end end end end set_size validate update end - def vector *args - $stderr.puts "#vector has been deprecated in favour of #[]. Please use that." + def vector(*) + $stderr.puts '#vector has been deprecated in favour of #[]. Please use that.' self[*names] end # Access row or vector. Specify name of row/vector followed by axis(:row, :vector). # Defaults to *:vector*. Use of this method is not recommended for accessing - # rows or vectors. Use df.row[:a] for accessing row with index ':a' or - # df.vector[:vec] for accessing vector with index *:vec*. + # rows. Use df.row[:a] for accessing row with index ':a'. def [](*names) - if names[-1] == :vector or names[-1] == :row + if names[-1] == :vector || names[-1] == :row axis = names[-1] names = names[0..-2] else axis = :vector end if axis == :vector - access_vector *names + access_vector(*names) elsif axis == :row - access_row *names + access_row(*names) else raise IndexError, "Expected axis to be row or vector not #{axis}" end end @@ -431,11 +421,11 @@ def dup vectors_to_dup=nil vectors_to_dup = @vectors.to_a unless vectors_to_dup src = [] vectors_to_dup.each do |vec| - src << @data[@vectors[vec]].to_a.dup + src << @data[@vectors[vec]].dup end new_order = Daru::Index.new(vectors_to_dup) Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true end @@ -452,15 +442,14 @@ # # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return # a view of the whole data frame otherwise. def clone *vectors_to_clone vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) } - return super if vectors_to_clone.empty? + vectors_to_clone = @vectors.to_a if vectors_to_clone.empty? - h = vectors_to_clone.inject({}) do |hsh, vec| + h = vectors_to_clone.each_with_object({}) do |vec, hsh| hsh[vec] = self[vec] - hsh end Daru::DataFrame.new(h, clone: false) end # Returns a 'shallow' copy of DataFrame if missing data is not present, @@ -474,13 +463,12 @@ end # Creates a new duplicate dataframe containing only rows # without a single missing value. def dup_only_valid vecs=nil - rows_with_nil = @data.inject([]) do |memo, vector| + rows_with_nil = @data.each_with_object([]) do |vector, memo| memo.concat vector.missing_positions - memo end.uniq row_indexes = @index.to_a (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)] end @@ -503,11 +491,11 @@ end alias_method :each_column, :each_vector # Iterate over each vector alongwith the name of the vector - def each_vector_with_index(&block) + def each_vector_with_index return to_enum(:each_vector_with_index) unless block_given? @vectors.each do |vector| yield @data[@vectors[vector]], vector end @@ -516,21 +504,21 @@ end alias_method :each_column_with_index, :each_vector_with_index # Iterate over each row - def each_row(&block) + def each_row return to_enum(:each_row) unless block_given? @index.each do |index| yield access_row(index) end self end - def each_row_with_index(&block) + def each_row_with_index return to_enum(:each_row_with_index) unless block_given? @index.each do |index| yield access_row(index), index end @@ -550,11 +538,11 @@ # == Arguments # # * +axis+ - The axis to iterate over. Can be :vector (or :column) # or :row. Default to :vector. def each axis=:vector, &block - if axis == :vector or axis == :column + if axis == :vector || axis == :column each_vector(&block) elsif axis == :row each_row(&block) else raise ArgumentError, "Unknown axis #{axis}" @@ -575,11 +563,11 @@ # == Arguments # # * +axis+ - The axis to iterate over. Can be :vector (or :column) # or :row. Default to :vector. def collect axis=:vector, &block - if axis == :vector or axis == :column + if axis == :vector || axis == :column collect_vectors(&block) elsif axis == :row collect_rows(&block) else raise ArgumentError, "Unknown axis #{axis}" @@ -601,11 +589,11 @@ # == Arguments # # * +axis+ - The axis to map over. Can be :vector (or :column) or :row. # Default to :vector. def map axis=:vector, &block - if axis == :vector or axis == :column + if axis == :vector || axis == :column map_vectors(&block) elsif axis == :row map_rows(&block) else raise ArgumentError, "Unknown axis #{axis}" @@ -619,11 +607,11 @@ # == Arguments # # * +axis+ - The axis to map over. Can be :vector (or :column) or :row. # Default to :vector. def map! axis=:vector, &block - if axis == :vector or axis == :column + if axis == :vector || axis == :column map_vectors!(&block) elsif axis == :row map_rows!(&block) end end @@ -644,11 +632,11 @@ # == Arguments # # * +axis+ - The axis to map over. Can be :vector (or :column) or :row. # Default to :vector. def recode axis=:vector, &block - if axis == :vector or axis == :column + if axis == :vector || axis == :column recode_vectors(&block) elsif axis == :row recode_rows(&block) end end @@ -680,46 +668,46 @@ # # df.filter(:row) do |row| # row[:a] + row[:d] < 100 # end def filter axis=:vector, &block - if axis == :vector or axis == :column + if axis == :vector || axis == :column filter_vectors(&block) elsif axis == :row filter_rows(&block) end end - def recode_vectors &block + def recode_vectors block_given? or return to_enum(:recode_vectors) - df = self.dup + df = dup df.each_vector_with_index do |v, i| ret = yield v ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}" df[*i] = ret end df end - def recode_rows &block + def recode_rows block_given? or return to_enum(:recode_rows) - df = self.dup + df = dup df.each_row_with_index do |r, i| ret = yield r ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}" df.row[i] = ret end df end # Map each vector and return an Array. - def map_vectors(&block) + def map_vectors return to_enum(:map_vectors) unless block_given? arry = [] @data.each do |vec| arry << yield(vec) @@ -727,11 +715,11 @@ arry end # Destructive form of #map_vectors - def map_vectors!(&block) + def map_vectors! return to_enum(:map_vectors!) unless block_given? vectors.dup.each do |n| v = yield self[n] v.is_a?(Daru::Vector) or raise TypeError, "Must return a Daru::Vector not #{v.class}" @@ -740,11 +728,11 @@ self end # Map vectors alongwith the index. - def map_vectors_with_index(&block) + def map_vectors_with_index return to_enum(:map_vectors_with_index) unless block_given? dt = [] each_vector_with_index do |vector, name| dt << yield(vector, name) @@ -752,58 +740,58 @@ dt end # Map each row - def map_rows(&block) + def map_rows return to_enum(:map_rows) unless block_given? dt = [] each_row do |row| dt << yield(row) end dt end - def map_rows_with_index(&block) + def map_rows_with_index return to_enum(:map_rows_with_index) unless block_given? dt = [] each_row_with_index do |row, index| dt << yield(row, index) end dt end - def map_rows!(&block) + def map_rows! return to_enum(:map_rows!) unless block_given? index.dup.each do |i| - r = yield self.row[i] + r = yield row[i] r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}" - self.row[i] = r + row[i] = r end self end # Retrieves a Daru::Vector, based on the result of calculation # performed on each row. - def collect_rows &block + def collect_rows return to_enum(:collect_rows) unless block_given? data = [] each_row do |row| data.push yield(row) end Daru::Vector.new(data, index: @index) end - def collect_row_with_index &block + def collect_row_with_index return to_enum(:collect_row_with_index) unless block_given? data = [] each_row_with_index do |row, i| data.push yield(row, i) @@ -812,22 +800,22 @@ Daru::Vector.new(data, index: @index) end # Retrives a Daru::Vector, based on the result of calculation # performed on each vector. - def collect_vectors &block + def collect_vectors return to_enum(:collect_vectors) unless block_given? data = [] each_vector do |vec| data.push yield(vec) end Daru::Vector.new(data, index: @vectors) end - def collect_vector_with_index &block + def collect_vector_with_index return to_enum(:collect_vector_with_index) unless block_given? data = [] each_vector_with_index do |vec, i| data.push yield(vec, i) @@ -850,34 +838,35 @@ } Matrix.rows(rows) end - # Delete a vector def delete_vector vector - if @vectors.include? vector - @data.delete_at @vectors[vector] - @vectors = Daru::Index.new @vectors.to_a - [vector] - else - raise IndexError, "Vector #{vector} does not exist." - end + raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector) + @data.delete_at @vectors[vector] + @vectors = Daru::Index.new @vectors.to_a - [vector] + self end + # Deletes a list of vectors + def delete_vectors *vectors + Array(vectors).each { |vec| delete_vector vec } + + self + end + # Delete a row def delete_row index idx = named_index_for index - if @index.include? idx - @index = Daru::Index.new(@index.to_a - [idx]) - self.each_vector do |vector| - vector.delete_at idx - end - else - raise IndexError, "Index #{index} does not exist." + raise IndexError, "Index #{index} does not exist." unless @index.include? idx + @index = Daru::Index.new(@index.to_a - [idx]) + each_vector do |vector| + vector.delete_at idx end set_size end @@ -893,11 +882,11 @@ end ds_boot.update ds_boot end - def keep_row_if &block + def keep_row_if deletion = [] @index.each do |index| keep_row = yield access_row(index) @@ -906,11 +895,11 @@ deletion.each { |idx| delete_row idx } end - def keep_vector_if &block + def keep_vector_if @vectors.each do |vector| keep_vector = yield @data[@vectors[vector]], vector delete_vector vector unless keep_vector end @@ -921,50 +910,40 @@ d = [] each_row do |row| d.push(row[vec]) if yield row end - Daru::Vector.new(d) + Daru::Vector.new(d, metadata: self[vec].metadata.dup) end # Iterates over each row and retains it in a new DataFrame if the block returns # true for that row. - def filter_rows &block + def filter_rows return to_enum(:filter_rows) unless block_given? - df = Daru::DataFrame.new({}, order: @vectors.to_a) - marked = [] + keep_rows = @index.map { |index| yield access_row(index) } - @index.each do |index| - keep_row = yield access_row(index) - marked << index if keep_row - end - - marked.each do |idx| - df.row[idx] = self[idx, :row] - end - - df + where keep_rows end # Iterates over each vector and retains it in a new DataFrame if the block returns # true for that vector. def filter_vectors &block return to_enum(:filter_vectors) unless block_given? - df = self.dup - df.keep_vector_if &block + df = dup + df.keep_vector_if(&block) df end # Test each row with one or more tests. Each test is a Proc with the form # *Proc.new {|row| row[:age] > 0}* # # The function returns an array with all errors. def verify(*tests) - if(tests[0].is_a? Symbol) + if tests[0].is_a? Symbol id = tests[0] tests.shift else id = @vectors.first end @@ -972,17 +951,16 @@ vr = [] i = 0 each(:row) do |row| i += 1 tests.each do |test| - if !test[2].call(row) - values = "" - if test[1].size>0 - values = " (" + test[1].collect{ |k| "#{k}=#{row[k]}" }.join(", ") + ")" - end - vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}") + next if test[2].call(row) + values = '' + unless test[1].empty? + values = ' (' + test[1].collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')' end + vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}") end end vr end @@ -1049,11 +1027,11 @@ # TODO: remove next version alias :vector_missing_values :missing_values_rows def has_missing_data? - !!@data.any? { |v| v.has_missing_data? } + !!@data.any?(&:has_missing_data?) end alias :flawed? :has_missing_data? # Return a nested hash using vector names as keys and an array constructed of @@ -1073,23 +1051,23 @@ current = current[root] end name = row[tree_keys.last] if !block current[name] ||= [] - current[name].push(row.to_hash.delete_if { |key,value| tree_keys.include? key}) + current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key }) else - current[name] = block.call(row, current,name) + current[name] = yield(row, current, name) end end out end def vector_count_characters vecs=nil vecs ||= @vectors.to_a - collect_row_with_index do |row, i| + collect_rows do |row| vecs.inject(0) do |memo, vec| memo + (row[vec].nil? ? 0 : row[vec].to_s.size) end end end @@ -1127,11 +1105,11 @@ # df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']}) # df.any?(:row) do |row| # row[:a] < 3 and row[:b] == 'b' # end #=> true def any? axis=:vector, &block - if axis == :vector or axis == :column + if axis == :vector || axis == :column @data.any?(&block) elsif axis == :row each_row do |row| return true if yield(row) end @@ -1149,11 +1127,11 @@ # df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']}) # df.all?(:row) do |row| # row[:a] < 10 # end #=> true def all? axis=:vector, &block - if axis == :vector or axis == :column + if axis == :vector || axis == :column @data.all?(&block) elsif axis == :row each_row do |row| return false unless yield(row) end @@ -1234,50 +1212,56 @@ # # ["foo", "one", 3]=>[6], # # ["foo", "three", 8]=>[7], # # ["foo", "two", 3]=>[2, 4]} def group_by *vectors vectors.flatten! - vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless - has_vector?(v) } + vectors.each { |v| + raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v) + } Daru::Core::GroupBy.new(self, vectors) end def reindex_vectors new_vectors - raise ArgumentError, "Must pass the new index of type Index or its "\ - "subclasses, not #{new_index.class}" unless new_vectors.kind_of?(Daru::Index) + raise ArgumentError, 'Must pass the new index of type Index or its '\ + "subclasses, not #{new_index.class}" unless new_vectors.is_a?(Daru::Index) cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name) new_vectors.each do |vec| - if @vectors.include?(vec) - cl[vec] = self[vec] - else - cl[vec] = [nil]*nrows - end + cl[vec] = @vectors.include?(vec) ? self[vec] : cl[vec] = [nil]*nrows end cl end # Concatenate another DataFrame along corresponding columns. - # Very premature implementation. Use with caution. + # If columns do not exist in both dataframes, they are filled with nils def concat other_df - vectors = [] - @vectors.each do |v| - vectors << self[v].to_a.dup.concat(other_df[v].to_a) + vectors = @vectors.to_a + data = [] + + vectors.each do |v| + other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size + data << self[v].dup.to_a.concat(other_vec) end - Daru::DataFrame.new(vectors, order: @vectors) + other_df.vectors.each do |v| + next if vectors.include?(v) + vectors << v + data << ([nil] * size).concat(other_df[v].to_a) + end + + Daru::DataFrame.new(data, order: vectors) end # Set a particular column as the new DF def set_index new_index, opts={} - raise ArgumentError, "All elements in new index must be unique." if + raise ArgumentError, 'All elements in new index must be unique.' if @size != self[new_index].uniq.size self.index = Daru::Index.new(self[new_index].to_a) - self.delete_vector(new_index) unless opts[:keep] + delete_vector(new_index) unless opts[:keep] self end # Change the index of the DataFrame and preserve the labels of the previous @@ -1301,20 +1285,16 @@ # # b 2 22 # # 0 nil nil # # a 1 11 # # g nil nil def reindex new_index - raise ArgumentError, "Must pass the new index of type Index or its "\ - "subclasses, not #{new_index.class}" unless new_index.kind_of?(Daru::Index) + raise ArgumentError, 'Must pass the new index of type Index or its '\ + "subclasses, not #{new_index.class}" unless new_index.is_a?(Daru::Index) cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name) new_index.each do |idx| - if @index.include?(idx) - cl.row[idx] = self.row[idx] - else - cl.row[idx] = [nil]*ncols - end + cl.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols end cl end @@ -1328,11 +1308,11 @@ # # df.index = Daru::Index.new(['a','b','c','d']) # df.index.to_a #=> ['a','b','c','d'] # df.row['a'].to_a #=> [1,11] def index= idx - @data.each { |vec| vec.index = idx} + @data.each { |vec| vec.index = idx } @index = idx self end @@ -1345,114 +1325,214 @@ # df.vectors.to_a #=> [:a, :b, :c] # # df.vectors = Daru::Index.new([:foo, :bar, :baz]) # df.vectors.to_a #=> [:foo, :bar, :baz] def vectors= idx - raise ArgumentError, "Can only reindex with Index and its subclasses" unless - index.kind_of?(Daru::Index) + raise ArgumentError, 'Can only reindex with Index and its subclasses' unless + index.is_a?(Daru::Index) raise ArgumentError, "Specified index length #{idx.size} not equal to"\ "dataframe size #{ncols}" if idx.size != ncols @vectors = idx self end + # Renames the vectors + # + # == Arguments + # + # * name_map - A hash where the keys are the exising vector names and + # the values are the new names. If a vector is renamed + # to a vector name that is already in use, the existing + # one is overwritten. + # + # == Usage + # + # df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] }) + # df.rename_vectors :a => :alpha, :c => :gamma + # df.vectors.to_a #=> [:alpha, :b, :gamma] + def rename_vectors name_map + existing_targets = name_map.select { |k,v| k != v }.values & vectors.to_a + delete_vectors(*existing_targets) + + new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v } + self.vectors = Daru::Index.new new_names + end + # Return the indexes of all the numeric vectors. Will include vectors with nils # alongwith numbers. def numeric_vectors numerics = [] each_vector_with_index do |vec, i| - numerics << i if(vec.type == :numeric) + numerics << i if vec.type == :numeric end numerics end def numeric_vector_names numerics = [] @vectors.each do |v| - numerics << v if (self[v].type == :numeric) + numerics << v if self[v].type == :numeric end numerics end # Return a DataFrame of only the numerical Vectors. If clone: false # is specified as option, only a *view* of the Vectors will be # returned. Defaults to clone: true. def only_numerics opts={} cln = opts[:clone] == false ? false : true nv = numeric_vectors - arry = nv.inject([]) do |arr, v| + arry = nv.each_with_object([]) do |v, arr| arr << self[v] - arr end order = Index.new(nv) Daru::DataFrame.new(arry, clone: cln, order: order, index: @index) end # Generate a summary of this DataFrame with ReportBuilder. - def summary(method = :to_text) + def summary(method=:to_text) ReportBuilder.new(no_title: true).add(self).send(method) end def report_building(b) # :nodoc: # - b.section(:name=>@name) do |g| + b.section(name: @name) do |g| g.text "Number of rows: #{nrows}" @vectors.each do |v| g.text "Element:[#{v}]" g.parse_element(self[v]) end end end - # Sorts a dataframe (ascending/descending)according to the given sequence of - # vectors, using the attributes provided in the blocks. + # Sorts a dataframe (ascending/descending) in the given pripority sequence of + # vectors, with or without a block. # # @param order [Array] The order of vector names in which the DataFrame # should be sorted. # @param [Hash] opts The options to sort with. # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending # or descending order. Specify Array corresponding to *order* for multiple # sort orders. - # @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects to + # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to # to be used for sorting, for each vector name in *order* as a hash of - # vector name and lambda pairs. In case a lambda for a vector is not + # vector name and lambda expressions. In case a lambda for a vector is not # specified, the default will be used. + # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils + # automatically or not when a block is provided. + # If set to True, nils will appear at top after sorting. # - # == Usage + # @example Sort a dataframe with a vector sequence. # - # df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]}) # - # #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4> - # # a b - # # 0 -3 4 - # # 1 2 3 - # # 2 -1 2 - # # 3 4 1 - # df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } }) + # df = Daru::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]}) + # + # df.sort [:a, :b] + # # => + # # <Daru::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5> + # # a b + # # 2 1 3 + # # 0 1 5 + # # 3 2 2 + # # 1 2 4 + # # 4 3 1 + # + # @example Sort a dataframe without a block. Here nils will be handled automatically. + # + # df = Daru::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]}) + # + # df.sort([:a]) + # # => + # # <Daru::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5> + # # a b + # # 1 nil 3 + # # 3 nil 1 + # # 0 -3 4 + # # 2 -1 2 + # # 4 5 4 + # + # @example Sort a dataframe with a block with nils handled automatically. + # + # df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] }) + # + # df.sort [:b], by: {b: lambda { |a| a.length } } + # # NoMethodError: undefined method `length' for nil:NilClass + # # from (pry):8:in `block in __pry__' + # + # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true + # + # # => + # # <Daru::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6> + # # a b + # # 2 1 nil + # # 5 1 nil + # # 4 -1 x + # # 1 -1 aa + # # 0 nil aaa + # # 3 nil baaa + # + # @example Sort a dataframe with a block with nils handled manually. + # + # df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] }) + # + # # To print nils at the bottom one can use lambda { |a| (a.nil?)[1]:[0,a.length] } + # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true + # + # # => + # #<Daru::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6> + # # a b + # # 4 -1 x + # # 1 -1 aa + # # 0 nil aaa + # # 3 nil baaa + # # 2 1 nil + # # 5 1 nil + def sort! vector_order, opts={} - raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1 + raise ArgumentError, 'Required atleast one vector name' if vector_order.empty? opts = { ascending: true, - type: :quick_sort, + handle_nils: false, by: {} }.merge(opts) - opts[:by] = create_logic_blocks vector_order, opts[:by] opts[:ascending] = sort_order_array vector_order, opts[:ascending] - idx = @index.to_a - send(opts[:type], vector_order, idx, opts[:by], opts[:ascending]) - self.index = Daru::Index.new(idx) + opts[:handle_nils] = handle_nils_array vector_order, opts[:handle_nils] + blocks = create_logic_blocks vector_order, opts[:by], opts[:ascending] + block = lambda do |r1, r2| + # Build left and right array to compare two rows + left = build_array_from_blocks vector_order, opts, blocks, r1, r2 + right = build_array_from_blocks vector_order, opts, blocks, r2, r1 + + # Resolve conflict by Index if all attributes are same + left << r1 + right << r2 + left <=> right + end + + idx = (0..@index.size-1).sort(&block) + + old_index = @index.to_a + self.index = Daru::Index.new(idx.map { |i| old_index[i] }) + + vectors.each do |v| + @data[@vectors[v]] = Daru::Vector.new( + idx.map { |i| @data[@vectors[v]].data[i] }, + name: self[v].name, metadata: self[v].metadata.dup, index: index + ) + end + self end # Non-destructive version of #sort! def sort vector_order, opts={} - self.dup.sort! vector_order, opts + dup.sort! vector_order, opts end # Pivots a data frame on specified vectors and applies an aggregate function # to quickly generate a summary. # @@ -1487,29 +1567,31 @@ # # [:e, :one] [:e, :two] # # [:bar] 18 26 # # [:foo] 10 12 def pivot_table opts={} raise ArgumentError, - "Specify grouping index" if !opts[:index] or opts[:index].empty? + 'Specify grouping index' if !opts[:index] || opts[:index].empty? index = opts[:index] vectors = opts[:vectors] || [] aggregate_function = opts[:agg] || :mean values = - if opts[:values].is_a?(Symbol) - [opts[:values]] - elsif opts[:values].is_a?(Array) - opts[:values] - else # nil - (@vectors.to_a - (index | vectors)) & numeric_vector_names - end + if opts[:values].is_a?(Symbol) + [opts[:values]] + elsif opts[:values].is_a?(Array) + opts[:values] + else # nil + (@vectors.to_a - (index | vectors)) & numeric_vector_names + end - raise IndexError, "No numeric vectors to aggregate" if values.empty? + raise IndexError, 'No numeric vectors to aggregate' if values.empty? - grouped = group_by(index) + grouped = group_by(index) - unless vectors.empty? + if vectors.empty? + grouped.send(aggregate_function) + else super_hash = {} values.each do |value| grouped.groups.each do |group_name, row_numbers| super_hash[group_name] ||= {} @@ -1546,12 +1628,10 @@ # pivoted_dataframe[symbolize(vector_index)][symbolize(row_index)] = val pivoted_dataframe[vector_index][row_index] = val end end return pivoted_dataframe - else - grouped.send(aggregate_function) end end # Merge vectors from two DataFrames. In case of name collision, # the vectors names are changed to x_1, x_2 .... @@ -1559,12 +1639,12 @@ # @return {Daru::DataFrame} def merge other_df raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows new_fields = (@vectors.to_a + other_df.vectors.to_a) - .recode_repeated - .map(&:to_sym) + .recode_repeated + .map(&:to_sym) df_new = DataFrame.new({}, order: new_fields) (0...nrows).to_a.each do |i| row = self.row[i].to_a + other_df.row[i].to_a df_new.add_row(row) @@ -1601,11 +1681,10 @@ # # 1 3 Ninja 4 def join(other_df,opts={}) Daru::Core::Merge.join(self, other_df, opts) end - # Creates a new dataset for one to many relations # on a dataset, based on pattern of field names. # # for example, you have a survey for number of children # with this structure: @@ -1630,46 +1709,45 @@ # # ["green", "2", 15], # # ["orange", "2", 30], # # ["white", "2", 20] # # ] def one_to_many(parent_fields, pattern) - re = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)") + re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)') ds_vars = parent_fields.dup vars = [] max_n = 0 - h = parent_fields.inject({}) { |a,v| + h = parent_fields.each_with_object({}) { |v, a| a[v] = Daru::Vector.new([]) - a } # Adding _row_id h['_col_id'] = Daru::Vector.new([]) ds_vars.push('_col_id') @vectors.each do |f| - if f =~ re - if !vars.include? $1 - vars.push($1) - h[$1] = Daru::Vector.new([]) - end - max_n = $2.to_i if max_n < $2.to_i + next unless f =~ re + unless vars.include? $1 + vars.push($1) + h[$1] = Daru::Vector.new([]) end + + max_n = $2.to_i if max_n < $2.to_i end ds = DataFrame.new(h, order: ds_vars+vars) each_row do |row| row_out = {} parent_fields.each do |f| row_out[f] = row[f] end max_n.times do |n1| - n = n1+1 + n = n1+1 any_data = false vars.each do |v| - data = row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)] + data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)] row_out[v] = data - any_data = true if !data.nil? + any_data = true unless data.nil? end if any_data row_out['_col_id'] = n ds.add_row(row_out) @@ -1683,11 +1761,11 @@ def add_vectors_by_split_recode(name_, join='-', sep=Daru::SPLIT_TOKEN) split = self[name_].split_by_separator(sep) i = 1 split.each { |k,v| new_field = name_.to_s + join + i.to_s - v.rename name_.to_s + ":" + k.to_s + v.rename name_.to_s + ':' + k.to_s self[new_field.to_sym] = v i += 1 } end @@ -1705,15 +1783,15 @@ # :name => Daru::Vector.new(%w{Alex Peter Susan Mary John}) # }) # ds.create_sql('names') # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;" # - def create_sql(table,charset="UTF8") + def create_sql(table,charset='UTF8') sql = "CREATE TABLE #{table} (" - fields = self.vectors.to_a.collect do |f| + fields = vectors.to_a.collect do |f| v = self[f] - f.to_s + " " + v.db_type + f.to_s + ' ' + v.db_type end sql + fields.join(",\n ")+") CHARACTER SET=#{charset};" end @@ -1722,18 +1800,18 @@ numerics_as_arrays = [] numeric_vectors.each do |n| numerics_as_arrays << self[n].to_a end - GSL::Matrix.alloc *numerics_as_arrays.transpose + GSL::Matrix.alloc(*numerics_as_arrays.transpose) end # Convert all vectors of type *:numeric* into a Matrix. def to_matrix numerics_as_arrays = [] each_vector do |vector| - numerics_as_arrays << vector.to_a if(vector.type == :numeric) + numerics_as_arrays << vector.to_a if vector.type == :numeric end Matrix.columns numerics_as_arrays end @@ -1744,12 +1822,12 @@ # Convert all vectors of type *:numeric* and not containing nils into an NMatrix. def to_nmatrix numerics_as_arrays = [] each_vector do |vector| - numerics_as_arrays << vector.to_a if(vector.type == :numeric and - vector.missing_positions.size == 0) + numerics_as_arrays << vector.to_a if vector.type == :numeric && + vector.missing_positions.empty? end numerics_as_arrays.transpose.to_nm end @@ -1758,75 +1836,75 @@ # the array of hashes while the 1th index contains the indexes of each row # of the dataframe. Each element in the index array corresponds to its row # in the array of hashes, which has the same index. def to_a arry = [[],[]] - self.each_row do |row| - arry[0] << row.to_hash + each_row do |row| + arry[0] << row.to_h end arry[1] = @index.to_a arry end # Convert to json. If no_index is false then the index will NOT be included # in the JSON thus created. def to_json no_index=true if no_index - self.to_a[0].to_json + to_a[0].to_json else - self.to_a.to_json + to_a.to_json end end - # Converts DataFrame to a hash with keys as vector names and values as + # Converts DataFrame to a hash (explicit) with keys as vector names and values as # the corresponding vectors. - def to_hash + def to_h hsh = {} @vectors.each_with_index do |vec_name, idx| hsh[vec_name] = @data[idx] end hsh end # Convert to html for IRuby. def to_html threshold=30 - html = "<table>" + - "<tr>" + - "<th colspan=\"#{@vectors.size+1}\">" + - "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}" - "</th>" + - "</tr>" + html = '<table>' \ + '<tr>' \ + "<th colspan=\"#{@vectors.size+1}\">" \ + "Daru::DataFrame:#{object_id} " + " rows: #{nrows} " + " cols: #{ncols}" \ + '</th>' \ + '</tr>' html +='<tr><th></th>' @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' } html += '</tr>' @index.each_with_index do |index, num| html += '<tr>' html += '<td>' + index.to_s + '</td>' - self.row[index].each do |element| + row[index].each do |element| html += '<td>' + element.to_s + '</td>' end html += '</tr>' - if num > threshold - html += '<tr>' - (@vectors.size + 1).times { html += '<td>...</td>' } - html += '</tr>' + next if num <= threshold - last_index = @index.to_a.last - last_row = self.row[last_index] - html += '<tr>' - html += "<td>" + last_index.to_s + "</td>" - (0..(ncols - 1)).to_a.each do |i| - html += '<td>' + last_row[i].to_s + '</td>' - end - html += '</tr>' - break + html += '<tr>' + (@vectors.size + 1).times { html += '<td>...</td>' } + html += '</tr>' + + last_index = @index.to_a.last + last_row = row[last_index] + html += '<tr>' + html += '<td>' + last_index.to_s + '</td>' + (0..(ncols - 1)).to_a.each do |i| + html += '<td>' + last_row[i].to_s + '</td>' end + html += '</tr>' + break end html += '</table>' html end @@ -1839,11 +1917,11 @@ # after assingment/deletion etc. are complete. This is provided so that # time is not wasted in creating the metadata for the vector each time # assignment/deletion of elements is done. Updating data this way is called # lazy loading. To set or unset lazy loading, see the .lazy_update= method. def update - @data.each { |v| v.update } if Daru.lazy_update + @data.each(&:update) if Daru.lazy_update end # Rename the DataFrame. def rename new_name @name = new_name @@ -1888,23 +1966,22 @@ # ds.write_sql(dbh,"test") def write_sql dbh, table Daru::IO.dataframe_write_sql self, dbh, table end - # Use marshalling to save dataframe to a file. def save filename Daru::IO.save self, filename end - def _dump depth - Marshal.dump({ + def _dump(_depth) + Marshal.dump( data: @data, index: @index.to_a, order: @vectors.to_a, name: @name - }) + ) end def self._load data h = Marshal.load data Daru::DataFrame.new(h[:data], @@ -1937,33 +2014,33 @@ # Pretty print in a nice table format for the command line (irb/pry/iruby) def inspect spacing=10, threshold=15 longest = [@name.to_s.size, (@vectors.map(&:to_s).map(&:size).max || 0), (@index .map(&:to_s).map(&:size).max || 0), - (@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max + (@data .map { |v| v.map(&:to_s).map(&:size).max }.max || 0)].max name = @name || 'nil' - content = "" + content = '' longest = spacing if longest > spacing formatter = "\n" (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " } - content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " + - name.to_s + " @size = " + @size.to_s + ">" - content += sprintf formatter, "" , *@vectors.map(&:to_s) + content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' + + name.to_s + ' @size = ' + @size.to_s + '>' + content += formatter % ['', *@vectors.map(&:to_s)] row_num = 1 - self.each_row_with_index do |row, index| - content += sprintf formatter, index.to_s, *row.to_hash.values.map { |e| (e || 'nil').to_s } + each_row_with_index do |row, index| + content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }] row_num += 1 - if row_num > threshold - dots = [] + next if row_num <= threshold - (@vectors.size + 1).times { dots << "..." } - content += sprintf formatter, *dots - break - end + dots = [] + + (@vectors.size + 1).times { dots << '...' } + content += formatter % dots + break end content += "\n" content end @@ -1972,139 +2049,99 @@ def where bool_array Daru::Core::Query.df_where self, bool_array end def == other - self.class == other.class and - @size == other.size and - @index == other.index and - @vectors == other.vectors and - @vectors.to_a.all? { |v| self[v] == other[v] } + self.class == other.class && + @size == other.size && + @index == other.index && + @vectors == other.vectors && + @vectors.to_a.all? { |v| self[v] == other[v] } end def method_missing(name, *args, &block) - if md = name.match(/(.+)\=/) - insert_or_modify_vector name[/(.+)\=/].delete("=").to_sym, args[0] - elsif self.has_vector? name + if name =~ /(.+)\=/ + insert_or_modify_vector name[/(.+)\=/].delete('=').to_sym, args[0] + elsif has_vector? name self[name] else super(name, *args, &block) end end - private + private def possibly_multi_index? index if @index.is_a?(MultiIndex) Daru::MultiIndex.from_tuples(index) else Daru::Index.new(index) end end - def quick_sort vector_order, index, by, ascending - recursive_quick_sort vector_order, index, by, ascending, 0, @size-1 - end - - # == Arguments - # - # vector_order - - # index - - # by - - # ascending - - # left_lower - - # right_upper - - def recursive_quick_sort vector_order, index, by, ascending, left_lower, right_upper - if left_lower < right_upper - left_upper, right_lower = partition(vector_order, index, by, ascending, left_lower, right_upper) - if left_upper - left_lower < right_upper - right_lower - recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper) - recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper) - else - recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper) - recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper) - end + def create_logic_blocks vector_order, _by, ascending + # Create blocks to handle nils + blocks = {} + universal_block_ascending = ->(a) { [a.nil? ? 0 : 1, a] } + universal_block_decending = ->(a) { [a.nil? ? 1 : 0, a] } + vector_order.each_with_index do |vector, i| + blocks[vector] = + if ascending[i] + universal_block_ascending + else + universal_block_decending + end end + + blocks end - def partition vector_order, index, by, ascending, left_lower, right_upper - mindex = (left_lower + right_upper) / 2 - mvalues = vector_order.inject([]) { |a, vector_name| a << self[vector_name][mindex]; a } - i = left_lower - j = right_upper - descending = ascending.map { |a| !a } + def build_array_from_blocks vector_order, opts, blocks, r1, r2 + # Create an array to be used for comparison of two rows in sorting + vector_order.map.each_with_index do |v, i| + value = if opts[:ascending][i] + @data[@vectors[v]].data[r1] + else + @data[@vectors[v]].data[r2] + end - i += 1 while(keep?(i, mvalues, vector_order, ascending , by, 0)) - j -= 1 while(keep?(j, mvalues, vector_order, descending, by, 0)) + if opts[:by][v] && !opts[:handle_nils][i] + # Block given and nils handled manually + value = opts[:by][v].call value - while i < j - 1 - @data.each do |vector| - vector[i], vector[j] = vector[j], vector[i] - end - index[i], index[j] = index[j], index[i] - i += 1 - j -= 1 + elsif opts[:by][v] && opts[:handle_nils][i] + # Block given and nils handled automatically + value = opts[:by][v].call value rescue nil + blocks[v].call value - i += 1 while(keep?(i, mvalues, vector_order, ascending , by,0)) - j -= 1 while(keep?(j, mvalues, vector_order, descending, by,0)) - end - - if i <= j - if i < j - @data.each do |vector| - vector[i], vector[j] = vector[j], vector[i] - end - index[i], index[j] = index[j], index[i] + else + # Block not given and nils handled automatically + blocks[v].call value end - i += 1 - j -= 1 end - - [j,i] end - def keep? current_index, mvalues, vector_order, sort_order, by, vector_order_index - vector_name = vector_order[vector_order_index] - if vector_name - vec = self[vector_name] - eval = by[vector_name].call(vec[current_index], mvalues[vector_order_index]) - - if sort_order[vector_order_index] # sort in ascending order - return false if eval == 1 - return true if eval == -1 - if eval == 0 - keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1) - end - else # sort in descending order - return false if eval == -1 - return true if eval == 1 - if eval == 0 - keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1) - end - end - end - end - - def create_logic_blocks vector_order, by={} - universal_block = lambda { |a,b| a <=> b } - vector_order.each do |vector| - by[vector] ||= universal_block - end - - by - end - def sort_order_array vector_order, ascending - if ascending.is_a?(Array) - raise ArgumentError, "Specify same number of vector names and sort orders" if + if ascending.is_a? Array + raise ArgumentError, 'Specify same number of vector names and sort orders' if vector_order.size != ascending.size return ascending else Array.new(vector_order.size, ascending) end end + def handle_nils_array vector_order, handle_nils + if handle_nils.is_a? Array + raise ArgumentError, 'Specify same number of vector names and handle nils' if + vector_order.size != handle_nils.size + return handle_nils + else + Array.new(vector_order.size, handle_nils) + end + end + def vectors_index_for location if @vectors.include?(location) @vectors[location] elsif location[0].is_a?(Integer) location[0] @@ -2116,63 +2153,58 @@ return dup(@vectors[location]) if location.is_a?(Range) if @vectors.is_a?(MultiIndex) pos = @vectors[names] - if pos.is_a?(Integer) - return @data[pos] - else # MultiIndex - new_vectors = pos.map do |tuple| - @data[@vectors[tuple]] - end + return @data[pos] if pos.is_a?(Integer) - if !location.is_a?(Range) and names.size < @vectors.width - pos = pos.drop_left_level names.size - end + # MultiIndex + new_vectors = pos.map do |tuple| + @data[@vectors[tuple]] + end - Daru::DataFrame.new( - new_vectors, index: @index, order: pos) + if !location.is_a?(Range) && names.size < @vectors.width + pos = pos.drop_left_level names.size end + + Daru::DataFrame.new(new_vectors, index: @index, order: pos) else unless names[1] pos = @vectors[location] - if pos.is_a?(Numeric) - return @data[pos] - else - names = pos - end + return @data[pos] if pos.is_a?(Numeric) + + names = pos end - new_vcs = [] + new_vectors = {} names.each do |name| - new_vcs << @data[@vectors[name]].to_a + new_vectors[name] = @data[@vectors[name]] end order = names.is_a?(Array) ? Daru::Index.new(names) : names - Daru::DataFrame.new(new_vcs, order: order, - index: @index, name: @name) + Daru::DataFrame.new(new_vectors, order: order, + index: @index, name: @name) end end def access_row *names location = names[0] if @index.is_a?(MultiIndex) pos = @index[names] if pos.is_a?(Integer) return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos) - else - new_rows = pos.map { |tuple| populate_row_for(tuple) } + end - if !location.is_a?(Range) and names.size < @index.width - pos = pos.drop_left_level names.size - end + new_rows = pos.map { |tuple| populate_row_for(tuple) } - Daru::DataFrame.rows( - new_rows, order: @vectors, name: @name, index: pos) + if !location.is_a?(Range) && names.size < @index.width + pos = pos.drop_left_level names.size end + + Daru::DataFrame.rows(new_rows, order: @vectors, name: @name, index: pos) else if names[1].nil? names = @index[location] if names.is_a?(Numeric) row = [] @@ -2187,11 +2219,11 @@ rows = [] names.each do |name| rows << self.row[name].to_a end - Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors + Daru::DataFrame.rows rows, index: names,name: @name, order: @vectors end end def populate_row_for pos @data.map do |vector| @@ -2199,88 +2231,93 @@ end end def insert_or_modify_vector name, vector name = name[0] unless @vectors.is_a?(MultiIndex) - v = nil + vec = nil if @index.empty? - v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a) - @index = v.index - assign_or_add_vector name, v + vec = if vector.is_a?(Daru::Vector) + vector + else + Daru::Vector.new(vector.to_a, name: set_name(name)) + end + + @index = vec.index + assign_or_add_vector name, vec set_size @data.map! do |v| - if v.size == 0 - Daru::Vector.new([nil]*@size, name: set_name(name), index: @index) + if v.empty? + Daru::Vector.new([nil]*@size, name: set_name(name), metadata: v.metadata, index: @index) else v end end else if vector.is_a?(Daru::Vector) if vector.index == @index # so that index-by-index assignment is avoided when possible. - v = vector.dup + vec = vector.dup else - v = Daru::Vector.new [], name: set_name(name), index: @index + vec = Daru::Vector.new [], name: set_name(name), metadata: vector.metadata.dup, index: @index @index.each do |idx| - if vector.index.include? idx - v[idx] = vector[idx] - else - v[idx] = nil - end + vec[idx] = vector.index.include?(idx) ? vector[idx] : nil end end else raise SizeError, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if @size != vector.size - v = Daru::Vector.new(vector, name: set_name(name), index: @index) + vec = Daru::Vector.new(vector, name: set_name(name), index: @index) end - assign_or_add_vector name, v + assign_or_add_vector name, vec end end def assign_or_add_vector name, v - #FIXME: fix this jugaad. need to make changes in Indexing itself. - pos = @vectors[name] + # FIXME: fix this jugaad. need to make changes in Indexing itself. + begin + pos = @vectors[name] + rescue IndexError + pos = name + end - if !pos.kind_of?(Daru::Index) and pos == name and - (@vectors.include?(name) or (pos.is_a?(Integer) and pos < @data.size)) + if !pos.is_a?(Daru::Index) && pos == name && + (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size)) @data[pos] = v - elsif pos.kind_of?(Daru::Index) + elsif pos.is_a?(Daru::Index) pos.each do |p| @data[@vectors[p]] = v end else - @vectors = @vectors | [name] if !@vectors.include?(name) + @vectors |= [name] unless @vectors.include?(name) @data[@vectors[name]] = v end end def insert_or_modify_row name, vector if index.is_a?(MultiIndex) # TODO else name = name[0] - v = - if vector.is_a?(Daru::Vector) - vector - else - Daru::Vector.new(vector, name: set_name(name), index: @vectors) - end + vec = + if vector.is_a?(Daru::Vector) + vector + else + Daru::Vector.new(vector, name: set_name(name), index: @vectors) + end if @index.include? name - self.each_vector_with_index do |vector,i| - vector[name] = v.index.include?(i) ? v[i] : nil + each_vector_with_index do |v,i| + v[name] = vec.index.include?(i) ? vec[i] : nil end else - @index = @index | [name] - self.each_vector_with_index do |vector,i| - vector.concat((v.index.include?(i) ? v[i] : nil), name) + @index |= [name] + each_vector_with_index do |v,i| + v.concat((vec.index.include?(i) ? vec[i] : nil), name) end end set_size end @@ -2292,19 +2329,19 @@ end end def validate_labels raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if - @vectors and @vectors.size != @data.size + @vectors && @vectors.size != @data.size - raise IndexError, "Expected number of indexes same as number of rows" if - @index and @data[0] and @index.size != @data[0].size + raise IndexError, 'Expected number of indexes same as number of rows' if + @index && @data[0] && @index.size != @data[0].size end def validate_vector_sizes @data.each do |vector| - raise IndexError, "Expected vectors with equal length" if vector.size != @size + raise IndexError, 'Expected vectors with equal length' if vector.size != @size end end def validate validate_labels @@ -2330,18 +2367,18 @@ raise IndexError, "Specified index #{index} does not exist." end end def create_vectors_index_with vectors, source - vectors = source.keys.sort_by { |a| a.to_s } if vectors.nil? + vectors = source.keys.sort_by(&:to_s) if vectors.nil? @vectors = - unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex) - Daru::Index.new((vectors + (source.keys - vectors)).uniq) - else - vectors - end + if vectors.is_a?(Index) || vectors.is_a?(MultiIndex) + vectors + else + Daru::Index.new((vectors + (source.keys - vectors)).uniq) + end end def all_vectors_have_equal_indexes? source idx = source.values[0].index @@ -2349,27 +2386,27 @@ idx == vector.index end end def try_create_index index - index.kind_of?(Index) ? index : Daru::Index.new(index) + index.is_a?(Index) ? index : Daru::Index.new(index) end - def set_name potential_name + def set_name potential_name # rubocop:disable Style/AccessorMethodName potential_name.is_a?(Array) ? potential_name.join : potential_name end def symbolize arry symbolized_arry = - if arry.all? { |e| e.is_a?(Array) } - arry.map do |sub_arry| - sub_arry.map do |e| - e.is_a?(Numeric) ? e : e.to_sym + if arry.all? { |e| e.is_a?(Array) } + arry.map do |sub_arry| + sub_arry.map do |e| + e.is_a?(Numeric) ? e : e.to_sym + end end + else + arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym } end - else - arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym } - end symbolized_arry end end end