require_relative 'dataframe_by_row.rb' require_relative 'dataframe_by_vector.rb' require_relative 'io.rb' module Daru class DataFrame class << self def from_csv path, opts={}, &block Daru::IO.from_csv path, opts, &block end end attr_reader :vectors attr_reader :index attr_reader :name attr_reader :size # DataFrame basically consists of an Array of Vector objects. # These objects are indexed by row and column by vectors and index Index objects. # Arguments - source, vectors, index, name in that order. Last 3 are optional. def initialize source, *args vectors = args.shift index = args.shift @name = args.shift || SecureRandom.uuid @data = [] if source.empty? @vectors = Daru::Index.new vectors @index = Daru::Index.new index create_empty_vectors else case source when Array if vectors.nil? @vectors = Daru::Index.new source[0].keys.map(&:to_sym) else @vectors = Daru::Index.new (vectors + (source[0].keys - vectors)).uniq.map(&:to_sym) end if index.nil? @index = Daru::Index.new source.size else @index = Daru::Index.new index end @vectors.each do |name| v = [] source.each do |hsh| v << (hsh[name] || hsh[name.to_s]) end @data << v.dv(name, @index) end when Hash create_vectors_index_with vectors, source if all_daru_vectors_in_source? source if !index.nil? @index = index.to_index elsif all_vectors_have_equal_indexes? source @index = source.values[0].index.dup else all_indexes = [] source.each_value do |vector| all_indexes << vector.index.to_a end # sort only if missing indexes detected all_indexes.flatten!.uniq!.sort! @index = Daru::Index.new all_indexes end @vectors.each do |vector| @data << Daru::Vector.new(vector, [], @index) @index.each do |idx| begin @data[@vectors[vector]][idx] = source[vector][idx] rescue IndexError # If the index is not present in the vector under consideration # (in source) then an error is raised. Put a nil in that place if # that is the case. @data[@vectors[vector]][idx] = nil end end end else index = source.values[0].size if index.nil? if index.is_a?(Daru::Index) @index = index.to_index else @index = Daru::Index.new index end @vectors.each do |name| @data << source[name].dup.dv(name, @index) end end end end set_size validate end def [](*names, axis) if axis == :vector access_vector *names elsif axis == :row access_row *names else raise IndexError, "Expected axis to be row or vector not #{axis}" end end def []=(name, axis ,vector) if axis == :vector insert_or_modify_vector name, vector elsif axis == :row insert_or_modify_row name, vector else raise IndexError, "Expected axis to be row or vector, not #{axis}." end end def vector Daru::DataFrameByVector.new(self) end def row Daru::DataFrameByRow.new(self) end def dup src = {} @vectors.each do |vector| src[vector] = @data[@vectors[vector]] end Daru::DataFrame.new src, @vectors.dup, @index.dup, @name end def each_vector(&block) @data.each(&block) self end def each_vector_with_index(&block) @vectors.each do |vector| yield @data[@vectors[vector]], vector end self end def each_row(&block) @index.each do |index| yield access_row(index) end self end def each_row_with_index(&block) @index.each do |index| yield access_row(index), index end self end def map_vectors(&block) df = self.dup df.each_vector_with_index do |vector, name| df[name, :vector] = yield(vector) end df end def map_vectors_with_index(&block) df = self.dup df.each_vector_with_index do |vector, name| df[name, :vector] = yield(vector, name) end df end def map_rows(&block) df = self.dup df.each_row_with_index do |row, index| df[index, :row] = yield(row) end df end def map_rows_with_index(&block) df = self.dup df.each_row_with_index do |row, index| df[index, :row] = yield(row, index) end df end def delete_vector vector if @vectors.include? vector @data.delete_at @vectors[vector] @vectors = Daru::Index.new @vectors.to_a - [vector] else raise IndexError, "Vector #{vector} does not exist." end end def delete_row index idx = named_index_for index if @index.include? idx @index = (@index.to_a - [idx]).to_index self.each_vector do |vector| vector.delete_at idx end else raise IndexError, "Index #{index} does not exist." end set_size end def keep_row_if &block @index.each do |index| keep_row = yield access_row(index) delete_row index unless keep_row end end def keep_vector_if &block @vectors.each do |vector| keep_vector = yield @data[@vectors[vector]], vector delete_vector vector unless keep_vector end end def has_vector? name !!@vectors[name] end # Converts the DataFrame into an array of hashes where key is vector name # and value is the corresponding element. # The 0th index of the array contains the array of hashes while the 1th # index contains the indexes of each row of the dataframe. Each element in # the index array corresponds to its row in the array of hashes, which has # the same index. def to_a arry = [[],[]] self.each_row do |row| arry[0] << row.to_hash end arry[1] = @index.to_a arry end def to_json no_index=true if no_index self.to_a[0].to_json else self.to_a.to_json end end def to_html threshold=15 html = '
' @vectors.each { |vector| html += ' | ' + vector.to_s + ' | ' } html += '
---|---|
' + index.to_s + ' | ' self.row[index].each do |element| html += '' + element.to_s + ' | ' end html += '
... | ' } html += '