lib/daru/dataframe.rb in daru-0.0.3.1 vs lib/daru/dataframe.rb in daru-0.0.4
- old
+ new
@@ -1,42 +1,96 @@
require_relative 'accessors/dataframe_by_row.rb'
require_relative 'accessors/dataframe_by_vector.rb'
-require_relative 'math/arithmetic/dataframe.rb'
-require_relative 'math/statistics/dataframe.rb'
+require_relative 'maths/arithmetic/dataframe.rb'
+require_relative 'maths/statistics/dataframe.rb'
+require_relative 'plotting/dataframe.rb'
require_relative 'io/io.rb'
module Daru
class DataFrame
- include Daru::Math::Arithmetic::DataFrame
- include Daru::Math::Statistics::DataFrame
+ include Daru::Maths::Arithmetic::DataFrame
+ include Daru::Maths::Statistics::DataFrame
+ include Daru::Plotting::DataFrame
class << self
+ # Load data from a CSV file.
+ # Arguments - path, options, block(optional)
+ #
+ # Accepts a block for pre-conditioning of CSV data if any.
def from_csv path, opts={}, &block
Daru::IO.from_csv path, opts, &block
end
+
+ # Create DataFrame by specifying rows as an Array of Arrays or Array of
+ # Daru::Vector objects.
+ def rows source, opts={}
+ if source.all? { |v| v.size == source[0].size }
+ first = source[0]
+ index = []
+ order =
+ unless opts[:order]
+ if first.is_a?(Daru::Vector) # assume that all are Vectors only
+ source.each { |vec| index << vec.name }
+ first.index.to_a
+ elsif first.is_a?(Array)
+ Array.new(first.size) { |i| i.to_s }
+ end
+ else
+ opts[:order]
+ end
+
+ opts[:order] = order
+ df = Daru::DataFrame.new({}, opts)
+ source.each_with_index do |row,idx|
+ df[(index[idx] || idx), :row] = row
+ end
+ else
+ raise SizeError, "All vectors must have same length"
+ end
+
+ df
+ end
end
+ # The vectors (columns) index of the DataFrame
attr_reader :vectors
+
+ # The index of the rows of the DataFrame
attr_reader :index
+
+ # The name of the DataFrame
attr_reader :name
+
+ # The number of rows present in the DataFrame
attr_reader :size
# DataFrame basically consists of an Array of Vector objects.
# These objects are indexed by row and column by vectors and index Index objects.
- # Arguments - source, vectors, index, name in that order. Last 3 are optional.
+ # Arguments - source, vectors, index, name.
+ #
+ # == Usage
+ # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
+ # index: [:a, :b, :c, :d], name: :spider_man)
+ #
+ # # =>
+ # # <Daru::DataFrame:80766980 @name = spider_man @size = 4>
+ # # b a
+ # # a 6 1
+ # # b 7 2
+ # # c 8 3
+ # # d 9 4
def initialize source, opts={}
- vectors = opts[:vectors]
+ vectors = opts[:order]
index = opts[:index]
+ @dtype = opts[:dtype] || Array
@name = (opts[:name] || SecureRandom.uuid).to_sym
+ @data = []
- @data = []
-
if source.empty?
@vectors = Daru::Index.new vectors
@index = Daru::Index.new index
-
create_empty_vectors
else
case source
when Array
if vectors.nil?
@@ -51,40 +105,35 @@
@index = Daru::Index.new index
end
@vectors.each do |name|
v = []
-
source.each do |hsh|
v << (hsh[name] || hsh[name.to_s])
end
- @data << v.dv(name, @index)
+ @data << v.dv(name, @index, @dtype)
end
when Hash
create_vectors_index_with vectors, source
-
if all_daru_vectors_in_source? source
-
if !index.nil?
@index = index.to_index
elsif all_vectors_have_equal_indexes? source
@index = source.values[0].index.dup
else
all_indexes = []
-
source.each_value do |vector|
all_indexes << vector.index.to_a
end
# sort only if missing indexes detected
all_indexes.flatten!.uniq!.sort!
@index = Daru::Index.new all_indexes
end
-
@vectors.each do |vector|
- @data << Daru::Vector.new([], name: vector, index: @index)
+ @data << Daru::Vector.new([], name: vector, index: @index, dtype: @dtype)
@index.each do |idx|
begin
@data[@vectors[vector]][idx] = source[vector][idx]
rescue IndexError
@@ -95,80 +144,103 @@
end
end
end
else
index = source.values[0].size if index.nil?
-
if index.is_a?(Daru::Index)
@index = index.to_index
else
@index = Daru::Index.new index
end
@vectors.each do |name|
- @data << source[name].dup.dv(name, @index)
+ @data << source[name].dup.dv(name, @index, @dtype)
end
end
-
end
end
set_size
validate
end
+ # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
+ # Use of this method is not recommended for accessing rows or vectors.
+ # Use df.row[:a] for accessing row with index ':a' or df.vector[:vec] for
+ # accessing vector with index ':vec'
def [](*names, axis)
if axis == :vector
access_vector *names
elsif axis == :row
access_row *names
else
raise IndexError, "Expected axis to be row or vector not #{axis}"
end
end
+ # Insert a new row/vector of the specified name or modify a previous row.
+ # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
+ # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
+ #
+ # In case a Daru::Vector is specified after the equality the sign, the indexes
+ # of the vector will be matched against the row/vector indexes of the DataFrame
+ # before an insertion is performed. Unmatched indexes will be set to nil.
def []=(name, axis ,vector)
if axis == :vector
insert_or_modify_vector name, vector
elsif axis == :row
insert_or_modify_row name, vector
else
raise IndexError, "Expected axis to be row or vector, not #{axis}."
end
end
+ # Access a vector or set/create a vector. Refer #[] and #[]= docs for details.
+ #
+ # == Usage
+ # df.vector[:a] # access vector named ':a'
+ # df.vector[:b] = [1,2,3] # set vector ':b' to [1,2,3]
def vector
Daru::Accessors::DataFrameByVector.new(self)
end
+ # Access a row or set/create a row. Refer #[] and #[]= docs for details.
+ #
+ # == Usage
+ # df.row[:a] # access row named ':a'
+ # df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
def row
Daru::Accessors::DataFrameByRow.new(self)
end
+ # Duplicate the DataFrame entirely.
def dup
src = {}
@vectors.each do |vector|
- src[vector] = @data[@vectors[vector]]
+ src[vector] = @data[@vectors[vector]].dup
end
- Daru::DataFrame.new src, vectors: @vectors.dup, index: @index.dup, name: @name
+ Daru::DataFrame.new src, order: @vectors.dup, index: @index.dup, name: @name, dtype: @dtype
end
+ # Iterate over each vector
def each_vector(&block)
@data.each(&block)
self
end
+ # Iterate over each vector alongwith the name of the vector
def each_vector_with_index(&block)
@vectors.each do |vector|
yield @data[@vectors[vector]], vector
end
self
end
+ # Iterate over each row
def each_row(&block)
@index.each do |index|
yield access_row(index)
end
@@ -181,50 +253,50 @@
end
self
end
+ # Map each vector. Returns a DataFrame whose vectors are modified according
+ # to the value returned by the block.
def map_vectors(&block)
df = self.dup
-
df.each_vector_with_index do |vector, name|
df[name, :vector] = yield(vector)
end
df
end
def map_vectors_with_index(&block)
df = self.dup
-
df.each_vector_with_index do |vector, name|
df[name, :vector] = yield(vector, name)
end
df
end
+ # Map each row
def map_rows(&block)
df = self.dup
-
df.each_row_with_index do |row, index|
df[index, :row] = yield(row)
end
df
end
def map_rows_with_index(&block)
df = self.dup
-
df.each_row_with_index do |row, index|
df[index, :row] = yield(row, index)
end
df
end
+ # Delete a vector
def delete_vector vector
if @vectors.include? vector
@data.delete_at @vectors[vector]
@vectors = Daru::Index.new @vectors.to_a - [vector]
else
@@ -235,11 +307,10 @@
def delete_row index
idx = named_index_for index
if @index.include? idx
@index = (@index.to_a - [idx]).to_index
-
self.each_vector do |vector|
vector.delete_at idx
end
else
raise IndexError, "Index #{index} does not exist."
@@ -254,11 +325,10 @@
@index.each do |index|
keep_row = yield access_row(index)
deletion << index unless keep_row
end
-
deletion.each { |idx|
delete_row idx
}
end
@@ -268,52 +338,69 @@
delete_vector vector unless keep_vector
end
end
+ # Iterates over each row and retains it in a new DataFrame if the block returns
+ # true for that row.
def filter_rows &block
- df = Daru::DataFrame.new({}, vectors: @vectors.to_a)
+ df = Daru::DataFrame.new({}, order: @vectors.to_a)
marked = []
@index.each do |index|
keep_row = yield access_row(index)
-
marked << index if keep_row
end
marked.each do |idx|
df.row[idx] = self[idx, :row]
end
df
end
+ # Iterates over each vector and retains it in a new DataFrame if the block returns
+ # true for that vector.
def filter_vectors &block
df = self.dup
-
df.keep_vector_if &block
df
end
+ # Check if a vector is present
def has_vector? name
!!@vectors[name]
end
+ def head quantity=10
+ self[0..quantity, :row]
+ end
+
+ def tail quantity=10
+ self[(@size - quantity)..@size, :row]
+ end
+
+ # def sort_by_row name
+
+ # end
+
+ # def sort_by_vector name
+
+ # end
+
# Converts the DataFrame into an array of hashes where key is vector name
# and value is the corresponding element.
# The 0th index of the array contains the array of hashes while the 1th
# index contains the indexes of each row of the dataframe. Each element in
# the index array corresponds to its row in the array of hashes, which has
# the same index.
def to_a
arry = [[],[]]
-
self.each_row do |row|
arry[0] << row.to_hash
end
-
arry[1] = @index.to_a
arry
end
@@ -323,43 +410,42 @@
else
self.to_a.to_json
end
end
- def to_html threshold=15
- html = '<table><tr><th></th>'
-
+ # Convert to html for IRuby.
+ def to_html threshold=30
+ html = '<table><tr><th></th>'
@vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
-
html += '</tr>'
@index.each_with_index do |index, num|
html += '<tr>'
html += '<td>' + index.to_s + '</td>'
self.row[index].each do |element|
html += '<td>' + element.to_s + '</td>'
end
- html += '</tr>'
+ html += '</tr>'
if num > threshold
html += '<tr>'
(@vectors + 1).size.times { html += '<td>...</td>' }
html += '</tr>'
break
end
end
-
html += '</table>'
html
end
def to_s
to_html
end
+ # Pretty print in a nice table format for the command line (irb)
def inspect spacing=10, threshold=15
longest = [@name.to_s.size,
@vectors.map(&:to_s).map(&:size).max,
@index .map(&:to_s).map(&:size).max,
@data .map{ |v| v.map(&:to_s).map(&:size).max }.max].max
@@ -368,36 +454,40 @@
content = ""
longest = spacing if longest > spacing
formatter = "\n"
(@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
-
content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
name.to_s + " @size = " + @size.to_s + ">"
-
content += sprintf formatter, "" , *@vectors.map(&:to_s)
+ row_num = 1
- row_num = 1
-
self.each_row_with_index do |row, index|
content += sprintf formatter, index.to_s, *row.to_hash.values.map { |e| (e || 'nil').to_s }
-
row_num += 1
if row_num > threshold
dots = []
(@vectors.size + 1).times { dots << "..." }
- content += sprint formatter, *dots
+ content += sprintf formatter, *dots
break
end
end
-
content += "\n"
content
end
+ def dtype= dtype
+ @dtype = dtype
+
+ @vectors.each do |vec|
+ pos = @vectors[vec]
+ @data[pos] = @data[pos].coerce(@dtype)
+ end
+ end
+
def == other
@index == other.index and @size == other.size and @vectors.all? { |vector|
self[vector, :vector] == other[vector, :vector] }
end
@@ -405,11 +495,11 @@
if md = name.match(/(.+)\=/)
insert_or_modify_vector name[/(.+)\=/].delete("="), args[0]
elsif self.has_vector? name
self[name, :vector]
else
- super(name, *args)
+ super(name, *args, &block)
end
end
private
@@ -421,54 +511,63 @@
return @data[names[0]]
else
raise IndexError, "Specified index #{names[0]} does not exist."
end
end
-
new_vcs = {}
names.each do |name|
name = name.to_sym unless name.is_a?(Integer)
new_vcs[name] = @data[@vectors[name]]
end
-
- Daru::DataFrame.new new_vcs, vectors: new_vcs.keys, index: @index, name: @name
+ Daru::DataFrame.new new_vcs, order: new_vcs.keys, index: @index, name: @name
end
def access_row *names
- unless names[1]
- row = []
+ if names[1].nil?
+ access_token = names[0]
+ if access_token.is_a?(Range)
+ index_arry = @index.to_a
- name = nil
+ range =
+ if access_token.first.is_a?(Numeric)
+ access_token
+ else
+ first_index = index_arry.index access_token.first
+ last_index = index_arry.index access_token.last
- if @index.include? names[0]
- name = names[0]
- elsif @index.key names[0]
- name = @index.key names[0]
+ first_index..last_index
+ end
+
+ names = index_arry[range]
else
- raise IndexError, "Specified row #{names[0]} does not exist."
- end
+ row = []
+ name = named_index_for names[0]
+ @vectors.each do |vector|
+ row << @data[@vectors[vector]][name]
+ end
- @vectors.each do |vector|
- row << @data[@vectors[vector]][name]
+ return Daru::Vector.new(row, index: @vectors, name: name, dtype: @dtype)
end
-
- Daru::Vector.new row, index: @vectors, name: name
- else
- # TODO: Access multiple rows
end
+ # Access multiple rows
+ rows = []
+ names.each do |name|
+ rows << self.row[name]
+ end
+
+ Daru::DataFrame.rows rows, name: @name, dtype: @dtype
end
def insert_or_modify_vector name, vector
@vectors = @vectors.re_index(@vectors + name)
+ v = nil
- v = nil
-
if vector.is_a?(Daru::Vector)
- v = Daru::Vector.new [], name: name, index: @index
-
+ v = Daru::Vector.new [], name: name, index: @index, dtype: @dtype
+ nil_data = false
@index.each do |idx|
begin
v[idx] = vector[idx]
rescue IndexError
v[idx] = nil
@@ -476,30 +575,30 @@
end
else
raise Exception, "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
@size != vector.size
- v = vector.dv(name, @index)
+ v = vector.dv(name, @index, @dtype)
end
@data[@vectors[name]] = v
end
def insert_or_modify_row name, vector
if @index.include? name
- v = vector.dv(name, @vectors)
+ v = vector.dv(name, @vectors, @dtype)
@vectors.each do |vector|
begin
@data[@vectors[vector]][name] = v[vector]
rescue IndexError
@data[@vectors[vector]][name] = nil
end
end
else
@index = @index.re_index(@index + name)
- v = vector.dv(name, @vectors)
+ v = vector.dv(name, @vectors, @dtype)
@vectors.each do |vector|
begin
@data[@vectors[vector]].concat v[vector], name
rescue IndexError
@@ -511,10 +610,10 @@
set_size
end
def create_empty_vectors
@vectors.each do |name|
- @data << Daru::Vector.new([],name: name, index: @index)
+ @data << Daru::Vector.new([],name: name, index: @index, dtype: @dtype)
end
end
def validate_labels
raise IndexError, "Expected equal number of vectors for number of Hash pairs" if
\ No newline at end of file