lib/daru/dataframe.rb in daru-0.1.1 vs lib/daru/dataframe.rb in daru-0.1.2
- old
+ new
@@ -12,78 +12,103 @@
include Daru::Maths::Arithmetic::DataFrame
include Daru::Maths::Statistics::DataFrame
include Daru::Plotting::DataFrame if Daru.has_nyaplot?
class << self
- # Load data from a CSV file. Specify an optional block to grab the CSV
- # object and pre-condition it (for example use the `convert` or
+ # Load data from a CSV file. Specify an optional block to grab the CSV
+ # object and pre-condition it (for example use the `convert` or
# `header_convert` methods).
- #
+ #
# == Arguments
- #
+ #
# * path - Path of the file to load specified as a String.
- #
+ #
# == Options
- #
+ #
# Accepts the same options as the Daru::DataFrame constructor and CSV.open()
# and uses those to eventually construct the resulting DataFrame.
#
# == Verbose Description
#
- # You can specify all the options to the `.from_csv` function that you
+ # You can specify all the options to the `.from_csv` function that you
# do to the Ruby `CSV.read()` function, since this is what is used internally.
#
- # For example, if the columns in your CSV file are separated by something
- # other that commas, you can use the `:col_sep` option. If you want to
- # convert numeric values to numbers and not keep them as strings, you can
+ # For example, if the columns in your CSV file are separated by something
+ # other that commas, you can use the `:col_sep` option. If you want to
+ # convert numeric values to numbers and not keep them as strings, you can
# use the `:converters` option and set it to `:numeric`.
#
- # The `.from_csv` function uses the following defaults for reading CSV files
+ # The `.from_csv` function uses the following defaults for reading CSV files
# (that are passed into the `CSV.read()` function):
#
# {
# :col_sep => ',',
# :converters => :numeric
# }
def from_csv path, opts={}, &block
- Daru::IO.from_csv path, opts, &block
+ Daru::IO.from_csv path, opts, &block
end
# Read data from an Excel file into a DataFrame.
- #
+ #
# == Arguments
- #
+ #
# * path - Path of the file to be read.
- #
+ #
# == Options
- #
+ #
# * :worksheet_id - ID of the worksheet that is to be read.
- def from_excel path, opts={}, &block
+ def from_excel path, opts={}, &block
Daru::IO.from_excel path, opts, &block
end
# Execute a database query and return the result as a DataFrame.
#
+ # @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query
+ # @param query [String] The query to be executed
+ #
+ # @return A dataframe containing the data resulting from the query
+ #
# USE:
#
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
# Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
def from_sql(dbh, query)
  # Pure delegation: hands both arguments to Daru::IO.from_sql and returns
  # whatever that call returns.
  Daru::IO.from_sql(dbh, query)
end
+ # Read a dataframe from AR::Relation
+ #
+ # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
+ # @params fields [Array] Field names to be loaded (optional)
+ #
+ # @return A dataframe containing the data loaded from the relation
+ #
+ # USE:
+ #
+ # # When Post model is defined as:
+ # class Post < ActiveRecord::Base
+ # scope :active, -> { where.not(published_at: nil) }
+ # end
+ #
+ # # You can load active posts into a dataframe by:
+ # Daru::DataFrame.from_activerecord(Post.active, :title, :published_at)
+ def from_activerecord relation, *fields
+ Daru::IO.from_activerecord relation, *fields
+ end
+
# Read the database from a plaintext file. For this method to work,
# the data should be present in a plain text file in columns. See
# spec/fixtures/bank2.dat for an example.
- #
+ #
# == Arguments
- #
+ #
# * path - Path of the file to be read.
# * fields - Vector names of the resulting database.
- #
+ #
# == Usage
- #
+ #
# df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
def from_plaintext(path, fields)
  # Pure delegation: hands both arguments to Daru::IO.from_plaintext and
  # returns whatever that call returns.
  Daru::IO.from_plaintext(path, fields)
end
@@ -135,19 +160,19 @@
# a 0 1
# b 1 0
#
# Useful to process outputs from databases
def crosstab_by_assignation rows, columns, values
- raise "Three vectors should be equal size" if
+ raise "Three vectors should be equal size" if
rows.size != columns.size or rows.size!=values.size
cols_values = columns.factors
cols_n = cols_values.size
- h_rows = rows.factors.inject({}) do |a,v|
- a[v] = cols_values.inject({}) do |a1,v1|
- a1[v1]=nil
+ h_rows = rows.factors.inject({}) do |a,v|
+ a[v] = cols_values.inject({}) do |a1,v1|
+ a1[v1]=nil
a1
end
a
end
@@ -184,42 +209,42 @@
# DataFrame basically consists of an Array of Vector objects.
# These objects are indexed by row and column by vectors and index Index objects.
#
# == Arguments
- #
+ #
# * source - Source from which the DataFrame is to be initialized. Can be a Hash
# of names and vectors (array or Daru::Vector), an array of arrays or
# array of Daru::Vectors.
- #
+ #
# == Options
- #
- # +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
+ #
+ # +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
# which Vectors should appear in the DataFrame.
- #
+ #
# +:index+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order
# in which rows of the DataFrame will be named.
- #
+ #
# +:name+ - A name for the DataFrame.
#
# +:clone+ - Specify as *true* or *false*. When set to false, and Vector
# objects are passed for the source, the Vector objects will not be duplicated
- # when creating the DataFrame. Will have no effect if Array is passed in
- # the source, or if the passed Daru::Vectors have different indexes.
+ # when creating the DataFrame. Will have no effect if Array is passed in
+ # the source, or if the passed Daru::Vectors have different indexes.
# Default to *true*.
- #
+ #
# == Usage
- # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
+ # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
# index: [:a, :b, :c, :d], name: :spider_man)
- #
- # # =>
+ #
+ # # =>
# # <Daru::DataFrame:80766980 @name = spider_man @size = 4>
- # # b a
- # # a 6 1
- # # b 7 2
- # # c 8 3
- # # d 9 4
+ # # b a
+ # # a 6 1
+ # # b 7 2
+ # # c 8 3
+ # # d 9 4
def initialize source, opts={}
vectors = opts[:order]
index = opts[:index]
clone = opts[:clone] == false ? false : true
@data = []
@@ -290,11 +315,11 @@
if clone
@vectors.each do |vector|
# avoids matching indexes of vectors if all the supplied vectors
# have the same index.
- if vectors_have_same_index
+ if vectors_have_same_index
v = source[vector].dup
else
v = Daru::Vector.new([], name: vector, index: @index)
@index.each do |idx|
@@ -329,12 +354,12 @@
$stderr.puts "#vector has been deprecated in favour of #[]. Please use that."
self[*names]
end
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
- # Defaults to *:vector*. Use of this method is not recommended for accessing
- # rows or vectors. Use df.row[:a] for accessing row with index ':a' or
+ # Defaults to *:vector*. Use of this method is not recommended for accessing
+ # rows or vectors. Use df.row[:a] for accessing row with index ':a' or
# df.vector[:vec] for accessing vector with index *:vec*.
def [](*names)
if names[-1] == :vector or names[-1] == :row
axis = names[-1]
names = names[0..-2]
@@ -352,11 +377,11 @@
end
# Insert a new row/vector of the specified name or modify a previous row.
# Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
# a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
- #
+ #
# In case a Daru::Vector is specified after the equals sign, the indexes
# of the vector will be matched against the row/vector indexes of the DataFrame
# before an insertion is performed. Unmatched indexes will be set to nil.
def []=(*args)
axis = args.include?(:row) ? :row : :vector
@@ -366,11 +391,11 @@
name = args[0..-2]
vector = args[-1]
if axis == :vector
insert_or_modify_vector name, vector
- elsif axis == :row
+ elsif axis == :row
insert_or_modify_row name, vector
else
raise IndexError, "Expected axis to be row or vector, not #{axis}."
end
end
@@ -387,30 +412,30 @@
def add_vector(n, vector)
  # Named-method spelling of `df[n] = vector`; the assignment goes through #[]=.
  self[n] = vector
end
# Access a row or set/create a row. Refer #[] and #[]= docs for details.
- #
+ #
# == Usage
# df.row[:a] # access row named ':a'
# df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
def row
  # Wraps this frame in a row accessor object; a new accessor is built on
  # every call.
  accessor_class = Daru::Accessors::DataFrameByRow
  accessor_class.new(self)
end
# Duplicate the DataFrame entirely.
- #
+ #
# == Arguments
- #
- # * +vectors_to_dup+ - An Array specifying the names of Vectors to
+ #
+ # * +vectors_to_dup+ - An Array specifying the names of Vectors to
# be duplicated. Will duplicate the entire DataFrame if not specified.
def dup vectors_to_dup=nil
vectors_to_dup = @vectors.to_a unless vectors_to_dup
src = []
vectors_to_dup.each do |vec|
- src << @data[@vectors[vec]].to_a
+ src << @data[@vectors[vec]].to_a.dup
end
new_order = Daru::Index.new(vectors_to_dup)
Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
end
@@ -420,13 +445,13 @@
Daru::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
end
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
# preserved.
- #
+ #
# == Arguments
- #
+ #
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
# a view of the whole data frame otherwise.
def clone *vectors_to_clone
vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
return super if vectors_to_clone.empty?
@@ -436,21 +461,21 @@
hsh
end
Daru::DataFrame.new(h, clone: false)
end
- # Returns a 'shallow' copy of DataFrame if missing data is not present,
+ # Returns a 'shallow' copy of DataFrame if missing data is not present,
# or a full copy of only valid data if missing data is present.
def clone_only_valid
  # With missing data present, build the filtered duplicate; otherwise a plain
  # clone is returned.
  return dup_only_valid if has_missing_data?

  clone
end
- # Creates a new duplicate dataframe containing only rows
+ # Creates a new duplicate dataframe containing only rows
# without a single missing value.
def dup_only_valid vecs=nil
rows_with_nil = @data.inject([]) do |memo, vector|
memo.concat vector.missing_positions
memo
@@ -483,11 +508,11 @@
def each_vector_with_index(&block)
return to_enum(:each_vector_with_index) unless block_given?
@vectors.each do |vector|
yield @data[@vectors[vector]], vector
- end
+ end
self
end
alias_method :each_column_with_index, :each_vector_with_index
@@ -516,16 +541,16 @@
# Iterate over each row or vector of the DataFrame. Specify axis
# by passing :vector or :row as the argument. Default to :vector.
#
# == Description
#
- # `#each` works exactly like Array#each. The default mode for `each`
- # is to iterate over the columns of the DataFrame. To iterate over
+ # `#each` works exactly like Array#each. The default mode for `each`
+ # is to iterate over the columns of the DataFrame. To iterate over
# rows you must pass the axis, i.e `:row` as an argument.
- #
+ #
# == Arguments
- #
+ #
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
# or :row. Default to :vector.
def each axis=:vector, &block
if axis == :vector or axis == :column
each_vector(&block)
@@ -539,18 +564,18 @@
# Iterate over a row or vector and return results in a Daru::Vector.
# Specify axis with :vector or :row. Default to :vector.
#
# == Description
#
- # The #collect iterator works similar to #map, the only difference
- # being that it returns a Daru::Vector comprising of the results of
- # each block run. The resultant Vector has the same index as that
- # of the axis over which collect has iterated. It also accepts the
+ # The #collect iterator works similar to #map, the only difference
+ # being that it returns a Daru::Vector comprising of the results of
+ # each block run. The resultant Vector has the same index as that
+ # of the axis over which collect has iterated. It also accepts the
# optional axis argument.
#
# == Arguments
- #
+ #
# * +axis+ - The axis to iterate over. Can be :vector (or :column)
# or :row. Default to :vector.
def collect axis=:vector, &block
if axis == :vector or axis == :column
collect_vectors(&block)
@@ -563,20 +588,20 @@
# Map over each vector or row of the data frame according to
# the argument specified. Will return an Array of the resulting
# elements. To map over each row/vector and get a DataFrame,
# see #recode.
- #
+ #
# == Description
- #
- # The #map iterator works like Array#map. The value returned by
- # each run of the block is added to an Array and the Array is
- # returned. This method also accepts an axis argument, like #each.
+ #
+ # The #map iterator works like Array#map. The value returned by
+ # each run of the block is added to an Array and the Array is
+ # returned. This method also accepts an axis argument, like #each.
# The default is :vector.
- #
+ #
# == Arguments
- #
+ #
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
# Default to :vector.
def map axis=:vector, &block
if axis == :vector or axis == :column
map_vectors(&block)
@@ -588,13 +613,13 @@
end
# Destructive map. Modifies the DataFrame. Each run of the block
# must return a Daru::Vector. You can specify the axis to map over
# as the argument. Default to :vector.
- #
+ #
# == Arguments
- #
+ #
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
# Default to :vector.
def map! axis=:vector, &block
if axis == :vector or axis == :column
map_vectors!(&block)
@@ -607,19 +632,19 @@
# block must return a Daru::Vector object. You can specify the axis
# to map over. Default to :vector.
#
# == Description
#
- # Recode works similarly to #map, but an important difference between
- # the two is that recode returns a modified Daru::DataFrame instead
- # of an Array. For this reason, #recode expects that every run of the
+ # Recode works similarly to #map, but an important difference between
+ # the two is that recode returns a modified Daru::DataFrame instead
+ # of an Array. For this reason, #recode expects that every run of the
# block to return a Daru::Vector.
#
# Just like map and each, recode also accepts an optional _axis_ argument.
- #
+ #
# == Arguments
- #
+ #
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
# Default to :vector.
def recode axis=:vector, &block
if axis == :vector or axis == :column
recode_vectors(&block)
@@ -627,26 +652,26 @@
recode_rows(&block)
end
end
# Retain vectors or rows if the block returns a truthy value.
- #
+ #
# == Description
- #
- # For filtering out certain rows/vectors based on their values,
- # use the #filter method. By default it iterates over vectors and
- # keeps those vectors for which the block returns true. It accepts
- # an optional axis argument which lets you specify whether you want
+ #
+ # For filtering out certain rows/vectors based on their values,
+ # use the #filter method. By default it iterates over vectors and
+ # keeps those vectors for which the block returns true. It accepts
+ # an optional axis argument which lets you specify whether you want
# to iterate over vectors or rows.
- #
+ #
# == Arguments
- #
+ #
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
# Default to :vector.
- #
+ #
# == Usage
- #
+ #
# # Filter vectors
#
# df.filter do |vector|
# vector.type == :numeric and vector.median < 50
# end
@@ -663,16 +688,16 @@
filter_rows(&block)
end
end
def recode_vectors &block
- block_given? or return to_enum(:recode_vectors)
+ block_given? or return to_enum(:recode_vectors)
df = self.dup
df.each_vector_with_index do |v, i|
ret = yield v
- ret.is_a?(Daru::Vector) or
+ ret.is_a?(Daru::Vector) or
raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
df[*i] = ret
end
df
@@ -761,11 +786,11 @@
end
self
end
- # Retrieves a Daru::Vector, based on the result of calculation
+ # Retrieves a Daru::Vector, based on the result of calculation
# performed on each row.
def collect_rows &block
return to_enum(:collect_rows) unless block_given?
data = []
@@ -876,19 +901,19 @@
@index.each do |index|
keep_row = yield access_row(index)
deletion << index unless keep_row
end
- deletion.each { |idx|
- delete_row idx
+ deletion.each { |idx|
+ delete_row idx
}
end
def keep_vector_if &block
@vectors.each do |vector|
keep_vector = yield @data[@vectors[vector]], vector
-
+
delete_vector vector unless keep_vector
end
end
# Creates a new vector with the data of a given field for which the block returns true
@@ -923,20 +948,20 @@
# Iterates over each vector and retains it in a new DataFrame if the block returns
# true for that vector.
def filter_vectors &block
return to_enum(:filter_vectors) unless block_given?
-
+
df = self.dup
df.keep_vector_if &block
df
end
# Test each row with one or more tests. Each test is a Proc with the form
# *Proc.new {|row| row[:age] > 0}*
- #
+ #
# The function returns an array with all errors.
def verify(*tests)
if(tests[0].is_a? Symbol)
id = tests[0]
tests.shift
@@ -961,13 +986,13 @@
vr
end
# DSL for yielding each row and returning a Daru::Vector based on the
# value each run of the block returns.
- #
+ #
# == Usage
- #
+ #
# a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
# a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
# a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
# ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
# total = ds.vector_by_calculation { a + b + c }
@@ -989,30 +1014,30 @@
Daru::Vector.new a, index: @index
end
# Returns a vector, based on a string with a calculation based
# on vector.
- #
+ #
# The calculation will be eval'ed, so you can put any variable
# or expression valid on ruby.
- #
+ #
# For example:
# a = Daru::Vector.new [1,2]
# b = Daru::Vector.new [3,4]
# ds = Daru::DataFrame.new({:a => a,:b => b})
# ds.compute("a+b")
# => Vector [4,6]
def compute text, &block
return instance_eval(&block) if block_given?
- instance_eval(text)
+ instance_eval(text)
end
# Return a vector with the number of missing values in each row.
- #
+ #
# == Arguments
- #
- # * +missing_values+ - An Array of the values that should be
+ #
+ # * +missing_values+ - An Array of the values that should be
# treated as 'missing'. The default missing value is *nil*.
def missing_values_rows missing_values=[nil]
number_of_missing = []
each_row do |row|
row.missing_values = missing_values
@@ -1029,13 +1054,13 @@
!!@data.any? { |v| v.has_missing_data? }
end
alias :flawed? :has_missing_data?
- # Return a nested hash using vector names as keys and an array constructed of
+ # Return a nested hash using vector names as keys and an array constructed of
# hashes with other values. If block provided, is used to provide the
- # values, with parameters +row+ of dataset, +current+ last hash on
+ # values, with parameters +row+ of dataset, +current+ last hash on
# hierarchy and +name+ of the key to include
def nest *tree_keys, &block
tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
out = {}
@@ -1099,11 +1124,11 @@
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
# :row. A Daru::Vector object is yielded in the block.
# @example Using any?
# df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
# df.any?(:row) do |row|
- # row[:a] < 3 and row[:b] == 'b'
+ # row[:a] < 3 and row[:b] == 'b'
# end #=> true
def any? axis=:vector, &block
if axis == :vector or axis == :column
@data.any?(&block)
elsif axis == :row
@@ -1121,11 +1146,11 @@
# @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
# :row. A Daru::Vector object is yielded in the block.
# @example Using all?
# df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
# df.all?(:row) do |row|
- # row[:a] < 10
+ # row[:a] < 10
# end #=> true
def all? axis=:vector, &block
if axis == :vector or axis == :column
@data.all?(&block)
elsif axis == :row
@@ -1143,18 +1168,22 @@
# @param [Fixnum] quantity (10) The number of elements to display from the top.
def head(quantity = 10)
  # Row positions 0 through quantity - 1, requested along the :row axis.
  last_position = quantity - 1
  self[0..last_position, :row]
end
+ alias :first :head
+
# The last ten elements of the DataFrame
- #
+ #
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
def tail(quantity = 10)
  # Clamp the starting position at 0: when more rows are requested than the
  # frame holds (quantity > @size), `@size - quantity` would be negative and
  # the resulting range would no longer describe "the last rows". With the
  # clamp, an oversized request simply covers every row.
  first_position = [@size - quantity, 0].max
  self[first_position..(@size - 1), :row]
end
- # Returns a vector with sum of all vectors specified in the argument.
+ alias :last :tail
+
+ # Returns a vector with sum of all vectors specified in the argument.
# If the vecs parameter is omitted (nil), all numeric vectors are summed.
def vector_sum vecs=nil
vecs ||= numeric_vectors
sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
@@ -1164,13 +1193,13 @@
sum
end
# Calculate mean of the rows of the dataframe.
- #
+ #
# == Arguments
- #
+ #
# * +max_missing+ - The maximum number of elements in the row that can be
# zero for the mean calculation to happen. Default to 0.
def vector_mean max_missing=0
mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"
@@ -1179,20 +1208,20 @@
end
mean_vec
end
- # Group elements by vector to perform operations on them. Returns a
+ # Group elements by vector to perform operations on them. Returns a
# Daru::Core::GroupBy object.See the Daru::Core::GroupBy docs for a detailed
# list of possible operations.
- #
+ #
# == Arguments
- #
+ #
# * vectors - An Array containing names of vectors to group by.
- #
+ #
# == Usage
- #
+ #
# df = Daru::DataFrame.new({
# a: %w{foo bar foo bar foo bar foo foo},
# b: %w{one one two three two two one three},
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
@@ -1207,11 +1236,11 @@
# # ["foo", "two", 3]=>[2, 4]}
def group_by *vectors
vectors.flatten!
vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
has_vector?(v) }
-
+
Daru::Core::GroupBy.new(self, vectors)
end
def reindex_vectors new_vectors
raise ArgumentError, "Must pass the new index of type Index or its "\
@@ -1232,48 +1261,48 @@
# Concatenate another DataFrame along corresponding columns.
# Very premature implementation. Use with caution.
def concat other_df
vectors = []
@vectors.each do |v|
- vectors << self[v].to_a.concat(other_df[v].to_a)
+ vectors << self[v].to_a.dup.concat(other_df[v].to_a)
end
Daru::DataFrame.new(vectors, order: @vectors)
end
# Set a particular column as the new index of the DataFrame.
def set_index new_index, opts={}
- raise ArgumentError, "All elements in new index must be unique." if
+ raise ArgumentError, "All elements in new index must be unique." if
@size != self[new_index].uniq.size
-
+
self.index = Daru::Index.new(self[new_index].to_a)
self.delete_vector(new_index) unless opts[:keep]
self
end
# Change the index of the DataFrame and preserve the labels of the previous
# indexing. New index can be Daru::Index or any of its subclasses.
- #
+ #
# @param [Daru::Index] new_index The new Index for reindexing the DataFrame.
# @example Reindexing DataFrame
- # df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
+ # df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
# index: ['a','b','c','d'])
- # #=>
+ # #=>
# ##<Daru::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
- # # a b
- # # a 1 11
- # # b 2 22
- # # c 3 33
- # # d 4 44
+ # # a b
+ # # a 1 11
+ # # b 2 22
+ # # c 3 33
+ # # d 4 44
# df.reindex Daru::Index.new(['b', 0, 'a', 'g'])
- # #=>
+ # #=>
# ##<Daru::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
- # # a b
- # # b 2 22
- # # 0 nil nil
- # # a 1 11
+ # # a b
+ # # b 2 22
+ # # 0 nil nil
+ # # a 1 11
# # g nil nil
def reindex new_index
raise ArgumentError, "Must pass the new index of type Index or its "\
"subclasses, not #{new_index.class}" unless new_index.kind_of?(Daru::Index)
@@ -1294,33 +1323,33 @@
# @param [Daru::Index] idx New index object on which the rows of the dataframe
# are to be indexed.
# @example Reassigning index of a DataFrame
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
# df.index.to_a #=> [0,1,2,3]
- #
+ #
# df.index = Daru::Index.new(['a','b','c','d'])
# df.index.to_a #=> ['a','b','c','d']
- # df.row['a'].to_a #=> [1,11]
+ # df.row['a'].to_a #=> [1,11]
def index=(idx)
  # Push the new index onto every vector held in @data, then record it on the
  # frame itself. Returns the frame (visible when invoked via #send).
  @data.each do |column|
    column.index = idx
  end
  @index = idx
  self
end
# Reassign vectors with a new index of type Daru::Index or any of its subclasses.
- #
+ #
# @param [Daru::Index] idx The new index object on which the vectors are to
# be indexed. Must be of the same size as ncols.
# @example Reassigning vectors of a DataFrame
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
# df.vectors.to_a #=> [:a, :b, :c]
- #
+ #
# df.vectors = Daru::Index.new([:foo, :bar, :baz])
# df.vectors.to_a #=> [:foo, :bar, :baz]
def vectors= idx
- raise ArgumentError, "Can only reindex with Index and its subclasses" unless
+ raise ArgumentError, "Can only reindex with Index and its subclasses" unless
index.kind_of?(Daru::Index)
raise ArgumentError, "Specified index length #{idx.size} not equal to"\
"dataframe size #{ncols}" if idx.size != ncols
@vectors = idx
@@ -1375,35 +1404,35 @@
g.parse_element(self[v])
end
end
end
- # Sorts a dataframe (ascending/descending)according to the given sequence of
+ # Sorts a dataframe (ascending/descending)according to the given sequence of
# vectors, using the attributes provided in the blocks.
- #
+ #
# @param order [Array] The order of vector names in which the DataFrame
# should be sorted.
# @param [Hash] opts The options to sort with.
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
# or descending order. Specify Array corresponding to *order* for multiple
# sort orders.
# @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects to
- # to be used for sorting, for each vector name in *order* as a hash of
+ # to be used for sorting, for each vector name in *order* as a hash of
# vector name and lambda pairs. In case a lambda for a vector is not
# specified, the default will be used.
- #
+ #
# == Usage
- #
+ #
# df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})
- #
+ #
# #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
- # # a b
- # # 0 -3 4
- # # 1 2 3
- # # 2 -1 2
- # # 3 4 1
- # df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
+ # # a b
+ # # 0 -3 4
+ # # 1 2 3
+ # # 2 -1 2
+ # # 3 4 1
+ # df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
def sort! vector_order, opts={}
raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
opts = {
ascending: true,
type: :quick_sort,
@@ -1424,58 +1453,58 @@
self.dup.sort! vector_order, opts
end
# Pivots a data frame on specified vectors and applies an aggregate function
# to quickly generate a summary.
- #
+ #
# == Options
- #
+ #
# +:index+ - Keys to group by on the pivot table row index. Pass vector names
# contained in an Array.
- #
+ #
# +:vectors+ - Keys to group by on the pivot table column index. Pass vector
# names contained in an Array.
- #
+ #
# +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
- # use any of the statistics functions applicable on Vectors that can be found in
+ # use any of the statistics functions applicable on Vectors that can be found in
# the Daru::Statistics::Vector module.
- #
- # +:values+ - Columns to aggregate. Will consider all numeric columns not
+ #
+ # +:values+ - Columns to aggregate. Will consider all numeric columns not
# specified in *:index* or *:vectors*. Optional.
- #
+ #
# == Usage
- #
+ #
# df = Daru::DataFrame.new({
- # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
+ # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
# b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
# c: ['small','large','large','small','small','large','small','large','small'],
# d: [1,2,2,3,3,4,5,6,7],
# e: [2,4,4,6,6,8,10,12,14]
# })
# df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
- #
- # #=>
+ #
+ # #=>
# # #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
- # # [:e, :one] [:e, :two]
- # # [:bar] 18 26
- # # [:foo] 10 12
+ # # [:e, :one] [:e, :two]
+ # # [:bar] 18 26
+ # # [:foo] 10 12
def pivot_table opts={}
- raise ArgumentError,
+ raise ArgumentError,
"Specify grouping index" if !opts[:index] or opts[:index].empty?
index = opts[:index]
vectors = opts[:vectors] || []
aggregate_function = opts[:agg] || :mean
- values =
+ values =
if opts[:values].is_a?(Symbol)
[opts[:values]]
elsif opts[:values].is_a?(Array)
opts[:values]
else # nil
(@vectors.to_a - (index | vectors)) & numeric_vector_names
end
-
+
raise IndexError, "No numeric vectors to aggregate" if values.empty?
grouped = group_by(index)
unless vectors.empty?
@@ -1522,11 +1551,11 @@
else
grouped.send(aggregate_function)
end
end
- # Merge vectors from two DataFrames. In case of name collision,
+ # Merge vectors from two DataFrames. In case of name collision,
# the vectors names are changed to x_1, x_2 ....
#
# @return {Daru::DataFrame}
def merge other_df
raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
@@ -1543,13 +1572,13 @@
df_new.update
df_new
end
- # Join 2 DataFrames with SQL style joins. Currently supports inner, left
+ # Join 2 DataFrames with SQL style joins. Currently supports inner, left
# outer, right outer and full outer joins.
- #
+ #
# @param [Daru::DataFrame] other_df Another DataFrame on which the join is
# to be performed.
# @param [Hash] opts Options Hash
# @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
# @option :on [Array] The columns on which the join is to be performed.
@@ -1563,15 +1592,15 @@
# right = Daru::DataFrame.new({
# :id => [1,2,3,4],
# :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
# })
# left.join(right, how: :inner, on: [:name])
- # #=>
+ # #=>
# ##<Daru::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
- # # id_1 name id_2
- # # 0 1 Pirate 2
- # # 1 3 Ninja 4
+ # # id_1 name id_2
+ # # 0 1 Pirate 2
+ # # 1 3 Ninja 4
def join(other_df, opts = {})
  # Delegates to Daru::Core::Merge.join, passing this frame first, then the
  # other frame and the options hash unchanged.
  Daru::Core::Merge.join self, other_df, opts
end
@@ -1584,11 +1613,11 @@
# with
# ds.one_to_many([:id], "child_%v_%n"
# the field of first parameters will be copied verbatim
# to new dataset, and fields which responds to second
# pattern will be added one case for each different %n.
- #
+ #
# @example
# cases=[
# ['1','george','red',10,'blue',20,nil,nil],
# ['2','fred','green',15,'orange',30,'white',20],
# ['3','alfred',nil,nil,nil,nil,nil,nil]
@@ -1605,13 +1634,13 @@
def one_to_many(parent_fields, pattern)
re = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
ds_vars = parent_fields.dup
vars = []
max_n = 0
- h = parent_fields.inject({}) { |a,v|
+ h = parent_fields.inject({}) { |a,v|
a[v] = Daru::Vector.new([])
- a
+ a
}
# Adding _row_id
h['_col_id'] = Daru::Vector.new([])
ds_vars.push('_col_id')
@@ -1661,16 +1690,16 @@
i += 1
}
end
# Create SQL, based on a given Dataset
- #
+ #
# == Arguments
- #
+ #
# * table - String specifying name of the table that will be created in SQL.
# * charset - Character set. Default is "UTF8".
- #
+ #
# @example
#
# ds = Daru::DataFrame.new({
# :id => Daru::Vector.new([1,2,3,4,5]),
# :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
@@ -1715,21 +1744,21 @@
# Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
def to_nmatrix
numerics_as_arrays = []
each_vector do |vector|
- numerics_as_arrays << vector.to_a if(vector.type == :numeric and
+ numerics_as_arrays << vector.to_a if(vector.type == :numeric and
vector.missing_positions.size == 0)
end
numerics_as_arrays.transpose.to_nm
end
-
+
# Converts the DataFrame into an array of hashes where key is vector name
- # and value is the corresponding element. The 0th index of the array contains
- # the array of hashes while the 1th index contains the indexes of each row
- # of the dataframe. Each element in the index array corresponds to its row
+ # and value is the corresponding element. The 0th index of the array contains
+ # the array of hashes while the 1th index contains the indexes of each row
+ # of the dataframe. Each element in the index array corresponds to its row
# in the array of hashes, which has the same index.
def to_a
arry = [[],[]]
self.each_row do |row|
arry[0] << row.to_hash
@@ -1760,14 +1789,14 @@
hsh
end
# Convert to html for IRuby.
def to_html threshold=30
- html = "<table>" +
+ html = "<table>" +
"<tr>" +
- "<th colspan=\"#{@vectors.size+1}\">" +
- "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
+ "<th colspan=\"#{@vectors.size+1}\">" +
+ "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
"</th>" +
"</tr>"
html +='<tr><th></th>'
@vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
html += '</tr>'
@@ -1789,11 +1818,11 @@
last_index = @index.to_a.last
last_row = self.row[last_index]
html += '<tr>'
html += "<td>" + last_index.to_s + "</td>"
(0..(ncols - 1)).to_a.each do |i|
- html += '<td>' + last_row[i].to_s + '</td>'
+ html += '<td>' + last_row[i].to_s + '</td>'
end
html += '</tr>'
break
end
end
@@ -1823,37 +1852,37 @@
# Write this DataFrame to a CSV file.
#
# == Arguments
#
# * filename - Path of CSV file where the DataFrame is to be saved.
- #
+ #
# == Options
- #
+ #
# * convert_comma - If set to *true*, will convert any commas in any
# of the data to full stops ('.').
- # All the options accepted by CSV.read() can also be passed into this
+ # All the options accepted by CSV.read() can also be passed into this
# function.
def write_csv(filename, opts={})
  # Pure delegation: the IO layer receives this DataFrame, the target path
  # and the caller's options unchanged, and its result is returned as-is.
  Daru::IO.dataframe_write_csv(self, filename, opts)
end
# Write this dataframe to an Excel Spreadsheet
- #
+ #
# == Arguments
- #
+ #
# * filename - The path of the file where the DataFrame should be written.
def write_excel(filename, opts={})
  # Pure delegation: the IO layer receives this DataFrame, the target path
  # and the caller's options unchanged, and its result is returned as-is.
  Daru::IO.dataframe_write_excel(self, filename, opts)
end
# Insert each case of the Dataset on the selected table
#
# == Arguments
- #
+ #
# * dbh - DBI database connection object.
# * query - Query string.
- #
+ #
# == Usage
#
# ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
# ds.write_sql(dbh,"test")
@@ -1867,27 +1896,27 @@
Daru::IO.save self, filename
end
# Serializes this DataFrame with Marshal.dump.
#
# The dumped payload is a Hash with four keys: :data (the @data object),
# :index (@index converted with to_a), :order (@vectors converted with to_a)
# and :name (@name). The +depth+ argument is accepted but never referenced
# in the body.
#
# Returns whatever Marshal.dump returns for that Hash.
def _dump depth
Marshal.dump({
- data: @data,
- index: @index.to_a,
+ data: @data,
+ index: @index.to_a,
order: @vectors.to_a,
name: @name
})
end
# Rebuilds a DataFrame from a marshalled payload.
#
# +data+ is passed to Marshal.load; the result is indexed with :data,
# :index, :order and :name, the same keys the instance-level _dump writes.
# h[:data] becomes the first argument to Daru::DataFrame.new and the other
# three are passed as the index:/order:/name: options.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects, so +data+
# should only ever come from a trusted source - confirm callers.
def self._load data
h = Marshal.load data
- Daru::DataFrame.new(h[:data],
- index: h[:index],
+ Daru::DataFrame.new(h[:data],
+ index: h[:index],
order: h[:order],
name: h[:name])
end
# Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
- #
+ #
# == Usage
# df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
# df.recast a: :nmatrix, c: :nmatrix
def recast opts={}
opts.each do |vector_name, dtype|
@@ -1906,21 +1935,21 @@
end
# Pretty print in a nice table format for the command line (irb/pry/iruby)
def inspect spacing=10, threshold=15
longest = [@name.to_s.size,
- (@vectors.map(&:to_s).map(&:size).max || 0),
+ (@vectors.map(&:to_s).map(&:size).max || 0),
(@index .map(&:to_s).map(&:size).max || 0),
(@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
name = @name || 'nil'
content = ""
longest = spacing if longest > spacing
formatter = "\n"
(@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
- content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
+ content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
name.to_s + " @size = " + @size.to_s + ">"
content += sprintf formatter, "" , *@vectors.map(&:to_s)
row_num = 1
self.each_row_with_index do |row, index|
@@ -1943,14 +1972,14 @@
def where(bool_array)
  # Pure delegation: the query module gets this DataFrame plus the caller's
  # array, and whatever it returns is handed back untouched.
  Daru::Core::Query.df_where(self, bool_array)
end
# Equality check against +other+.
#
# The conditions are joined with `and`, so they are evaluated top to bottom
# and evaluation stops at the first one that fails:
#   1. both objects have the same class,
#   2. @size equals other.size,
#   3. @index equals other.index,
#   4. @vectors equals other.vectors,
#   5. for every name in @vectors.to_a, self[name] == other[name].
# Because of (1), the accessors on +other+ are only called when it is of the
# same class as the receiver.
def == other
- self.class == other.class and
- @size == other.size and
+ self.class == other.class and
+ @size == other.size and
@index == other.index and
- @vectors == other.vectors and
+ @vectors == other.vectors and
@vectors.to_a.all? { |v| self[v] == other[v] }
end
def method_missing(name, *args, &block)
if md = name.match(/(.+)\=/)
@@ -1975,13 +2004,13 @@
def quick_sort(vector_order, index, by, ascending)
  # Entry point for recursive_quick_sort: the caller's four arguments are
  # forwarded unchanged, with the bounds fixed at 0 and @size - 1.
  lower_bound = 0
  upper_bound = @size - 1
  recursive_quick_sort(vector_order, index, by, ascending, lower_bound, upper_bound)
end
# == Arguments
- #
- # vector_order -
- # index -
+ #
+ # vector_order -
+ # index -
# by -
# ascending -
# left_lower -
# right_upper -
def recursive_quick_sort vector_order, index, by, ascending, left_lower, right_upper
@@ -2118,11 +2147,11 @@
names.each do |name|
new_vcs << @data[@vectors[name]].to_a
end
order = names.is_a?(Array) ? Daru::Index.new(names) : names
- Daru::DataFrame.new(new_vcs, order: order,
+ Daru::DataFrame.new(new_vcs, order: order,
index: @index, name: @name)
end
end
def access_row *names
@@ -2132,20 +2161,20 @@
pos = @index[names]
if pos.is_a?(Integer)
return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
else
new_rows = pos.map { |tuple| populate_row_for(tuple) }
-
+
if !location.is_a?(Range) and names.size < @index.width
pos = pos.drop_left_level names.size
end
Daru::DataFrame.rows(
new_rows, order: @vectors, name: @name, index: pos)
end
else
- if names[1].nil?
+ if names[1].nil?
names = @index[location]
if names.is_a?(Numeric)
row = []
@data.each do |vector|
row << vector[location]
@@ -2157,27 +2186,27 @@
# Access multiple rows
rows = []
names.each do |name|
rows << self.row[name].to_a
end
-
- Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
+
+ Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
end
end
def populate_row_for(pos)
  # One result element per entry of @data, in order: whatever that entry
  # returns when indexed with +pos+.
  @data.map { |column| column[pos] }
end
def insert_or_modify_vector name, vector
- name = name[0] unless @vectors.is_a?(MultiIndex)
+ name = name[0] unless @vectors.is_a?(MultiIndex)
v = nil
if @index.empty?
- v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
+ v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
@index = v.index
assign_or_add_vector name, v
set_size
@data.map! do |v|
@@ -2215,24 +2244,24 @@
# Stores +v+ in @data under the vector label +name+, either overwriting an
# existing slot or registering +name+ as a new label first.
#
# Which path is taken depends on what @vectors[name] returns (see the three
# branches below). Mutates @data and, in the last branch, may reassign
# @vectors.
def assign_or_add_vector name, v
#FIXME: fix this jugaad. need to make changes in Indexing itself.
pos = @vectors[name]
# Branch 1: the lookup did not return a Daru::Index and came back equal to
# +name+ itself, and either +name+ is a known label or pos is an Integer
# smaller than @data.size. In that case pos is used directly as the slot.
- if !pos.kind_of?(Daru::Index) and pos == name and
+ if !pos.kind_of?(Daru::Index) and pos == name and
(@vectors.include?(name) or (pos.is_a?(Integer) and pos < @data.size))
@data[pos] = v
# Branch 2: the lookup returned a Daru::Index; every entry of it is resolved
# through @vectors again and the same +v+ object is stored in each slot.
elsif pos.kind_of?(Daru::Index)
pos.each do |p|
@data[@vectors[p]] = v
end
# Branch 3: anything else. Add +name+ to @vectors if it is not already
# included, then store +v+ at the position @vectors now reports for it.
else
@vectors = @vectors | [name] if !@vectors.include?(name)
@data[@vectors[name]] = v
- end
+ end
end
- def insert_or_modify_row name, vector
+ def insert_or_modify_row name, vector
if index.is_a?(MultiIndex)
# TODO
else
name = name[0]
v =
@@ -2262,11 +2291,11 @@
@data << Daru::Vector.new([], name: set_name(name), index: @index)
end
end
# Sanity-checks that the labels agree with the stored data.
#
# Raises IndexError in two situations; each check is skipped when the
# instance variables it needs are nil/false:
#   * @vectors is set and its size differs from @data.size;
#   * @index and @data[0] are both set and @index.size differs from
#     @data[0].size (only the first entry of @data is inspected).
# Returns nil when neither check fires.
def validate_labels
# Check 1: number of vector names vs. number of entries in @data.
- raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
+ raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
@vectors and @vectors.size != @data.size
# Check 2: number of index entries vs. length of the first entry in @data.
raise IndexError, "Expected number of indexes same as number of rows" if
@index and @data[0] and @index.size != @data[0].size
end
@@ -2328,11 +2357,11 @@
def set_name(potential_name)
  # Anything that is not an Array is returned as the very same object.
  return potential_name unless potential_name.is_a?(Array)

  # Array names are collapsed into a single String via Array#join.
  potential_name.join
end
def symbolize arry
- symbolized_arry =
+ symbolized_arry =
if arry.all? { |e| e.is_a?(Array) }
arry.map do |sub_arry|
sub_arry.map do |e|
e.is_a?(Numeric) ? e : e.to_sym
end
@@ -2342,6 +2371,6 @@
end
symbolized_arry
end
end
-end
\ No newline at end of file
+end