lib/daru/dataframe.rb in daru-0.1.5 vs lib/daru/dataframe.rb in daru-0.1.6
- old
+ new
@@ -18,11 +18,11 @@
# object and pre-condition it (for example use the `convert` or
# `header_convert` methods).
#
# == Arguments
#
- # * path - Path of the file to load specified as a String.
+ # * path - Local path / Remote URL of the file to load specified as a String.
#
# == Options
#
# Accepts the same options as the Daru::DataFrame constructor and CSV.open()
# and uses those to eventually construct the resulting DataFrame.
@@ -61,19 +61,24 @@
Daru::IO.from_excel path, opts, &block
end
# Execute a database query and return the result as a DataFrame
#
- # @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query
+ # @param dbh [DBI::DatabaseHandle, String] A DBI connection, or the path to a SQLite3 database.
# @param query [String] The query to be executed
#
# @return A dataframe containing the data resulting from the query
#
# USE:
#
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
# Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
+ #
+ # #Alternatively
+ #
+ # require 'dbi'
+ # Daru::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
def from_sql(dbh, query)
  # Pure delegation: both arguments are handed to Daru::IO.from_sql untouched,
  # and whatever it returns is returned to the caller.
  Daru::IO.from_sql(dbh, query)
end
# Read a dataframe from AR::Relation
@@ -110,10 +115,53 @@
# df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
def from_plaintext(path, fields)
  # Pure delegation: path and fields are handed to Daru::IO.from_plaintext
  # untouched, and whatever it returns is returned to the caller.
  Daru::IO.from_plaintext(path, fields)
end
+ # Read the table data from a remote HTML file. Please note that this method
+ # works only for static table elements on an HTML page, and won't work in
+ # cases where the data is being loaded into the HTML table by JavaScript.
+ #
+ # By default - all <th> tag elements in the first proper row are considered
+ # as the order, and all the <th> tag elements in the first column are
+ # considered as the index.
+ #
+ # == Arguments
+ #
+ # * path [String] - URL of the target HTML file.
+ # * fields [Hash] - Optional settings; the recognised keys are:
+ #
+ # +:match+ - A *String* to match against, used to pick particular table(s) out of the multiple tables on an HTML page.
+ #
+ # +:order+ - An *Array* to use as the user-defined order, overriding the order parsed for the *Daru::DataFrame*.
+ #
+ # +:index+ - An *Array* to use as the user-defined index, overriding the index parsed for the *Daru::DataFrame*.
+ #
+ # +:name+ - A *String* that manually assigns a name to the scraped *Daru::DataFrame*.
+ #
+ # == Returns
+ # An Array of +Daru::DataFrame+s, with each dataframe corresponding to a
+ # HTML table on that webpage.
+ #
+ # == Usage
+ # dfs = Daru::DataFrame.from_html("http://www.moneycontrol.com/", match: "Sun Pharma")
+ # dfs.count
+ # # => 4
+ #
+ # dfs.first
+ # #
+ # # => <Daru::DataFrame(5x4)>
+ # # Company Price Change Value (Rs
+ # # 0 Sun Pharma 502.60 -65.05 2,117.87
+ # # 1 Reliance 1356.90 19.60 745.10
+ # # 2 Tech Mahin 379.45 -49.70 650.22
+ # # 3 ITC 315.85 6.75 621.12
+ # # 4 HDFC 1598.85 50.95 553.91
+ def from_html path, fields={}
+ Daru::IO.from_html path, fields
+ end
+
# Create DataFrame by specifying rows as an Array of Arrays or Array of
# Daru::Vector objects.
def rows source, opts={}
raise SizeError, 'All vectors must have same length' \
unless source.all? { |v| v.size == source.first.size }
@@ -237,10 +285,52 @@
# # b a
# # a 6 1
# # b 7 2
# # c 8 3
# # d 9 4
+ #
+ # df = Daru::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man)
+ #
+ # # =>
+ # # #<Daru::DataFrame: bat_man (4x2)>
+ # # 0 1
+ # # 0 1 6
+ # # 1 2 7
+ # # 2 3 8
+ # # 3 4 9
+ #
+ # # Dataframe having Index name
+ #
+ # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
+ # index: Daru::Index.new([:a, :b, :c, :d], name: 'idx_name'),
+ # name: :spider_man)
+ #
+ # # =>
+ # # <Daru::DataFrame:80766980 @name = spider_man @size = 4>
+ # # idx_name b a
+ # # a 6 1
+ # # b 7 2
+ # # c 8 3
+ # # d 9 4
+ #
+ #
+ # idx = Daru::Index.new [100, 99, 101, 1, 2], name: "s1"
+ # => #<Daru::Index(5): s1 {100, 99, 101, 1, 2}>
+ #
+ # df = Daru::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5],
+ # c: [11,22,33,44,55]},
+ # order: [:a, :b, :c],
+ # index: idx)
+ # # =>
+ # #<Daru::DataFrame(5x3)>
+ # # s1 a b c
+ # # 100 1 11 11
+ # # 99 2 12 22
+ # # 101 3 13 33
+ # # 1 4 14 44
+ # # 2 5 15 55
+
def initialize source, opts={} # rubocop:disable Metrics/MethodLength
vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
@data = []
@name = opts[:name]
@@ -455,11 +545,11 @@
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
# be duplicated. Will duplicate the entire DataFrame if not specified.
def dup vectors_to_dup=nil
vectors_to_dup = @vectors.to_a unless vectors_to_dup
- src = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup }
+ src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
new_order = Daru::Index.new(vectors_to_dup)
Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
end
@@ -542,11 +632,11 @@
# df = Daru::DataFrame.new({
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
# }, index: 11..18)
- # df
+ # df.replace_values nil, Float::NAN
# # => #<Daru::DataFrame(8x3)>
# # a b c
# # 11 1 a a
# # 12 2 b NaN
# # 13 3 NaN 3
@@ -677,11 +767,11 @@
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
# Defaults to :vector.
def map! axis=:vector, &block
- if axis == :vector || axis == :column
+ if %i[vector column].include?(axis)
map_vectors!(&block)
elsif axis == :row
map_rows!(&block)
end
end
@@ -911,11 +1001,11 @@
end
end
# Creates a new vector from the values of the given field, taking only the rows for which the block returns true
def filter_vector vec, &block
- Daru::Vector.new each_row.select(&block).map { |row| row[vec] }
+ Daru::Vector.new(each_row.select(&block).map { |row| row[vec] })
end
# Iterates over each row and retains it in a new DataFrame if the block returns
# true for that row.
def filter_rows
@@ -1029,11 +1119,11 @@
# TODO: remove next version
alias :vector_missing_values :missing_values_rows
def has_missing_data?
- !!@data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
+ @data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
end
alias :flawed? :has_missing_data?
deprecate :has_missing_data?, :include_values?, 2016, 10
deprecate :flawed?, :include_values?, 2016, 10
@@ -1117,11 +1207,11 @@
# df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
# df.any?(:row) do |row|
# row[:a] < 3 and row[:b] == 'b'
# end #=> true
def any? axis=:vector, &block
- if axis == :vector || axis == :column
+ if %i[vector column].include?(axis)
@data.any?(&block)
elsif axis == :row
each_row do |row|
return true if yield(row)
end
@@ -1139,11 +1229,11 @@
# df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
# df.all?(:row) do |row|
# row[:a] < 10
# end #=> true
def all? axis=:vector, &block
- if axis == :vector || axis == :column
+ if %i[vector column].include?(axis)
@data.all?(&block)
elsif axis == :row
each_row.all?(&block)
else
raise ArgumentError, "Unidentified axis #{axis}"
@@ -1375,11 +1465,11 @@
#
# df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
# df.rename_vectors :a => :alpha, :c => :gamma
# df.vectors.to_a #=> [:alpha, :b, :gamma]
def rename_vectors name_map
- existing_targets = name_map.select { |k,v| k != v }.values & vectors.to_a
+ existing_targets = name_map.reject { |k,v| k == v }.values & vectors.to_a
delete_vectors(*existing_targets)
new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v }
self.vectors = Daru::Index.new new_names
end
@@ -1406,23 +1496,20 @@
order = Index.new(numeric_vectors)
Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
end
- # Generate a summary of this DataFrame with ReportBuilder.
- def summary(method=:to_text)
- ReportBuilder.new(no_title: true).add(self).send(method)
- end
-
- def report_building(b) # :nodoc: #
- b.section(name: @name) do |g|
- g.text "Number of rows: #{nrows}"
- @vectors.each do |v|
- g.text "Element:[#{v}]"
- g.parse_element(self[v])
- end
+ # Generate a summary of this DataFrame based on individual vectors in the DataFrame
+ # @return [String] String containing the summary of the DataFrame
+ def summary
+ summary = "= #{name}"
+ summary << "\n Number of rows: #{nrows}"
+ @vectors.each do |v|
+ summary << "\n Element:[#{v}]\n"
+ summary << self[v].summary(1)
end
+ summary
end
# Sorts a dataframe (ascending/descending) in the given priority sequence of
# vectors, with or without a block.
#
@@ -1781,21 +1868,43 @@
.each_with_index
.map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
end
# Convert to html for IRuby.
- def to_html threshold=30
+ def to_html(threshold=30)
+ table_thead = to_html_thead
+ table_tbody = to_html_tbody(threshold)
path = if index.is_a?(MultiIndex)
File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__)
else
File.expand_path('../iruby/templates/dataframe.html.erb', __FILE__)
end
ERB.new(File.read(path).strip).result(binding)
end
+ def to_html_thead
+ table_thead_path =
+ if index.is_a?(MultiIndex)
+ File.expand_path('../iruby/templates/dataframe_mi_thead.html.erb', __FILE__)
+ else
+ File.expand_path('../iruby/templates/dataframe_thead.html.erb', __FILE__)
+ end
+ ERB.new(File.read(table_thead_path).strip).result(binding)
+ end
+
+ def to_html_tbody(threshold=30)
+ table_tbody_path =
+ if index.is_a?(MultiIndex)
+ File.expand_path('../iruby/templates/dataframe_mi_tbody.html.erb', __FILE__)
+ else
+ File.expand_path('../iruby/templates/dataframe_tbody.html.erb', __FILE__)
+ end
+ ERB.new(File.read(table_tbody_path).strip).result(binding)
+ end
+
def to_s
- to_html
+ "#<#{self.class}#{': ' + @name.to_s if @name}(#{nrows}x#{ncols})>"
end
# Method for updating the metadata (i.e. missing value positions) of the
# vectors after assignment/deletion etc. are complete. This is provided so that
# time is not wasted in creating the metadata for the vector each time
@@ -1898,18 +2007,17 @@
)
end
# Pretty print in a nice table format for the command line (irb/pry/iruby)
def inspect spacing=10, threshold=15
- row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
name_part = @name ? ": #{@name} " : ''
"#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" +
Formatters::Table.format(
each_row.lazy,
row_headers: row_headers,
- headers: vectors,
+ headers: headers,
threshold: threshold,
spacing: spacing
)
end
@@ -2000,12 +2108,36 @@
.rename(cat)
.delete_vector cat_name
end
end
+ # Returns an array of row tuples at the given index(es)
+ def access_row_tuples_by_indexs *indexes
+ positions = @index.pos(*indexes)
+
+ return populate_row_for(positions) if positions.is_a? Numeric
+
+ res = []
+ new_rows = @data.map { |vec| vec[*indexes] }
+ indexes.each do |index|
+ tuples = []
+ new_rows.map { |row| tuples += [row[index]] }
+ res << tuples
+ end
+ res
+ end
+
private
+ def headers
+ Daru::Index.new(Array(index.name) + @vectors.to_a)
+ end
+
+ def row_headers
+ index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
+ end
+
def convert_categorical_vectors names
names.map do |n|
next unless self[n].category?
old = [n, self[n]]
self[n] = Daru::Vector.new(self[n].to_ints)
@@ -2032,30 +2164,30 @@
return val if val.is_a?(Daru::Vector)
raise TypeError, "Every iteration must return Daru::Vector not #{val.class}"
end
def dispatch_to_axis(axis, method, *args, &block)
- if axis == :vector || axis == :column
+ if %i[vector column].include?(axis)
send("#{method}_vector", *args, &block)
elsif axis == :row
send("#{method}_row", *args, &block)
else
raise ArgumentError, "Unknown axis #{axis}"
end
end
def dispatch_to_axis_pl(axis, method, *args, &block)
- if axis == :vector || axis == :column
+ if %i[vector column].include?(axis)
send("#{method}_vectors", *args, &block)
elsif axis == :row
send("#{method}_rows", *args, &block)
else
raise ArgumentError, "Unknown axis #{axis}"
end
end
- AXES = [:row, :vector].freeze
+ AXES = %i[row vector].freeze
def extract_axis names, default=:vector
if AXES.include?(names.last)
names.pop
else
@@ -2063,11 +2195,11 @@
end
end
def access_vector *names
if names.first.is_a?(Range)
- dup(@vectors[names.first])
+ dup(@vectors.subset(names.first))
elsif @vectors.is_a?(MultiIndex)
access_vector_multi_index(*names)
else
access_vector_single_index(*names)
end
@@ -2085,18 +2217,22 @@
Daru::DataFrame.new(new_vectors, index: @index, order: pos)
end
def access_vector_single_index *names
if names.count < 2
- pos = @vectors[names.first]
+ begin
+ pos = @vectors.is_a?(Daru::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
+ rescue IndexError
+ raise IndexError, "Specified vector #{names.first} does not exist"
+ end
return @data[pos] if pos.is_a?(Numeric)
names = pos
end
- new_vectors = names.map { |name| [name, @data[@vectors[name]]] }.to_h
+ new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
order = names.is_a?(Array) ? Daru::Index.new(names) : names
Daru::DataFrame.new(new_vectors, order: order,
index: @index, name: @name)
end
@@ -2124,11 +2260,11 @@
name = name[0] unless @vectors.is_a?(MultiIndex)
if @index.empty?
insert_vector_in_empty name, vector
else
- vec = prepare_vector_for_insert name, vector
+ vec = prepare_for_insert name, vector
assign_or_add_vector name, vec
end
end
@@ -2171,31 +2307,41 @@
set_size
@data.map! { |v| v.empty? ? v.reindex(@index) : v }
end
- def prepare_vector_for_insert name, vector
- if vector.is_a?(Daru::Vector)
- # so that index-by-index assignment is avoided when possible.
- return vector.dup if vector.index == @index
-
- Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v|
- @index.each do |idx|
- v[idx] = vector.index.include?(idx) ? vector[idx] : nil
- end
- }
+ def prepare_for_insert name, arg
+ if arg.is_a? Daru::Vector
+ prepare_vector_for_insert name, arg
+ elsif arg.respond_to?(:to_a)
+ prepare_enum_for_insert name, arg
else
- # FIXME: No spec checks this case... And SizeError is not a thing - zverok, 2016-05-08
- if @size != vector.size
- raise SizeError,
- "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}"
+ prepare_value_for_insert name, arg
+ end
+ end
+
+ def prepare_vector_for_insert name, vector
+ # so that index-by-index assignment is avoided when possible.
+ return vector.dup if vector.index == @index
+ Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v|
+ @index.each do |idx|
+ v[idx] = vector.index.include?(idx) ? vector[idx] : nil
end
+ }
+ end
- Daru::Vector.new(vector, name: coerce_name(name), index: @index)
+ def prepare_enum_for_insert name, enum
+ if @size != enum.size
+ raise "Specified vector of length #{enum.size} cannot be inserted in DataFrame of size #{@size}"
end
+ Daru::Vector.new(enum, name: coerce_name(name), index: @index)
end
+ def prepare_value_for_insert name, value
+ Daru::Vector.new(Array(value) * @size, name: coerce_name(name), index: @index)
+ end
+
def insert_or_modify_row indexes, vector
vector = coerce_vector vector
raise SizeError, 'Vector length should match row length' if
vector.size != @vectors.size
@@ -2274,11 +2420,13 @@
raise ArgumentError, 'All objects in data source should be same class' \
unless source.map(&:class).uniq.size == 1
case source.first
when Array
+ vectors ||= (0..source.size-1).to_a
initialize_from_array_of_arrays source, vectors, index, opts
when Vector
+ vectors ||= (0..source.size-1).to_a
initialize_from_array_of_vectors source, vectors, index, opts
when Hash
initialize_from_array_of_hashes source, vectors, index, opts
else
raise ArgumentError, "Can't create DataFrame from #{source}"