lib/daru/dataframe.rb in daru-0.1.5 vs lib/daru/dataframe.rb in daru-0.1.6

- old
+ new

@@ -18,11 +18,11 @@ # object and pre-condition it (for example use the `convert` or # `header_convert` methods). # # == Arguments # - # * path - Path of the file to load specified as a String. + # * path - Local path / Remote URL of the file to load specified as a String. # # == Options # # Accepts the same options as the Daru::DataFrame constructor and CSV.open() # and uses those to eventually construct the resulting DataFrame. @@ -61,19 +61,24 @@ Daru::IO.from_excel path, opts, &block end # Read a database query and returns a Dataset # - # @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query + # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database. # @param query [String] The query to be executed # # @return A dataframe containing the data resulting from the query # # USE: # # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password") # Daru::DataFrame.from_sql(dbh, "SELECT * FROM test") + # + # #Alternatively + # + # require 'dbi' + # Daru::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test") def from_sql dbh, query Daru::IO.from_sql dbh, query end # Read a dataframe from AR::Relation @@ -110,10 +115,53 @@ # df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6] def from_plaintext path, fields Daru::IO.from_plaintext path, fields end + # Read the table data from a remote html file. Please note that this module + # works only for static table elements on a HTML page, and won't work in + # cases where the data is being loaded into the HTML table by Javascript. + # + # By default - all <th> tag elements in the first proper row are considered + # as the order, and all the <th> tag elements in the first column are + # considered as the index. + # + # == Arguments + # + # * path [String] - URL of the target HTML file. + # * fields [Hash] - + # + # +:match+ - A *String* to match and choose a particular table(s) from multiple tables of a HTML page. 
+ # + # +:order+ - An *Array* which would act as the user-defined order, to override the parsed *Daru::DataFrame*. + # + # +:index+ - An *Array* which would act as the user-defined index, to override the parsed *Daru::DataFrame*. + # + # +:name+ - A *String* that manually assigns a name to the scraped *Daru::DataFrame*, for user's preference. + # + # == Returns + # An Array of +Daru::DataFrame+s, with each dataframe corresponding to a + # HTML table on that webpage. + # + # == Usage + # dfs = Daru::DataFrame.from_html("http://www.moneycontrol.com/", match: "Sun Pharma") + # dfs.count + # # => 4 + # + # dfs.first + # # + # # => <Daru::DataFrame(5x4)> + # # Company Price Change Value (Rs + # # 0 Sun Pharma 502.60 -65.05 2,117.87 + # # 1 Reliance 1356.90 19.60 745.10 + # # 2 Tech Mahin 379.45 -49.70 650.22 + # # 3 ITC 315.85 6.75 621.12 + # # 4 HDFC 1598.85 50.95 553.91 + def from_html path, fields={} + Daru::IO.from_html path, fields + end + # Create DataFrame by specifying rows as an Array of Arrays or Array of # Daru::Vector objects. def rows source, opts={} raise SizeError, 'All vectors must have same length' \ unless source.all? 
{ |v| v.size == source.first.size } @@ -237,10 +285,52 @@ # # b a # # a 6 1 # # b 7 2 # # c 8 3 # # d 9 4 + # + # df = Daru::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man) + # + # # => + # # #<Daru::DataFrame: bat_man (4x2)> + # # 0 1 + # # 0 1 6 + # # 1 2 7 + # # 2 3 8 + # # 3 4 9 + # + # # Dataframe having Index name + # + # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a], + # index: Daru::Index.new([:a, :b, :c, :d], name: 'idx_name'), + # name: :spider_man) + # + # # => + # # <Daru::DataFrame:80766980 @name = spider_man @size = 4> + # # idx_name b a + # # a 6 1 + # # b 7 2 + # # c 8 3 + # # d 9 4 + # + # + # idx = Daru::Index.new [100, 99, 101, 1, 2], name: "s1" + # => #<Daru::Index(5): s1 {100, 99, 101, 1, 2}> + # + # df = Daru::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5], + # c: [11,22,33,44,55]}, + # order: [:a, :b, :c], + # index: idx) + # # => + # #<Daru::DataFrame(5x3)> + # # s1 a b c + # # 100 1 11 11 + # # 99 2 12 22 + # # 101 3 13 33 + # # 1 4 14 44 + # # 2 5 15 55 + def initialize source, opts={} # rubocop:disable Metrics/MethodLength vectors, index = opts[:order], opts[:index] # FIXME: just keyword args after Ruby 2.1 @data = [] @name = opts[:name] @@ -455,11 +545,11 @@ # * +vectors_to_dup+ - An Array specifying the names of Vectors to # be duplicated. Will duplicate the entire DataFrame if not specified. 
def dup vectors_to_dup=nil vectors_to_dup = @vectors.to_a unless vectors_to_dup - src = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup } + src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup } new_order = Daru::Index.new(vectors_to_dup) Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true end @@ -542,11 +632,11 @@ # df = Daru::DataFrame.new({ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7], # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8], # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7] # }, index: 11..18) - # df + # df.replace_values nil, Float::NAN # # => #<Daru::DataFrame(8x3)> # # a b c # # 11 1 a a # # 12 2 b NaN # # 13 3 NaN 3 @@ -677,11 +767,11 @@ # == Arguments # # * +axis+ - The axis to map over. Can be :vector (or :column) or :row. # Default to :vector. def map! axis=:vector, &block - if axis == :vector || axis == :column + if %i[vector column].include?(axis) map_vectors!(&block) elsif axis == :row map_rows!(&block) end end @@ -911,11 +1001,11 @@ end end # creates a new vector with the data of a given field which the block returns true def filter_vector vec, &block - Daru::Vector.new each_row.select(&block).map { |row| row[vec] } + Daru::Vector.new(each_row.select(&block).map { |row| row[vec] }) end # Iterates over each row and retains it in a new DataFrame if the block returns # true for that row. def filter_rows @@ -1029,11 +1119,11 @@ # TODO: remove next version alias :vector_missing_values :missing_values_rows def has_missing_data? - !!@data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) } + @data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) } end alias :flawed? :has_missing_data? deprecate :has_missing_data?, :include_values?, 2016, 10 deprecate :flawed?, :include_values?, 2016, 10 @@ -1117,11 +1207,11 @@ # df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']}) # df.any?(:row) do |row| # row[:a] < 3 and row[:b] == 'b' # end #=> true def any? 
axis=:vector, &block - if axis == :vector || axis == :column + if %i[vector column].include?(axis) @data.any?(&block) elsif axis == :row each_row do |row| return true if yield(row) end @@ -1139,11 +1229,11 @@ # df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']}) # df.all?(:row) do |row| # row[:a] < 10 # end #=> true def all? axis=:vector, &block - if axis == :vector || axis == :column + if %i[vector column].include?(axis) @data.all?(&block) elsif axis == :row each_row.all?(&block) else raise ArgumentError, "Unidentified axis #{axis}" @@ -1375,11 +1465,11 @@ # # df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] }) # df.rename_vectors :a => :alpha, :c => :gamma # df.vectors.to_a #=> [:alpha, :b, :gamma] def rename_vectors name_map - existing_targets = name_map.select { |k,v| k != v }.values & vectors.to_a + existing_targets = name_map.reject { |k,v| k == v }.values & vectors.to_a delete_vectors(*existing_targets) new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v } self.vectors = Daru::Index.new new_names end @@ -1406,23 +1496,20 @@ order = Index.new(numeric_vectors) Daru::DataFrame.new(arry, clone: cln, order: order, index: @index) end - # Generate a summary of this DataFrame with ReportBuilder. 
- def summary(method=:to_text) - ReportBuilder.new(no_title: true).add(self).send(method) - end - - def report_building(b) # :nodoc: # - b.section(name: @name) do |g| - g.text "Number of rows: #{nrows}" - @vectors.each do |v| - g.text "Element:[#{v}]" - g.parse_element(self[v]) - end + # Generate a summary of this DataFrame based on individual vectors in the DataFrame + # @return [String] String containing the summary of the DataFrame + def summary + summary = "= #{name}" + summary << "\n Number of rows: #{nrows}" + @vectors.each do |v| + summary << "\n Element:[#{v}]\n" + summary << self[v].summary(1) end + summary end # Sorts a dataframe (ascending/descending) in the given priority sequence of # vectors, with or without a block. # @@ -1781,21 +1868,43 @@ .each_with_index .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h end # Convert to html for IRuby. - def to_html threshold=30 + def to_html(threshold=30) + table_thead = to_html_thead + table_tbody = to_html_tbody(threshold) path = if index.is_a?(MultiIndex) File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__) else File.expand_path('../iruby/templates/dataframe.html.erb', __FILE__) end ERB.new(File.read(path).strip).result(binding) end + def to_html_thead + table_thead_path = + if index.is_a?(MultiIndex) + File.expand_path('../iruby/templates/dataframe_mi_thead.html.erb', __FILE__) + else + File.expand_path('../iruby/templates/dataframe_thead.html.erb', __FILE__) + end + ERB.new(File.read(table_thead_path).strip).result(binding) + end + + def to_html_tbody(threshold=30) + table_tbody_path = + if index.is_a?(MultiIndex) + File.expand_path('../iruby/templates/dataframe_mi_tbody.html.erb', __FILE__) + else + File.expand_path('../iruby/templates/dataframe_tbody.html.erb', __FILE__) + end + ERB.new(File.read(table_tbody_path).strip).result(binding) + end + def to_s - to_html + "#<#{self.class}#{': ' + @name.to_s if @name}(#{nrows}x#{ncols})>" end # Method for updating the metadata (i.e. 
missing value positions) of the # after assignment/deletion etc. are complete. This is provided so that # time is not wasted in creating the metadata for the vector each time @@ -1898,18 +2007,17 @@ ) end # Pretty print in a nice table format for the command line (irb/pry/iruby) def inspect spacing=10, threshold=15 - row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a name_part = @name ? ": #{@name} " : '' "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" + Formatters::Table.format( each_row.lazy, row_headers: row_headers, - headers: vectors, + headers: headers, threshold: threshold, spacing: spacing ) end @@ -2000,12 +2108,36 @@ .rename(cat) .delete_vector cat_name end end + # returns array of row tuples at given index(s) + def access_row_tuples_by_indexs *indexes + positions = @index.pos(*indexes) + + return populate_row_for(positions) if positions.is_a? Numeric + + res = [] + new_rows = @data.map { |vec| vec[*indexes] } + indexes.each do |index| + tuples = [] + new_rows.map { |row| tuples += [row[index]] } + res << tuples + end + res + end + private + def headers + Daru::Index.new(Array(index.name) + @vectors.to_a) + end + + def row_headers + index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a + end + def convert_categorical_vectors names names.map do |n| next unless self[n].category? 
old = [n, self[n]] self[n] = Daru::Vector.new(self[n].to_ints) @@ -2032,30 +2164,30 @@ return val if val.is_a?(Daru::Vector) raise TypeError, "Every iteration must return Daru::Vector not #{val.class}" end def dispatch_to_axis(axis, method, *args, &block) - if axis == :vector || axis == :column + if %i[vector column].include?(axis) send("#{method}_vector", *args, &block) elsif axis == :row send("#{method}_row", *args, &block) else raise ArgumentError, "Unknown axis #{axis}" end end def dispatch_to_axis_pl(axis, method, *args, &block) - if axis == :vector || axis == :column + if %i[vector column].include?(axis) send("#{method}_vectors", *args, &block) elsif axis == :row send("#{method}_rows", *args, &block) else raise ArgumentError, "Unknown axis #{axis}" end end - AXES = [:row, :vector].freeze + AXES = %i[row vector].freeze def extract_axis names, default=:vector if AXES.include?(names.last) names.pop else @@ -2063,11 +2195,11 @@ end end def access_vector *names if names.first.is_a?(Range) - dup(@vectors[names.first]) + dup(@vectors.subset(names.first)) elsif @vectors.is_a?(MultiIndex) access_vector_multi_index(*names) else access_vector_single_index(*names) end @@ -2085,18 +2217,22 @@ Daru::DataFrame.new(new_vectors, index: @index, order: pos) end def access_vector_single_index *names if names.count < 2 - pos = @vectors[names.first] + begin + pos = @vectors.is_a?(Daru::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first) + rescue IndexError + raise IndexError, "Specified vector #{names.first} does not exist" + end return @data[pos] if pos.is_a?(Numeric) names = pos end - new_vectors = names.map { |name| [name, @data[@vectors[name]]] }.to_h + new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h order = names.is_a?(Array) ? Daru::Index.new(names) : names Daru::DataFrame.new(new_vectors, order: order, index: @index, name: @name) end @@ -2124,11 +2260,11 @@ name = name[0] unless @vectors.is_a?(MultiIndex) if @index.empty? 
insert_vector_in_empty name, vector else - vec = prepare_vector_for_insert name, vector + vec = prepare_for_insert name, vector assign_or_add_vector name, vec end end @@ -2171,31 +2307,41 @@ set_size @data.map! { |v| v.empty? ? v.reindex(@index) : v } end - def prepare_vector_for_insert name, vector - if vector.is_a?(Daru::Vector) - # so that index-by-index assignment is avoided when possible. - return vector.dup if vector.index == @index - - Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v| - @index.each do |idx| - v[idx] = vector.index.include?(idx) ? vector[idx] : nil - end - } + def prepare_for_insert name, arg + if arg.is_a? Daru::Vector + prepare_vector_for_insert name, arg + elsif arg.respond_to?(:to_a) + prepare_enum_for_insert name, arg else - # FIXME: No spec checks this case... And SizeError is not a thing - zverok, 2016-05-08 - if @size != vector.size - raise SizeError, - "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" + prepare_value_for_insert name, arg + end + end + + def prepare_vector_for_insert name, vector + # so that index-by-index assignment is avoided when possible. + return vector.dup if vector.index == @index + Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v| + @index.each do |idx| + v[idx] = vector.index.include?(idx) ? 
vector[idx] : nil end + } + end - Daru::Vector.new(vector, name: coerce_name(name), index: @index) + def prepare_enum_for_insert name, enum + if @size != enum.size + raise "Specified vector of length #{enum.size} cannot be inserted in DataFrame of size #{@size}" end + Daru::Vector.new(enum, name: coerce_name(name), index: @index) end + def prepare_value_for_insert name, value + Daru::Vector.new(Array(value) * @size, name: coerce_name(name), index: @index) + end + def insert_or_modify_row indexes, vector vector = coerce_vector vector raise SizeError, 'Vector length should match row length' if vector.size != @vectors.size @@ -2274,11 +2420,13 @@ raise ArgumentError, 'All objects in data source should be same class' \ unless source.map(&:class).uniq.size == 1 case source.first when Array + vectors ||= (0..source.size-1).to_a initialize_from_array_of_arrays source, vectors, index, opts when Vector + vectors ||= (0..source.size-1).to_a initialize_from_array_of_vectors source, vectors, index, opts when Hash initialize_from_array_of_hashes source, vectors, index, opts else raise ArgumentError, "Can't create DataFrame from #{source}"