dataframe.rb in daru-0.1.6

- old
+ new

@@ -18,11 +18,11 @@
       # object and pre-condition it (for example use the `convert` or
       # `header_convert` methods).
       #
       # == Arguments
       #
-      # * path - Path of the file to load specified as a String.
+      # * path - Local path / Remote URL of the file to load specified as a String.
       #
       # == Options
       #
       # Accepts the same options as the Daru::DataFrame constructor and CSV.open()
       # and uses those to eventually construct the resulting DataFrame.
@@ -61,19 +61,24 @@
         Daru::IO.from_excel path, opts, &block
       end
 
       # Executes a database query and returns the result as a DataFrame
       #
-      # @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query
+      # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR path to a SQLite3 database.
       # @param query [String] The query to be executed
       #
       # @return A dataframe containing the data resulting from the query
       #
       # USE:
       #
       #  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
       #  Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
+      #
+      #  #Alternatively
+      #
+      #  require 'dbi'
+      #  Daru::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
       def from_sql(dbh, query)
         # Pure delegation: both arguments are handed to Daru::IO unchanged.
         Daru::IO.from_sql(dbh, query)
       end
 
       # Read a dataframe from AR::Relation
@@ -110,10 +115,53 @@
       #   df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
       def from_plaintext(path, fields)
         # Pure delegation: both arguments are handed to Daru::IO unchanged.
         Daru::IO.from_plaintext(path, fields)
       end
 
+      # Read the table data from a remote html file. Please note that this module
+      # works only for static table elements on a HTML page, and won't work in
+      # cases where the data is being loaded into the HTML table by Javascript.
+      #
+      # By default - all <th> tag elements in the first proper row are considered
+      # as the order, and all the <th> tag elements in the first column are
+      # considered as the index.
+      #
+      # == Arguments
+      #
+      # * path [String] - URL of the target HTML file.
+      # * fields [Hash] -
+      #
+      #   +:match+ - A *String* to match and choose a particular table(s) from multiple tables of a HTML page.
+      #
+      #   +:order+ - An *Array* which would act as the user-defined order, to override the parsed *Daru::DataFrame*.
+      #
+      #   +:index+ - An *Array* which would act as the user-defined index, to override the parsed *Daru::DataFrame*.
+      #
+      #   +:name+ - A *String* that manually assigns a name to the scraped *Daru::DataFrame*, for user's preference.
+      #
+      # == Returns
+      # An Array of +Daru::DataFrame+s, with each dataframe corresponding to a
+      # HTML table on that webpage.
+      #
+      # == Usage
+      #   dfs = Daru::DataFrame.from_html("http://www.moneycontrol.com/", match: "Sun Pharma")
+      #   dfs.count
+      #   # => 4
+      #
+      #   dfs.first
+      #   #
+      #   # => <Daru::DataFrame(5x4)>
+      #   #          Company      Price     Change Value (Rs
+      #   #     0 Sun Pharma     502.60     -65.05   2,117.87
+      #   #     1   Reliance    1356.90      19.60     745.10
+      #   #     2 Tech Mahin     379.45     -49.70     650.22
+      #   #     3        ITC     315.85       6.75     621.12
+      #   #     4       HDFC    1598.85      50.95     553.91
+      def from_html path, fields={}
+        Daru::IO.from_html path, fields
+      end
+
       # Create DataFrame by specifying rows as an Array of Arrays or Array of
       # Daru::Vector objects.
       def rows source, opts={}
         raise SizeError, 'All vectors must have same length' \
           unless source.all? { |v| v.size == source.first.size }
@@ -237,10 +285,52 @@
     #   #             b          a
     #   #  a          6          1
     #   #  b          7          2
     #   #  c          8          3
     #   #  d          9          4
+    #
+    #   df = Daru::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man)
+    #
+    #   # =>
+    #   # #<Daru::DataFrame: bat_man (4x2)>
+    #   #             0          1
+    #   #  0          1          6
+    #   #  1          2          7
+    #   #  2          3          8
+    #   #  3          4          9
+    #
+    #   # Dataframe having Index name
+    #
+    #   df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
+    #     index: Daru::Index.new([:a, :b, :c, :d], name: 'idx_name'),
+    #     name: :spider_man)
+    #
+    #   # =>
+    #   # <Daru::DataFrame:80766980 @name = spider_man @size = 4>
+    #   # idx_name            b          a
+    #   #        a          6          1
+    #   #        b          7          2
+    #   #        c          8          3
+    #   #        d          9          4
+    #
+    #
+    #   idx = Daru::Index.new [100, 99, 101, 1, 2], name: "s1"
+    #   => #<Daru::Index(5): s1 {100, 99, 101, 1, 2}>
+    #
+    #   df = Daru::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5],
+    #     c: [11,22,33,44,55]},
+    #     order: [:a, :b, :c],
+    #     index: idx)
+    #    # =>
+    #    #<Daru::DataFrame(5x3)>
+    #    #   s1   a   b   c
+    #    #  100   1  11  11
+    #    #   99   2  12  22
+    #    #  101   3  13  33
+    #    #    1   4  14  44
+    #    #    2   5  15  55
+
     def initialize source, opts={} # rubocop:disable Metrics/MethodLength
       vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
       @data = []
       @name = opts[:name]
 
@@ -455,11 +545,11 @@
     # * +vectors_to_dup+ - An Array specifying the names of Vectors to
     # be duplicated. Will duplicate the entire DataFrame if not specified.
     def dup vectors_to_dup=nil
       # With no explicit list, every vector name of this DataFrame is used.
       vectors_to_dup = @vectors.to_a unless vectors_to_dup
 
       # Each requested vector is located in @data by name and duplicated; the
       # removed line looked the slot up with @vectors[vec], the added line
       # uses @vectors.pos(vec).
-      src = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup }
+      src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
       new_order = Daru::Index.new(vectors_to_dup)
 
       # The copy receives a duplicated index, the same name and clone: true.
       Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
     end
 
@@ -542,11 +632,11 @@
     #   df = Daru::DataFrame.new({
     #     a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
     #     b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
     #     c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
     #   }, index: 11..18)
-    #   df
+    #   df.replace_values nil, Float::NAN
     #   # => #<Daru::DataFrame(8x3)>
     #   #       a   b   c
     #   #   11   1   a   a
     #   #   12   2   b NaN
     #   #   13   3 NaN   3
@@ -677,11 +767,11 @@
     # == Arguments
     #
     # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
     # Default to :vector.
     def map! axis=:vector, &block
       # :column is accepted as an alias for :vector. There is no else branch,
       # so any other axis value runs neither mapper and the method returns nil.
-      if axis == :vector || axis == :column
+      if %i[vector column].include?(axis)
         map_vectors!(&block)
       elsif axis == :row
         map_rows!(&block)
       end
     end
@@ -911,11 +1001,11 @@
       end
     end
 
     # Creates a new vector from the values of the given field in the rows for which the block returns true
     def filter_vector vec, &block
       # each_row.select(&block) keeps the rows the block accepts; the map then
       # pulls the value stored under vec out of each kept row. The added line
       # only wraps the constructor argument in parentheses.
-      Daru::Vector.new each_row.select(&block).map { |row| row[vec] }
+      Daru::Vector.new(each_row.select(&block).map { |row| row[vec] })
     end
 
     # Iterates over each row and retains it in a new DataFrame if the block returns
     # true for that row.
     def filter_rows
@@ -1029,11 +1119,11 @@
 
     # TODO: remove next version
     alias :vector_missing_values :missing_values_rows
 
     def has_missing_data?
       # True when at least one vector in @data reports that it includes any of
       # the values listed in Daru::MISSING_VALUES. The added line drops the
       # leading `!!` from the removed line.
-      !!@data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
+      @data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
     end
     alias :flawed? :has_missing_data?
     deprecate :has_missing_data?, :include_values?, 2016, 10
     deprecate :flawed?, :include_values?, 2016, 10
 
@@ -1117,11 +1207,11 @@
     #   df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
     #   df.any?(:row) do |row|
     #     row[:a] < 3 and row[:b] == 'b'
     #   end #=> true
     def any? axis=:vector, &block
-      if axis == :vector || axis == :column
+      if %i[vector column].include?(axis)
         @data.any?(&block)
       elsif axis == :row
         each_row do |row|
           return true if yield(row)
         end
@@ -1139,11 +1229,11 @@
     #   df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
     #   df.all?(:row) do |row|
     #     row[:a] < 10
     #   end #=> true
     def all? axis=:vector, &block
-      if axis == :vector || axis == :column
+      if %i[vector column].include?(axis)
         @data.all?(&block)
       elsif axis == :row
         each_row.all?(&block)
       else
         raise ArgumentError, "Unidentified axis #{axis}"
@@ -1375,11 +1465,11 @@
     #
     #   df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
     #   df.rename_vectors :a => :alpha, :c => :gamma
     #   df.vectors.to_a #=> [:alpha, :b, :gamma]
     def rename_vectors name_map
       # Targets of the non-identity renames (key != value) that already exist
       # as vector names; they are deleted before the rename is applied.
-      existing_targets = name_map.select { |k,v| k != v }.values & vectors.to_a
+      existing_targets = name_map.reject { |k,v| k == v }.values & vectors.to_a
       delete_vectors(*existing_targets)
 
       # Names with no truthy entry in name_map keep their current name.
       new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v }
       self.vectors = Daru::Index.new new_names
     end
@@ -1406,23 +1496,20 @@
 
       order = Index.new(numeric_vectors)
       Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
     end
 
-    # Generate a summary of this DataFrame with ReportBuilder.
-    def summary(method=:to_text)
-      ReportBuilder.new(no_title: true).add(self).send(method)
-    end
-
-    def report_building(b) # :nodoc: #
-      b.section(name: @name) do |g|
-        g.text "Number of rows: #{nrows}"
-        @vectors.each do |v|
-          g.text "Element:[#{v}]"
-          g.parse_element(self[v])
-        end
+    # Generate a summary of this DataFrame based on individual vectors in the DataFrame
+    # @return [String] String containing the summary of the DataFrame
+    def summary
     # The added version builds the text directly with String#<< and no longer
     # takes the `method` argument of the removed ReportBuilder-based version.
+      summary = "= #{name}"
+      summary << "\n  Number of rows: #{nrows}"
+      @vectors.each do |v|
+        summary << "\n  Element:[#{v}]\n"
         # NOTE(review): the argument 1 is presumably an indent level for the
         # nested vector summary - confirm against Daru::Vector#summary.
+        summary << self[v].summary(1)
       end
+      summary
     end
 
     # Sorts a dataframe (ascending/descending) in the given priority sequence of
     # vectors, with or without a block.
     #
@@ -1781,21 +1868,43 @@
         .each_with_index
         .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
     end
 
     # Convert to html for IRuby.
-    def to_html threshold=30
+    def to_html(threshold=30)
       # table_thead and table_tbody are not read again in this method; they are
       # locals captured by `binding` below, presumably for the ERB template to
       # interpolate (confirm against the template files).
+      table_thead = to_html_thead
+      table_tbody = to_html_tbody(threshold)
       # A MultiIndex-ed DataFrame is rendered with its own template file.
       path = if index.is_a?(MultiIndex)
                File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__)
              else
                File.expand_path('../iruby/templates/dataframe.html.erb', __FILE__)
              end
       ERB.new(File.read(path).strip).result(binding)
     end
 
+    def to_html_thead
       # Renders the thead ERB template and returns the resulting String. The
       # MultiIndex variant of the template is chosen when the row index is a
       # MultiIndex. The template is evaluated against this method's binding.
+      table_thead_path =
+        if index.is_a?(MultiIndex)
+          File.expand_path('../iruby/templates/dataframe_mi_thead.html.erb', __FILE__)
+        else
+          File.expand_path('../iruby/templates/dataframe_thead.html.erb', __FILE__)
+        end
+      ERB.new(File.read(table_thead_path).strip).result(binding)
+    end
+
+    def to_html_tbody(threshold=30)
       # Renders the tbody ERB template and returns the resulting String.
       # `threshold` is not read in this Ruby body; it is reachable by the
       # template only through `binding` (presumably a row limit - confirm
       # against the template).
+      table_tbody_path =
+        if index.is_a?(MultiIndex)
+          File.expand_path('../iruby/templates/dataframe_mi_tbody.html.erb', __FILE__)
+        else
+          File.expand_path('../iruby/templates/dataframe_tbody.html.erb', __FILE__)
+        end
+      ERB.new(File.read(table_tbody_path).strip).result(binding)
+    end
+
     def to_s
       # The removed line returned the full HTML rendering; the added line
       # returns a one-line tag with the class, the optional name and the
       # dimensions. When @name is unset the `if` modifier yields nil, which
       # interpolates as an empty string.
-      to_html
+      "#<#{self.class}#{': ' + @name.to_s if @name}(#{nrows}x#{ncols})>"
     end
 
     # Method for updating the metadata (i.e. missing value positions) of the
     # after assignment/deletion etc. are complete. This is provided so that
     # time is not wasted in creating the metadata for the vector each time
@@ -1898,18 +2007,17 @@
       )
     end
 
     # Pretty print in a nice table format for the command line (irb/pry/iruby)
     def inspect spacing=10, threshold=15
       # In the added version row_headers is no longer a local: the name below
       # resolves to the private row_headers method, and `headers` to the
       # private headers method (index name followed by the vector names).
-      row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
       name_part = @name ? ": #{@name} " : ''
 
       # First line is the class/name/dimension tag; the table follows it.
       "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" +
         Formatters::Table.format(
           each_row.lazy,
           row_headers: row_headers,
-          headers: vectors,
+          headers: headers,
           threshold: threshold,
           spacing: spacing
         )
     end
 
@@ -2000,12 +2108,36 @@
           .rename(cat)
           .delete_vector cat_name
       end
     end
 
+    # returns array of row tuples at given index(s)
+    def access_row_tuples_by_indexs *indexes
+      positions = @index.pos(*indexes)
+
+      return populate_row_for(positions) if positions.is_a? Numeric
+
+      res = []
+      new_rows = @data.map { |vec| vec[*indexes] }
+      indexes.each do |index|
+        tuples = []
+        new_rows.map { |row| tuples += [row[index]] }
+        res << tuples
+      end
+      res
+    end
+
     private
 
+    def headers
+      Daru::Index.new(Array(index.name) + @vectors.to_a)
+    end
+
+    def row_headers
+      index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
+    end
+
     def convert_categorical_vectors names
       names.map do |n|
         next unless self[n].category?
         old = [n, self[n]]
         self[n] = Daru::Vector.new(self[n].to_ints)
@@ -2032,30 +2164,30 @@
       return val if val.is_a?(Daru::Vector)
       raise TypeError, "Every iteration must return Daru::Vector not #{val.class}"
     end
 
     def dispatch_to_axis(axis, method, *args, &block)
       # Forwards to "<method>_vector" or "<method>_row" (singular suffix)
       # depending on the axis; :column is treated the same as :vector.
       # Raises ArgumentError for any other axis value.
-      if axis == :vector || axis == :column
+      if %i[vector column].include?(axis)
         send("#{method}_vector", *args, &block)
       elsif axis == :row
         send("#{method}_row", *args, &block)
       else
         raise ArgumentError, "Unknown axis #{axis}"
       end
     end
 
     def dispatch_to_axis_pl(axis, method, *args, &block)
       # Plural counterpart of dispatch_to_axis: forwards to "<method>_vectors"
       # or "<method>_rows"; :column is treated the same as :vector.
       # Raises ArgumentError for any other axis value.
-      if axis == :vector || axis == :column
+      if %i[vector column].include?(axis)
         send("#{method}_vectors", *args, &block)
       elsif axis == :row
         send("#{method}_rows", *args, &block)
       else
         raise ArgumentError, "Unknown axis #{axis}"
       end
     end
 
-    AXES = [:row, :vector].freeze
+    AXES = %i[row vector].freeze
 
     def extract_axis names, default=:vector
       if AXES.include?(names.last)
         names.pop
       else
@@ -2063,11 +2195,11 @@
       end
     end
 
     def access_vector *names
       # Three routes: a Range as first argument duplicates the matching
       # vectors (the added line selects them with @vectors.subset instead of
       # @vectors[]); otherwise the lookup is delegated by the kind of index
       # held in @vectors.
       if names.first.is_a?(Range)
-        dup(@vectors[names.first])
+        dup(@vectors.subset(names.first))
       elsif @vectors.is_a?(MultiIndex)
         access_vector_multi_index(*names)
       else
         access_vector_single_index(*names)
       end
     end
@@ -2085,18 +2217,22 @@
       Daru::DataFrame.new(new_vectors, index: @index, order: pos)
     end
 
     def access_vector_single_index *names
       # Fewer than two names: resolve the single name first.
       if names.count < 2
         # The added lines keep @vectors[] only for a DateTimeIndex and use
         # @vectors.pos otherwise; an IndexError from the lookup is re-raised
         # with a message naming the requested vector.
-        pos = @vectors[names.first]
+        begin
+          pos = @vectors.is_a?(Daru::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
+        rescue IndexError
+          raise IndexError, "Specified vector #{names.first} does not exist"
+        end
 
         # A Numeric result identifies exactly one vector, returned as-is.
         return @data[pos] if pos.is_a?(Numeric)
 
         # Otherwise the lookup result takes the place of the name list.
         names = pos
       end
 
       # Build a name => vector Hash for the sub-DataFrame.
-      new_vectors = names.map { |name| [name, @data[@vectors[name]]] }.to_h
+      new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
 
       # An Array of names is wrapped in a Daru::Index; anything else is passed
       # through as the order unchanged.
       order = names.is_a?(Array) ? Daru::Index.new(names) : names
       Daru::DataFrame.new(new_vectors, order: order,
                                        index: @index, name: @name)
     end
@@ -2124,11 +2260,11 @@
       name = name[0] unless @vectors.is_a?(MultiIndex)
 
       if @index.empty?
         insert_vector_in_empty name, vector
       else
-        vec = prepare_vector_for_insert name, vector
+        vec = prepare_for_insert name, vector
 
         assign_or_add_vector name, vec
       end
     end
 
@@ -2171,31 +2307,41 @@
       set_size
 
       @data.map! { |v| v.empty? ? v.reindex(@index) : v }
     end
 
-    def prepare_vector_for_insert name, vector
-      if vector.is_a?(Daru::Vector)
-        # so that index-by-index assignment is avoided when possible.
-        return vector.dup if vector.index == @index
-
-        Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v|
-          @index.each do |idx|
-            v[idx] = vector.index.include?(idx) ? vector[idx] : nil
-          end
-        }
+    def prepare_for_insert name, arg
     # Added entry point: picks a preparation helper by the kind of argument
     # (Daru::Vector, anything responding to :to_a, or any other value).
+      if arg.is_a? Daru::Vector
+        prepare_vector_for_insert name, arg
+      elsif arg.respond_to?(:to_a)
         # NOTE(review): nil also responds to :to_a, so a nil argument takes this
         # branch and reaches `enum.size` below - confirm whether assigning nil
         # is meant to be supported.
+        prepare_enum_for_insert name, arg
       else
-        # FIXME: No spec checks this case... And SizeError is not a thing - zverok, 2016-05-08
-        if @size != vector.size
-          raise SizeError,
-            "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}"
+        prepare_value_for_insert name, arg
+      end
+    end
+
+    def prepare_vector_for_insert name, vector
+      # so that index-by-index assignment is avoided when possible.
+      return vector.dup if vector.index == @index
       # Otherwise a vector on this DataFrame's index is filled label by label;
       # labels absent from the incoming vector are set to nil.
+      Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v|
+        @index.each do |idx|
+          v[idx] = vector.index.include?(idx) ? vector[idx] : nil
         end
+      }
+    end
 
-        Daru::Vector.new(vector, name: coerce_name(name), index: @index)
+    def prepare_enum_for_insert name, enum
+      if @size != enum.size
         # raise with only a message produces a RuntimeError; the removed
         # version named SizeError at this point.
+        raise "Specified vector of length #{enum.size} cannot be inserted in DataFrame of size #{@size}"
       end
+      Daru::Vector.new(enum, name: coerce_name(name), index: @index)
     end
 
+    def prepare_value_for_insert name, value
       # Array(value) * @size repeats the wrapped value @size times, giving
       # one entry per row for a single non-enumerable value.
+      Daru::Vector.new(Array(value) * @size, name: coerce_name(name), index: @index)
+    end
+
     def insert_or_modify_row indexes, vector
       vector = coerce_vector vector
 
       raise SizeError, 'Vector length should match row length' if
         vector.size != @vectors.size
@@ -2274,11 +2420,13 @@
       raise ArgumentError, 'All objects in data source should be same class' \
         unless source.map(&:class).uniq.size == 1
 
       case source.first
       when Array
+        vectors ||= (0..source.size-1).to_a
         initialize_from_array_of_arrays source, vectors, index, opts
       when Vector
+        vectors ||= (0..source.size-1).to_a
         initialize_from_array_of_vectors source, vectors, index, opts
       when Hash
         initialize_from_array_of_hashes source, vectors, index, opts
       else
         raise ArgumentError, "Can't create DataFrame from #{source}"