dataframe.rb in daru-0.1.2

- old
+ new

@@ -12,78 +12,103 @@
     include Daru::Maths::Arithmetic::DataFrame
     include Daru::Maths::Statistics::DataFrame
     include Daru::Plotting::DataFrame if Daru.has_nyaplot?
 
     class << self
-      # Load data from a CSV file. Specify an optional block to grab the CSV 
-      # object and pre-condition it (for example use the `convert` or 
+      # Load data from a CSV file. Specify an optional block to grab the CSV
+      # object and pre-condition it (for example use the `convert` or
       # `header_convert` methods).
-      # 
+      #
       # == Arguments
-      # 
+      #
       # * path - Path of the file to load specified as a String.
-      # 
+      #
       # == Options
-      # 
+      #
       # Accepts the same options as the Daru::DataFrame constructor and CSV.open()
       # and uses those to eventually construct the resulting DataFrame.
       #
       # == Verbose Description
       #
-      # You can specify all the options to the `.from_csv` function that you 
+      # You can specify all the options to the `.from_csv` function that you
       # do to the Ruby `CSV.read()` function, since this is what is used internally.
       #
-      # For example, if the columns in your CSV file are separated by something 
-      # other than commas, you can use the `:col_sep` option. If you want to 
-      # convert numeric values to numbers and not keep them as strings, you can 
+      # For example, if the columns in your CSV file are separated by something
+      # other than commas, you can use the `:col_sep` option. If you want to
+      # convert numeric values to numbers and not keep them as strings, you can
       # use the `:converters` option and set it to `:numeric`.
       #
-      # The `.from_csv` function uses the following defaults for reading CSV files 
+      # The `.from_csv` function uses the following defaults for reading CSV files
       # (that are passed into the `CSV.read()` function):
       #
       #   {
       #     :col_sep           => ',',
       #     :converters        => :numeric
       #   }
       def from_csv path, opts={}, &block
-        Daru::IO.from_csv path, opts, &block      
+        Daru::IO.from_csv path, opts, &block
       end
 
       # Read data from an Excel file into a DataFrame.
-      # 
+      #
       # == Arguments
-      # 
+      #
       # * path - Path of the file to be read.
-      # 
+      #
       # == Options
-      # 
+      #
       # *:worksheet_id - ID of the worksheet that is to be read.
-      def from_excel path, opts={}, &block      
+      def from_excel path, opts={}, &block
         Daru::IO.from_excel path, opts, &block
       end
 
       # Run a database query and return the result as a DataFrame
       #
+      # @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query
+      # @param query [String] The query to be executed
+      #
+      # @return A dataframe containing the data resulting from the query
+      #
       # USE:
       #
       #  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
       #  Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
       def from_sql(dbh, query)
         # Pure delegation: the query is executed by Daru::IO.from_sql.
         Daru::IO.from_sql(dbh, query)
       end
 
+      # Read a dataframe from AR::Relation
+      #
+      # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
+      # @param fields [Array] Field names to be loaded (optional)
+      #
+      # @return A dataframe containing the data loaded from the relation
+      #
+      # USE:
+      #
+      #   # When Post model is defined as:
+      #   class Post < ActiveRecord::Base
+      #     scope :active, -> { where.not(published_at: nil) }
+      #   end
+      #
+      #   # You can load active posts into a dataframe by:
+      #   Daru::DataFrame.from_activerecord(Post.active, :title, :published_at)
+      def from_activerecord relation, *fields
+        Daru::IO.from_activerecord relation, *fields
+      end
+
       # Read the database from a plaintext file. For this method to work,
       # the data should be present in a plain text file in columns. See
       # spec/fixtures/bank2.dat for an example.
-      # 
+      #
       # == Arguments
-      # 
+      #
       # * path - Path of the file to be read.
       # * fields - Vector names of the resulting database.
-      # 
+      #
       # == Usage
-      # 
+      #
       #   df = Daru::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
       def from_plaintext(path, fields)
         # Pure delegation: the file is parsed by Daru::IO.from_plaintext.
         Daru::IO.from_plaintext(path, fields)
       end
 
@@ -135,19 +160,19 @@
       #    a  0   1
       #    b  1   0
       #
       # Useful to process outputs from databases
       def crosstab_by_assignation rows, columns, values
-        raise "Three vectors should be equal size" if 
+        raise "Three vectors should be equal size" if
           rows.size != columns.size or rows.size!=values.size
 
         cols_values = columns.factors
         cols_n      = cols_values.size
 
-        h_rows = rows.factors.inject({}) do |a,v| 
-          a[v] = cols_values.inject({}) do |a1,v1| 
-            a1[v1]=nil 
+        h_rows = rows.factors.inject({}) do |a,v|
+          a[v] = cols_values.inject({}) do |a1,v1|
+            a1[v1]=nil
             a1
           end
           a
         end
 
@@ -184,42 +209,42 @@
 
     # DataFrame basically consists of an Array of Vector objects.
     # These objects are indexed by row and column by vectors and index Index objects.
     #
     # == Arguments
-    # 
+    #
     # * source - Source from which the DataFrame is to be initialized. Can be a Hash
     # of names and vectors (array or Daru::Vector), an array of arrays or
     # array of Daru::Vectors.
-    # 
+    #
     # == Options
-    # 
-    # +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in 
+    #
+    # +:order+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order in
     # which Vectors should appear in the DataFrame.
-    # 
+    #
     # +:index+ - An *Array*/*Daru::Index*/*Daru::MultiIndex* containing the order
     # in which rows of the DataFrame will be named.
-    # 
+    #
     # +:name+  - A name for the DataFrame.
     #
     # +:clone+ - Specify as *true* or *false*. When set to false, and Vector
     # objects are passed for the source, the Vector objects will not be duplicated
-    # when creating the DataFrame. Will have no effect if Array is passed in 
-    # the source, or if the passed Daru::Vectors have different indexes. 
+    # when creating the DataFrame. Will have no effect if Array is passed in
+    # the source, or if the passed Daru::Vectors have different indexes.
     # Default to *true*.
-    # 
+    #
     # == Usage
-    #   df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a], 
+    #   df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
     #     index: [:a, :b, :c, :d], name: :spider_man)
-    # 
-    #   # => 
+    #
+    #   # =>
     #   # <Daru::DataFrame:80766980 @name = spider_man @size = 4>
-    #   #             b          a 
-    #   #  a          6          1 
-    #   #  b          7          2 
-    #   #  c          8          3 
-    #   #  d          9          4 
+    #   #             b          a
+    #   #  a          6          1
+    #   #  b          7          2
+    #   #  c          8          3
+    #   #  d          9          4
     def initialize source, opts={}
       vectors = opts[:order]
       index   = opts[:index]
       clone   = opts[:clone] == false ? false : true
       @data   = []
@@ -290,11 +315,11 @@
 
             if clone
               @vectors.each do |vector|
                 # avoids matching indexes of vectors if all the supplied vectors
                 # have the same index.
-                if vectors_have_same_index 
+                if vectors_have_same_index
                   v = source[vector].dup
                 else
                   v = Daru::Vector.new([], name: vector, index: @index)
 
                   @index.each do |idx|
@@ -329,12 +354,12 @@
       $stderr.puts "#vector has been deprecated in favour of #[]. Please use that."
       self[*names]
     end
 
     # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
-    # Defaults to *:vector*. Use of this method is not recommended for accessing 
-    # rows or vectors. Use df.row[:a] for accessing row with index ':a' or 
+    # Defaults to *:vector*. Use of this method is not recommended for accessing
+    # rows or vectors. Use df.row[:a] for accessing row with index ':a' or
     # df.vector[:vec] for accessing vector with index *:vec*.
     def [](*names)
       if names[-1] == :vector or names[-1] == :row
         axis = names[-1]
         names = names[0..-2]
@@ -352,11 +377,11 @@
     end
 
     # Insert a new row/vector of the specified name or modify a previous row.
     # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
     # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
-    # 
+    #
     # In case a Daru::Vector is specified after the equals sign, the indexes
     # of the vector will be matched against the row/vector indexes of the DataFrame
     # before an insertion is performed. Unmatched indexes will be set to nil.
     def []=(*args)
       axis = args.include?(:row) ? :row : :vector
@@ -366,11 +391,11 @@
       name = args[0..-2]
       vector = args[-1]
 
       if axis == :vector
         insert_or_modify_vector name, vector
-      elsif axis == :row        
+      elsif axis == :row
         insert_or_modify_row name, vector
       else
         raise IndexError, "Expected axis to be row or vector, not #{axis}."
       end
     end
@@ -387,30 +412,30 @@
     def add_vector(n, vector)
       # Routed through #[]= so insertion and modification share one code path.
       self[n] = vector
     end
 
     # Access a row or set/create a row. Refer #[] and #[]= docs for details.
-    # 
+    #
     # == Usage
     #   df.row[:a] # access row named ':a'
     #   df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
     def row
       # A fresh accessor wrapping this DataFrame is built on every call.
       accessor_class = Daru::Accessors::DataFrameByRow
       accessor_class.new(self)
     end
 
     # Duplicate the DataFrame entirely.
-    # 
+    #
     # == Arguments
-    # 
-    # * +vectors_to_dup+ - An Array specifying the names of Vectors to 
+    #
+    # * +vectors_to_dup+ - An Array specifying the names of Vectors to
     # be duplicated. Will duplicate the entire DataFrame if not specified.
     def dup vectors_to_dup=nil
       vectors_to_dup = @vectors.to_a unless vectors_to_dup
 
       src = []
       vectors_to_dup.each do |vec|
-        src << @data[@vectors[vec]].to_a
+        src << @data[@vectors[vec]].to_a.dup
       end
       new_order = Daru::Index.new(vectors_to_dup)
 
       Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
     end
@@ -420,13 +445,13 @@
       Daru::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
     end
 
     # Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
     # preserved.
-    # 
+    #
     # == Arguments
-    # 
+    #
     # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
     # a view of the whole data frame otherwise.
     def clone *vectors_to_clone
       vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
       return super if vectors_to_clone.empty?
@@ -436,21 +461,21 @@
         hsh
       end
       Daru::DataFrame.new(h, clone: false)
     end
 
-    # Returns a 'shallow' copy of DataFrame if missing data is not present, 
+    # Returns a 'shallow' copy of DataFrame if missing data is not present,
     # or a full copy of only valid data if missing data is present.
     def clone_only_valid
       # Missing data present: hand back the result of #dup_only_valid.
       return dup_only_valid if has_missing_data?

       # Otherwise the result of #clone is sufficient.
       clone
     end
 
-    # Creates a new duplicate dataframe containing only rows 
+    # Creates a new duplicate dataframe containing only rows
     # without a single missing value.
     def dup_only_valid vecs=nil
       rows_with_nil = @data.inject([]) do |memo, vector|
         memo.concat vector.missing_positions
         memo
@@ -483,11 +508,11 @@
     def each_vector_with_index(&block)
       return to_enum(:each_vector_with_index) unless block_given?
 
       @vectors.each do |vector|
         yield @data[@vectors[vector]], vector
-      end 
+      end
 
       self
     end
 
     alias_method :each_column_with_index, :each_vector_with_index
@@ -516,16 +541,16 @@
     # Iterate over each row or vector of the DataFrame. Specify axis
     # by passing :vector or :row as the argument. Default to :vector.
     #
     # == Description
     #
-    # `#each` works exactly like Array#each. The default mode for `each` 
-    # is to iterate over the columns of the DataFrame. To iterate over 
+    # `#each` works exactly like Array#each. The default mode for `each`
+    # is to iterate over the columns of the DataFrame. To iterate over
     # rows you must pass the axis, i.e `:row` as an argument.
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * +axis+ - The axis to iterate over. Can be :vector (or :column)
     # or :row. Default to :vector.
     def each axis=:vector, &block
       if axis == :vector or axis == :column
         each_vector(&block)
@@ -539,18 +564,18 @@
     # Iterate over a row or vector and return results in a Daru::Vector.
     # Specify axis with :vector or :row. Default to :vector.
     #
     # == Description
     #
-    # The #collect iterator works similar to #map, the only difference 
-    # being that it returns a Daru::Vector comprising of the results of 
-    # each block run. The resultant Vector has the same index as that 
-    # of the axis over which collect has iterated. It also accepts the 
+    # The #collect iterator works similar to #map, the only difference
+    # being that it returns a Daru::Vector comprising of the results of
+    # each block run. The resultant Vector has the same index as that
+    # of the axis over which collect has iterated. It also accepts the
     # optional axis argument.
     #
     # == Arguments
-    # 
+    #
     # * +axis+ - The axis to iterate over. Can be :vector (or :column)
     # or :row. Default to :vector.
     def collect axis=:vector, &block
       if axis == :vector or axis == :column
         collect_vectors(&block)
@@ -563,20 +588,20 @@
 
     # Map over each vector or row of the data frame according to
     # the argument specified. Will return an Array of the resulting
     # elements. To map over each row/vector and get a DataFrame,
     # see #recode.
-    # 
+    #
     # == Description
-    # 
-    # The #map iterator works like Array#map. The value returned by 
-    # each run of the block is added to an Array and the Array is 
-    # returned. This method also accepts an axis argument, like #each. 
+    #
+    # The #map iterator works like Array#map. The value returned by
+    # each run of the block is added to an Array and the Array is
+    # returned. This method also accepts an axis argument, like #each.
     # The default is :vector.
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
     # Default to :vector.
     def map axis=:vector, &block
       if axis == :vector or axis == :column
         map_vectors(&block)
@@ -588,13 +613,13 @@
     end
 
     # Destructive map. Modifies the DataFrame. Each run of the block
     # must return a Daru::Vector. You can specify the axis to map over
     # as the argument. Default to :vector.
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
     # Default to :vector.
     def map! axis=:vector, &block
       if axis == :vector or axis == :column
         map_vectors!(&block)
@@ -607,19 +632,19 @@
     # block must return a Daru::Vector object. You can specify the axis
     # to map over. Default to :vector.
     #
     # == Description
     #
-    # Recode works similarly to #map, but an important difference between 
-    # the two is that recode returns a modified Daru::DataFrame instead 
-    # of an Array. For this reason, #recode expects that every run of the 
+    # Recode works similarly to #map, but an important difference between
+    # the two is that recode returns a modified Daru::DataFrame instead
+    # of an Array. For this reason, #recode expects that every run of the
     # block to return a Daru::Vector.
     #
     # Just like map and each, recode also accepts an optional _axis_ argument.
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
     # Default to :vector.
     def recode axis=:vector, &block
       if axis == :vector or axis == :column
         recode_vectors(&block)
@@ -627,26 +652,26 @@
         recode_rows(&block)
       end
     end
 
     # Retain vectors or rows if the block returns a truthy value.
-    # 
+    #
     # == Description
-    # 
-    # For filtering out certain rows/vectors based on their values, 
-    # use the #filter method. By default it iterates over vectors and 
-    # keeps those vectors for which the block returns true. It accepts 
-    # an optional axis argument which lets you specify whether you want 
+    #
+    # For filtering out certain rows/vectors based on their values,
+    # use the #filter method. By default it iterates over vectors and
+    # keeps those vectors for which the block returns true. It accepts
+    # an optional axis argument which lets you specify whether you want
     # to iterate over vectors or rows.
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
     # Default to :vector.
-    # 
+    #
     # == Usage
-    # 
+    #
     #   # Filter vectors
     #
     #   df.filter do |vector|
     #     vector.type == :numeric and vector.median < 50
     #   end
@@ -663,16 +688,16 @@
         filter_rows(&block)
       end
     end
 
     def recode_vectors &block
-      block_given? or return to_enum(:recode_vectors) 
+      block_given? or return to_enum(:recode_vectors)
 
       df = self.dup
       df.each_vector_with_index do |v, i|
         ret = yield v
-        ret.is_a?(Daru::Vector) or 
+        ret.is_a?(Daru::Vector) or
           raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
         df[*i] = ret
       end
 
       df
@@ -761,11 +786,11 @@
       end
 
       self
     end
 
-    # Retrieves a Daru::Vector, based on the result of calculation 
+    # Retrieves a Daru::Vector, based on the result of calculation
     # performed on each row.
     def collect_rows &block
       return to_enum(:collect_rows) unless block_given?
 
       data = []
@@ -876,19 +901,19 @@
       @index.each do |index|
         keep_row = yield access_row(index)
 
         deletion << index unless keep_row
       end
-      deletion.each { |idx| 
-        delete_row idx 
+      deletion.each { |idx|
+        delete_row idx
       }
     end
 
     def keep_vector_if &block
       @vectors.each do |vector|
         keep_vector = yield @data[@vectors[vector]], vector
-        
+
         delete_vector vector unless keep_vector
       end
     end
 
     # creates a new vector with the data of a given field which the block returns true
@@ -923,20 +948,20 @@
 
     # Iterates over each vector and retains it in a new DataFrame if the block returns
     # true for that vector.
     def filter_vectors &block
       return to_enum(:filter_vectors) unless block_given?
-      
+
       df = self.dup
       df.keep_vector_if &block
 
       df
     end
 
     # Test each row with one or more tests. Each test is a Proc with the form
     # *Proc.new {|row| row[:age] > 0}*
-    # 
+    #
     # The function returns an array with all errors.
     def verify(*tests)
       if(tests[0].is_a? Symbol)
         id = tests[0]
         tests.shift
@@ -961,13 +986,13 @@
       vr
     end
 
     # DSL for yielding each row and returning a Daru::Vector based on the
     # value each run of the block returns.
-    # 
+    #
     # == Usage
-    # 
+    #
     #   a1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7])
     #   a2 = Daru::Vector.new([10, 20, 30, 40, 50, 60, 70])
     #   a3 = Daru::Vector.new([100, 200, 300, 400, 500, 600, 700])
     #   ds = Daru::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
     #   total = ds.vector_by_calculation { a + b + c }
@@ -989,30 +1014,30 @@
       Daru::Vector.new a, index: @index
     end
 
     # Returns a vector, based on a string with a calculation based
     # on vector.
-    # 
+    #
     # The calculation will be eval'ed, so you can put any variable
     # or expression valid on ruby.
-    # 
+    #
     # For example:
     #   a = Daru::Vector.new [1,2]
     #   b = Daru::Vector.new [3,4]
     #   ds = Daru::DataFrame.new({:a => a,:b => b})
     #   ds.compute("a+b")
     #   => Vector [4,6]
     def compute text, &block
       return instance_eval(&block) if block_given?
-      instance_eval(text) 
+      instance_eval(text)
     end
 
     # Return a vector with the number of missing values in each row.
-    # 
+    #
     # == Arguments
-    # 
-    # * +missing_values+ - An Array of the values that should be 
+    #
+    # * +missing_values+ - An Array of the values that should be
     # treated as 'missing'. The default missing value is *nil*.
     def missing_values_rows missing_values=[nil]
       number_of_missing = []
       each_row do |row|
         row.missing_values = missing_values
@@ -1029,13 +1054,13 @@
       !!@data.any? { |v| v.has_missing_data? }
     end
 
     alias :flawed? :has_missing_data?
 
-    # Return a nested hash using vector names as keys and an array constructed of 
+    # Return a nested hash using vector names as keys and an array constructed of
     # hashes with other values. If block provided, is used to provide the
-    # values, with parameters +row+ of dataset, +current+ last hash on 
+    # values, with parameters +row+ of dataset, +current+ last hash on
     # hierarchy and +name+ of the key to include
     def nest *tree_keys, &block
       tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
       out = {}
 
@@ -1099,11 +1124,11 @@
     # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
     #   :row. A Daru::Vector object is yielded in the block.
     # @example Using any?
     #   df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
     #   df.any?(:row) do |row|
-    #     row[:a] < 3 and row[:b] == 'b' 
+    #     row[:a] < 3 and row[:b] == 'b'
     #   end #=> true
     def any? axis=:vector, &block
       if axis == :vector or axis == :column
         @data.any?(&block)
       elsif axis == :row
@@ -1121,11 +1146,11 @@
     # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
     #   :row. A Daru::Vector object is yielded in the block.
     # @example Using all?
     #   df = Daru::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
     #   df.all?(:row) do |row|
-    #     row[:a] < 10 
+    #     row[:a] < 10
     #   end #=> true
     def all? axis=:vector, &block
       if axis == :vector or axis == :column
         @data.all?(&block)
       elsif axis == :row
@@ -1143,18 +1168,22 @@
     # @param [Fixnum] quantity (10) The number of elements to display from the top.
     def head quantity=10
       # Inclusive range of positions 0 .. quantity-1, looked up on the row axis.
       last_position = quantity - 1
       self[0..last_position, :row]
     end
 
+    alias :first :head
+
     # The last ten elements of the DataFrame
-    # 
+    #
     # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
     def tail quantity=10
       # Clamp the start position at 0: when more rows are requested than exist
       # (quantity > @size) the whole frame is returned instead of building a
       # range with a negative start.
       start = [@size - quantity, 0].max
       self[start..(@size-1), :row]
     end
 
-    # Returns a vector with sum of all vectors specified in the argument. 
+    alias :last :tail
+
+    # Returns a vector with sum of all vectors specified in the argument.
     # If the vecs parameter is nil, all numeric vectors are summed.
     def vector_sum vecs=nil
       vecs ||= numeric_vectors
       sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
 
@@ -1164,13 +1193,13 @@
 
       sum
     end
 
     # Calculate mean of the rows of the dataframe.
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * +max_missing+ - The maximum number of elements in the row that can be
     # zero for the mean calculation to happen. Default to 0.
     def vector_mean max_missing=0
       mean_vec = Daru::Vector.new [0]*@size, index: @index, name: "mean_#{@name}"
 
@@ -1179,20 +1208,20 @@
       end
 
       mean_vec
     end
 
-    # Group elements by vector to perform operations on them. Returns a 
+    # Group elements by vector to perform operations on them. Returns a
     # Daru::Core::GroupBy object. See the Daru::Core::GroupBy docs for a detailed
     # list of possible operations.
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * vectors - An Array containing names of vectors to group by.
-    # 
+    #
     # == Usage
-    # 
+    #
     #   df = Daru::DataFrame.new({
     #     a: %w{foo bar foo bar   foo bar foo foo},
     #     b: %w{one one two three two two one three},
     #     c:   [1  ,2  ,3  ,1    ,3  ,6  ,3  ,8],
     #     d:   [11 ,22 ,33 ,44   ,55 ,66 ,77 ,88]
@@ -1207,11 +1236,11 @@
     #   # ["foo", "two", 3]=>[2, 4]}
     def group_by *vectors
       vectors.flatten!
       vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
         has_vector?(v) }
-        
+
       Daru::Core::GroupBy.new(self, vectors)
     end
 
     def reindex_vectors new_vectors
       raise ArgumentError, "Must pass the new index of type Index or its "\
@@ -1232,48 +1261,48 @@
     # Concatenate another DataFrame along corresponding columns.
     # Very premature implementation. Use with caution.
     def concat other_df
       vectors = []
       @vectors.each do |v|
-        vectors << self[v].to_a.concat(other_df[v].to_a)
+        vectors << self[v].to_a.dup.concat(other_df[v].to_a)
       end
 
       Daru::DataFrame.new(vectors, order: @vectors)
     end
 
     # Set a particular column as the new index of the DataFrame.
     def set_index new_index, opts={}
-      raise ArgumentError, "All elements in new index must be unique." if 
+      raise ArgumentError, "All elements in new index must be unique." if
         @size != self[new_index].uniq.size
-      
+
       self.index = Daru::Index.new(self[new_index].to_a)
       self.delete_vector(new_index) unless opts[:keep]
 
       self
     end
 
     # Change the index of the DataFrame and preserve the labels of the previous
     # indexing. New index can be Daru::Index or any of its subclasses.
-    # 
+    #
     # @param [Daru::Index] new_index The new Index for reindexing the DataFrame.
     # @example Reindexing DataFrame
-    #   df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]}, 
+    #   df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
     #     index: ['a','b','c','d'])
-    #   #=> 
+    #   #=>
     #   ##<Daru::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
-    #   #                    a          b 
-    #   #         a          1         11 
-    #   #         b          2         22 
-    #   #         c          3         33 
-    #   #         d          4         44 
+    #   #                    a          b
+    #   #         a          1         11
+    #   #         b          2         22
+    #   #         c          3         33
+    #   #         d          4         44
     #   df.reindex Daru::Index.new(['b', 0, 'a', 'g'])
-    #   #=> 
+    #   #=>
     #   ##<Daru::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
-    #   #                    a          b 
-    #   #         b          2         22 
-    #   #         0        nil        nil 
-    #   #         a          1         11 
+    #   #                    a          b
+    #   #         b          2         22
+    #   #         0        nil        nil
+    #   #         a          1         11
     #   #         g        nil        nil
     def reindex new_index
       raise ArgumentError, "Must pass the new index of type Index or its "\
         "subclasses, not #{new_index.class}" unless new_index.kind_of?(Daru::Index)
 
@@ -1294,33 +1323,33 @@
     # @param [Daru::Index] idx New index object on which the rows of the dataframe
     #   are to be indexed.
     # @example Reassigning index of a DataFrame
     #   df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
     #   df.index.to_a #=> [0,1,2,3]
-    # 
+    #
     #   df.index = Daru::Index.new(['a','b','c','d'])
     #   df.index.to_a #=> ['a','b','c','d']
-    #   df.row['a'].to_a #=> [1,11] 
+    #   df.row['a'].to_a #=> [1,11]
     def index=(idx)
       # Every stored vector receives the new index before the frame's own
       # @index is replaced.
       @data.each do |column|
         column.index = idx
       end
       @index = idx

       self
     end
 
     # Reassign vectors with a new index of type Daru::Index or any of its subclasses.
-    # 
+    #
     # @param [Daru::Index] idx The new index object on which the vectors are to
     #   be indexed. Must be of the same size as ncols.
     # @example Reassigning vectors of a DataFrame
     #   df = Daru::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
     #   df.vectors.to_a #=> [:a, :b, :c]
-    # 
+    #
     #   df.vectors = Daru::Index.new([:foo, :bar, :baz])
     #   df.vectors.to_a #=> [:foo, :bar, :baz]
     def vectors= idx
-      raise ArgumentError, "Can only reindex with Index and its subclasses" unless 
+      raise ArgumentError, "Can only reindex with Index and its subclasses" unless
         index.kind_of?(Daru::Index)
       raise ArgumentError, "Specified index length #{idx.size} not equal to"\
         "dataframe size #{ncols}" if idx.size != ncols
 
       @vectors = idx
@@ -1375,35 +1404,35 @@
           g.parse_element(self[v])
         end
       end
     end
 
-    # Sorts a dataframe (ascending/descending)according to the given sequence of 
+    # Sorts a dataframe (ascending/descending)according to the given sequence of
     # vectors, using the attributes provided in the blocks.
-    # 
+    #
     # @param order [Array] The order of vector names in which the DataFrame
     #   should be sorted.
     # @param [Hash] opts The options to sort with.
     # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
     #   or descending order. Specify Array corresponding to *order* for multiple
     #   sort orders.
     # @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects
-    #   to be used for sorting, for each vector name in *order* as a hash of 
+    #   to be used for sorting, for each vector name in *order* as a hash of
     #   vector name and lambda pairs. In case a lambda for a vector is not
     #   specified, the default will be used.
-    # 
+    #
     # == Usage
-    #   
+    #
     #   df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})
-    #   
+    #
     #   #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
-    #   #            a          b 
-    #   # 0         -3          4 
-    #   # 1          2          3 
-    #   # 2         -1          2 
-    #   # 3          4          1 
-    #   df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })  
+    #   #            a          b
+    #   # 0         -3          4
+    #   # 1          2          3
+    #   # 2         -1          2
+    #   # 3          4          1
+    #   df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
     def sort! vector_order, opts={}
       raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
       opts = {
         ascending: true,
         type: :quick_sort,
@@ -1424,58 +1453,58 @@
       self.dup.sort! vector_order, opts
     end
 
     # Pivots a data frame on specified vectors and applies an aggregate function
     # to quickly generate a summary.
-    # 
+    #
     # == Options
-    # 
+    #
     # +:index+ - Keys to group by on the pivot table row index. Pass vector names
     # contained in an Array.
-    # 
+    #
     # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
     # names contained in an Array.
-    # 
+    #
     # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
-    # use any of the statistics functions applicable on Vectors that can be found in 
+    # use any of the statistics functions applicable on Vectors that can be found in
     # the Daru::Statistics::Vector module.
-    # 
-    # +:values+ - Columns to aggregate. Will consider all numeric columns not 
+    #
+    # +:values+ - Columns to aggregate. Will consider all numeric columns not
     # specified in *:index* or *:vectors*. Optional.
-    # 
+    #
     # == Usage
-    # 
+    #
     #   df = Daru::DataFrame.new({
-    #     a: ['foo'  ,  'foo',  'foo',  'foo',  'foo',  'bar',  'bar',  'bar',  'bar'], 
+    #     a: ['foo'  ,  'foo',  'foo',  'foo',  'foo',  'bar',  'bar',  'bar',  'bar'],
     #     b: ['one'  ,  'one',  'one',  'two',  'two',  'one',  'one',  'two',  'two'],
     #     c: ['small','large','large','small','small','large','small','large','small'],
     #     d: [1,2,2,3,3,4,5,6,7],
     #     e: [2,4,4,6,6,8,10,12,14]
     #   })
     #   df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
-    # 
-    #   #=> 
+    #
+    #   #=>
     #   # #<Daru::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
-    #   #            [:e, :one] [:e, :two] 
-    #   #     [:bar]         18         26 
-    #   #     [:foo]         10         12 
+    #   #            [:e, :one] [:e, :two]
+    #   #     [:bar]         18         26
+    #   #     [:foo]         10         12
     def pivot_table opts={}
-      raise ArgumentError, 
+      raise ArgumentError,
         "Specify grouping index" if !opts[:index] or opts[:index].empty?
 
       index   = opts[:index]
       vectors = opts[:vectors] || []
       aggregate_function = opts[:agg] || :mean
-      values = 
+      values =
       if opts[:values].is_a?(Symbol)
         [opts[:values]]
       elsif opts[:values].is_a?(Array)
         opts[:values]
       else # nil
         (@vectors.to_a - (index | vectors)) & numeric_vector_names
       end
-      
+
       raise IndexError, "No numeric vectors to aggregate" if values.empty?
 
       grouped  = group_by(index)
 
       unless vectors.empty?
@@ -1522,11 +1551,11 @@
       else
         grouped.send(aggregate_function)
       end
     end
 
-    # Merge vectors from two DataFrames. In case of name collision, 
+    # Merge vectors from two DataFrames. In case of name collision,
     # the vector names are changed to x_1, x_2 ....
     #
     # @return {Daru::DataFrame}
     def merge other_df
       raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
@@ -1543,13 +1572,13 @@
 
       df_new.update
       df_new
     end
 
-    # Join 2 DataFrames with SQL style joins. Currently supports inner, left 
+    # Join 2 DataFrames with SQL style joins. Currently supports inner, left
     # outer, right outer and full outer joins.
-    # 
+    #
     # @param [Daru::DataFrame] other_df Another DataFrame on which the join is
     #   to be performed.
     # @param [Hash] opts Options Hash
     # @option opts [Symbol] :how Can be one of :inner, :left, :right or :outer.
     # @option opts [Array] :on The columns on which the join is to be performed.
@@ -1563,15 +1592,15 @@
     #   right = Daru::DataFrame.new({
     #     :id => [1,2,3,4],
     #     :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
     #   })
     #   left.join(right, how: :inner, on: [:name])
-    #   #=> 
+    #   #=>
     #   ##<Daru::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
-    #   #                 id_1       name       id_2 
-    #   #         0          1     Pirate          2 
-    #   #         1          3      Ninja          4 
+    #   #                 id_1       name       id_2
+    #   #         0          1     Pirate          2
+    #   #         1          3      Ninja          4
     def join(other_df, opts = {})
       # Pure delegation: this frame is passed first, +other_df+ second, and the
       # options hash is forwarded untouched to Daru::Core::Merge.join.
       Daru::Core::Merge.join self, other_df, opts
     end
 
 
@@ -1584,11 +1613,11 @@
     # with
     #   ds.one_to_many([:id], "child_%v_%n")
     # the fields named in the first parameter will be copied verbatim
     # to the new dataset, and fields which match the second
     # pattern will be added as one case for each different %n.
-    # 
+    #
     # @example
     #   cases=[
     #     ['1','george','red',10,'blue',20,nil,nil],
     #     ['2','fred','green',15,'orange',30,'white',20],
     #     ['3','alfred',nil,nil,nil,nil,nil,nil]
@@ -1605,13 +1634,13 @@
     def one_to_many(parent_fields, pattern)
       re      = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
       ds_vars = parent_fields.dup
       vars    = []
       max_n   = 0
-      h       = parent_fields.inject({}) { |a,v| 
+      h       = parent_fields.inject({}) { |a,v|
         a[v] = Daru::Vector.new([])
-        a 
+        a
       }
       # Adding _col_id
       h['_col_id'] = Daru::Vector.new([])
       ds_vars.push('_col_id')
 
@@ -1661,16 +1690,16 @@
         i += 1
       }
     end
 
     # Create a sql, based on a given Dataset
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * table - String specifying name of the table that will be created in SQL.
     # * charset - Character set. Default is "UTF8".
-    # 
+    #
     # @example
     #
     #  ds = Daru::DataFrame.new({
     #   :id   => Daru::Vector.new([1,2,3,4,5]),
     #   :name => Daru::Vector.new(%w{Alex Peter Susan Mary John})
@@ -1715,21 +1744,21 @@
 
     # Convert all vectors of type *:numeric* and not containing nils into an NMatrix.
     def to_nmatrix
       # Qualifying columns, one plain Array per vector.
       numerics_as_arrays = []
       each_vector do |vector|
         # Keep a vector only if its type is :numeric and it reports no missing
         # positions.
-        numerics_as_arrays << vector.to_a if(vector.type == :numeric and 
+        numerics_as_arrays << vector.to_a if(vector.type == :numeric and
           vector.missing_positions.size == 0)
       end
 
       # The arrays were gathered per vector (column-wise); transpose them so each
       # inner array holds one row before converting with #to_nm.
       # NOTE(review): #to_nm is presumably NMatrix's Array extension - confirm.
       numerics_as_arrays.transpose.to_nm
     end
-    
+
     # Converts the DataFrame into an array of hashes where key is vector name
-    # and value is the corresponding element. The 0th index of the array contains 
-    # the array of hashes while the 1th index contains the indexes of each row 
-    # of the dataframe. Each element in the index array corresponds to its row 
+    # and value is the corresponding element. The 0th index of the array contains
+    # the array of hashes while the 1th index contains the indexes of each row
+    # of the dataframe. Each element in the index array corresponds to its row
     # in the array of hashes, which has the same index.
     def to_a
       arry = [[],[]]
       self.each_row do |row|
         arry[0] << row.to_hash
@@ -1760,14 +1789,14 @@
       hsh
     end
 
     # Convert to html for IRuby.
     def to_html threshold=30
-      html = "<table>" + 
+      html = "<table>" +
         "<tr>" +
-          "<th colspan=\"#{@vectors.size+1}\">" + 
-            "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}" 
+          "<th colspan=\"#{@vectors.size+1}\">" +
+            "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
           "</th>" +
         "</tr>"
       html +='<tr><th></th>'
       @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
       html += '</tr>'
@@ -1789,11 +1818,11 @@
           last_index = @index.to_a.last
           last_row = self.row[last_index]
           html += '<tr>'
           html += "<td>" + last_index.to_s + "</td>"
           (0..(ncols - 1)).to_a.each do |i|
-            html += '<td>' + last_row[i].to_s + '</td>' 
+            html += '<td>' + last_row[i].to_s + '</td>'
           end
           html += '</tr>'
           break
         end
       end
@@ -1823,37 +1852,37 @@
     # Write this DataFrame to a CSV file.
     #
     # == Arguments
     #
     # * filename - Path of CSV file where the DataFrame is to be saved.
-    # 
+    #
     # == Options
-    # 
+    #
     # * convert_comma - If set to *true*, will convert any commas in any
     # of the data to full stops ('.').
-    # All the options accepted by CSV.read() can also be passed into this 
+    # All the options accepted by CSV.read() can also be passed into this
     # function.
     def write_csv filename, opts={}
       # All of the work happens in Daru::IO; this frame, the target path and the
       # options hash are handed over unchanged.
       Daru::IO.dataframe_write_csv(self, filename, opts)
     end
 
     # Write this dataframe to an Excel Spreadsheet
-    # 
+    #
     # == Arguments
-    # 
+    #
     # * filename - The path of the file where the DataFrame should be written.
     def write_excel filename, opts={}
       # Delegates to Daru::IO with this frame, the target path and the options
       # hash passed through as-is.
       Daru::IO.dataframe_write_excel(self, filename, opts)
     end
 
     # Insert each case of the Dataset on the selected table
     #
     # == Arguments
-    # 
+    #
     # * dbh - DBI database connection object.
     # * query - Query string.
-    # 
+    #
     # == Usage
     #
     #  ds = Daru::DataFrame.new({:id=>Daru::Vector.new([1,2,3]), :name=>Daru::Vector.new(["a","b","c"])})
     #  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
     #  ds.write_sql(dbh,"test")
@@ -1867,27 +1896,27 @@
       Daru::IO.save self, filename
     end
 
     # Marshal hook: serializes this DataFrame as a Hash holding the raw data,
     # the index and the vector order (both flattened to Arrays) and the name.
     # The +depth+ argument required by the Marshal protocol is not used here.
     def _dump depth
       Marshal.dump({
-        data:  @data, 
-        index: @index.to_a, 
+        data:  @data,
+        index: @index.to_a,
         order: @vectors.to_a,
         name:  @name
         })
     end
 
     # Marshal hook: rebuilds a DataFrame from a marshalled Hash, reading its
     # :data, :index, :order and :name keys.
     #
     # NOTE: Marshal.load can instantiate arbitrary objects; only feed it data
     # that comes from a trusted source.
     def self._load data
       h = Marshal.load data
-      Daru::DataFrame.new(h[:data], 
-        index: h[:index], 
+      Daru::DataFrame.new(h[:data],
+        index: h[:index],
         order: h[:order],
         name:  h[:name])
     end
 
     # Change dtypes of vectors by supplying a hash of :vector_name => :new_dtype
-    # 
+    #
     # == Usage
     #   df = Daru::DataFrame.new({a: [1,2,3], b: [1,2,3], c: [1,2,3]})
     #   df.recast a: :nmatrix, c: :nmatrix
     def recast opts={}
       opts.each do |vector_name, dtype|
@@ -1906,21 +1935,21 @@
     end
 
     # Pretty print in a nice table format for the command line (irb/pry/iruby)
     def inspect spacing=10, threshold=15
       longest = [@name.to_s.size,
-                 (@vectors.map(&:to_s).map(&:size).max || 0), 
+                 (@vectors.map(&:to_s).map(&:size).max || 0),
                  (@index  .map(&:to_s).map(&:size).max || 0),
                  (@data   .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
 
       name      = @name || 'nil'
       content   = ""
       longest   = spacing if longest > spacing
       formatter = "\n"
 
       (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
-      content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " + 
+      content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
                     name.to_s + " @size = " + @size.to_s + ">"
       content += sprintf formatter, "" , *@vectors.map(&:to_s)
       row_num  = 1
 
       self.each_row_with_index do |row, index|
@@ -1943,14 +1972,14 @@
     def where bool_array
       # Filtering is implemented in Daru::Core::Query; pass this frame and the
       # boolean array straight through.
       Daru::Core::Query.df_where(self, bool_array)
     end
 
     # Equality check. Two DataFrames are equal when they share the same class,
     # size, index and vectors, and every vector of this frame compares equal to
     # the vector of the same name in +other+. The checks short-circuit in the
     # order written, so the per-vector comparison only runs once the
     # preceding attributes have matched.
     def == other
-      self.class == other.class   and 
-      @size      == other.size    and 
+      self.class == other.class   and
+      @size      == other.size    and
       @index     == other.index   and
-      @vectors   == other.vectors and 
+      @vectors   == other.vectors and
       @vectors.to_a.all? { |v| self[v] == other[v] }
     end
 
     def method_missing(name, *args, &block)
       if md = name.match(/(.+)\=/)
@@ -1975,13 +2004,13 @@
     def quick_sort vector_order, index, by, ascending
       # Kick off the recursive sort over the full range of positions: 0 up to
       # the last position, @size - 1.
       first_pos = 0
       last_pos  = @size - 1
       recursive_quick_sort vector_order, index, by, ascending, first_pos, last_pos
     end
 
     # == Arguments
-    # 
-    # vector_order - 
-    # index - 
+    #
+    # vector_order -
+    # index -
     # by -
     # ascending -
     # left_lower -
     # right_upper -
     def recursive_quick_sort vector_order, index, by, ascending, left_lower, right_upper
@@ -2118,11 +2147,11 @@
         names.each do |name|
           new_vcs << @data[@vectors[name]].to_a
         end
 
         order = names.is_a?(Array) ? Daru::Index.new(names) : names
-        Daru::DataFrame.new(new_vcs, order: order, 
+        Daru::DataFrame.new(new_vcs, order: order,
           index: @index, name: @name)
       end
     end
 
     def access_row *names
@@ -2132,20 +2161,20 @@
         pos = @index[names]
         if pos.is_a?(Integer)
           return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
         else
           new_rows = pos.map { |tuple| populate_row_for(tuple) }
-          
+
           if !location.is_a?(Range) and names.size < @index.width
             pos = pos.drop_left_level names.size
           end
 
           Daru::DataFrame.rows(
             new_rows, order: @vectors, name: @name, index: pos)
         end
       else
-        if names[1].nil? 
+        if names[1].nil?
           names = @index[location]
           if names.is_a?(Numeric)
             row = []
             @data.each do |vector|
               row << vector[location]
@@ -2157,27 +2186,27 @@
         # Access multiple rows
         rows = []
         names.each do |name|
           rows << self.row[name].to_a
         end
-        
-        Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors        
+
+        Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
       end
     end
 
     def populate_row_for pos
       # Build one row by taking the element at +pos+ from every vector in
       # @data, preserving vector order.
       @data.map { |column| column[pos] }
     end
 
     def insert_or_modify_vector name, vector
-      name = name[0] unless @vectors.is_a?(MultiIndex)   
+      name = name[0] unless @vectors.is_a?(MultiIndex)
       v = nil
 
       if @index.empty?
-        v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)  
+        v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
         @index = v.index
         assign_or_add_vector name, v
         set_size
 
         @data.map! do |v|
@@ -2215,24 +2244,24 @@
 
     # Stores +v+ in @data under +name+, either overwriting the slot(s) that the
     # lookup in @vectors resolves to, or registering +name+ as a new vector
     # name and storing +v+ at its position.
     def assign_or_add_vector name, v
       #FIXME: fix this jugaad. need to make changes in Indexing itself.
       # NOTE(review): @vectors[name] appears to yield either a single position
       # or a Daru::Index of several matches - confirm against the Index classes.
       pos = @vectors[name]
 
       # Case 1: the lookup returned +name+ itself (not an Index) and it is
       # either a known vector name or an Integer inside the bounds of @data;
       # use it directly as the slot to overwrite.
-      if !pos.kind_of?(Daru::Index) and pos == name and 
+      if !pos.kind_of?(Daru::Index) and pos == name and
         (@vectors.include?(name) or (pos.is_a?(Integer) and pos < @data.size))
         @data[pos] = v
       # Case 2: the lookup returned an Index; write +v+ into every slot that
       # its entries resolve to.
       elsif pos.kind_of?(Daru::Index)
         pos.each do |p|
           @data[@vectors[p]] = v
         end
       # Case 3: anything else; add +name+ to @vectors when it is missing, then
       # store +v+ at the position @vectors reports for it.
       else
         @vectors = @vectors | [name] if !@vectors.include?(name)
         @data[@vectors[name]] = v
-      end      
+      end
     end
 
-    def insert_or_modify_row name, vector    
+    def insert_or_modify_row name, vector
       if index.is_a?(MultiIndex)
         # TODO
       else
         name = name[0]
         v =
@@ -2262,11 +2291,11 @@
         @data << Daru::Vector.new([], name: set_name(name), index: @index)
       end
     end
 
     # Sanity-checks that the labels match the stored data.
     #
     # Raises IndexError when @vectors is set and its size differs from the
     # number of vectors in @data, or when @index is set, a first vector exists,
     # and the index size differs from that first vector's size. Only the first
     # vector's length is compared against the index.
     def validate_labels
-      raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if 
+      raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
         @vectors and @vectors.size != @data.size
 
       raise IndexError, "Expected number of indexes same as number of rows" if
         @index and @data[0] and @index.size != @data[0].size
     end
@@ -2328,11 +2357,11 @@
     def set_name potential_name
       # Anything that is not an Array is already usable as a name.
       return potential_name unless potential_name.is_a?(Array)

       # An Array of name parts is collapsed into one String.
       potential_name.join
     end
 
     def symbolize arry
-      symbolized_arry = 
+      symbolized_arry =
       if arry.all? { |e| e.is_a?(Array) }
         arry.map do |sub_arry|
           sub_arry.map do |e|
             e.is_a?(Numeric) ? e : e.to_sym
           end
@@ -2342,6 +2371,6 @@
       end
 
       symbolized_arry
     end
   end
-end
\ No newline at end of file
+end