subframes.rb in red_amber-0.4.1

- old
+ new

@@ -80,11 +80,11 @@
         instance = allocate
         instance.instance_variable_set(:@baseframe, dataframe)
         enum =
           Enumerator.new(subset_indices.size) do |y|
             subset_indices.each do |i|
-              y.yield dataframe.take(i)
+              y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.take(i))
             end
           end
         instance.instance_variable_set(:@enum, enum)
         instance
       end
@@ -106,11 +106,11 @@
         instance = allocate
         instance.instance_variable_set(:@baseframe, dataframe)
         enum =
           Enumerator.new(subset_filters.size) do |y|
             subset_filters.each do |i|
-              y.yield dataframe.filter(i)
+              y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.filter(i))
             end
           end
         instance.instance_variable_set(:@enum, enum)
         instance
       end
@@ -137,11 +137,11 @@
             Enumerator.new(dataframes.size) do |y|
               dataframes.each do |i|
                 y.yield i
               end
             end
-          instance.instance_variable_set(:@baseframe, enum.reduce(&:concatenate))
+          instance.instance_variable_set(:@baseframe, enum.lazy)
         end
         instance.instance_variable_set(:@enum, enum)
         instance
       end
 
@@ -158,15 +158,17 @@
       #   @yieldreturn [Array<DataFrame>]
       #     the block should return DataFrames with same schema.
       #   @return [SubFrames]
       #     a new SubFrames.
       #
+      # @since 0.4.0
+      #
       def define_subframable_method(method)
         define_method(method) do |&block|
           # Without a block, hand back an Enumerator built on :each whose
           # size is reported lazily from `size`.
           return enum_for(:each) { size } unless block # rubocop:disable Lint/ToEnumArguments
 
           # Run the inherited implementation of `method` via super and wrap
           # the result with by_dataframes. The old line looks the
           # receiver up through self.class; the new line names SubFrames
           # explicitly.
-          self.class.by_dataframes(super(&block))
+          SubFrames.by_dataframes(super(&block))
         end
       end
     end
 
     # Create a new SubFrames object from a DataFrame and an array of indices or filters.
@@ -193,29 +195,35 @@
     #     2       3 B        false
     #     3       4 B        (nil)
     #     4       5 B        true
     #     5       6 C        false
     #
-    #     SubFrames.new(dataframe, [[0, 2, 3], [4, 1]])
+    #     # --- This object is used as common source in this class ---
+    #     subframes = SubFrames.new(dataframe, [[0, 1], [2, 3, 4], [5]])
     #
     #     # =>
-    #     #<RedAmber::SubFrames : 0x0000000000003a34>
-    #     @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000003a48>
-    #     2 SubFrames: [3, 2] in sizes.
+    #     #<RedAmber::SubFrames : 0x000000000000cf6c>
+    #     @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000cf80>
+    #     3 SubFrames: [2, 3, 1] in sizes.
     #     ---
-    #     #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a5c>
+    #     #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000cf94>
     #             x y        z
     #       <uint8> <string> <boolean>
     #     0       1 A        false
-    #     1       3 B        false
-    #     2       4 B        (nil)
+    #     1       2 A        true
     #     ---
-    #     #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a70>
+    #     #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000cfa8>
     #             x y        z
     #       <uint8> <string> <boolean>
-    #     0       5 B        true
-    #     1       2 A        true
+    #     0       3 B        false
+    #     1       4 B        (nil)
+    #     2       5 B        true
+    #     ---
+    #     #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000cfbc>
+    #             x y        z
+    #       <uint8> <string> <boolean>
+    #     0       6 C        false
     #
     # @overload initialize(dataframe)
     #   Create a new SubFrames object by block.
     #
     #   @param dataframe [DataFrame]
@@ -281,25 +289,29 @@
                 elsif i.boolean?
                   dataframe.filter(i)
                 else
                   raise SubFramesArgumentError, "illegal type: #{i}"
                 end
-              yielder.yield df
+              yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df)
             end
           end
       end
     end
 
-    # Return concatenated SubFrames as a DataDrame.
+    # Return concatenated SubFrames as a DataFrame.
     #
     # Once evaluated, memorize it as @baseframe.
     # @return [DataFrame]
     #   a concatenated DataFrame.
     # @since 0.4.0
     #
     def baseframe
       # Old: concatenate the sub DataFrames only when @baseframe is unset,
       # then cache the result.
-      @baseframe ||= reduce(&:concatenate)
       # New: an Enumerator stored in @baseframe (this diff also stores
       # `enum.lazy` there) is treated as "not evaluated yet": concatenate
       # and cache on first access, otherwise return the cached value.
+      if @baseframe.nil? || @baseframe.is_a?(Enumerator)
+        @baseframe = reduce(&:concatenate)
+      else
+        @baseframe
+      end
     end
     alias_method :concatenate, :baseframe
     alias_method :concat, :baseframe
 
     # Iterates over sub DataFrames or returns an Enumerator.
@@ -323,17 +335,17 @@
     #     evaluated result value from the block.
     #   @return [self]
     #     returns self.
     #
     # @example Returns Enumerator
-    #   sf.each
+    #   subframes.each
     #
     #   # =>
     #   #<Enumerator: ...>
     #
     # @example `to_a` from Enumerable.
-    #   sf.to_a
+    #   subframes.to_a
     #
     #   # =>
     #   [#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000002a120>
     #           x y        z
     #     <uint8> <string> <boolean>
@@ -352,11 +364,11 @@
     #     <uint8> <string> <boolean>
     #   0       6 C        false
     #   ]
     #
     # @example Concatenate SubFrames. This example is used in #concatenate.
-    #   sf.reduce(&:concatenate)
+    #   subframes.reduce(&:concatenate)
     #
     #   # =>
     #   #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000004883c>
     #           x y        z
     #     <uint8> <string> <boolean>
@@ -376,151 +388,193 @@
       nil
     end
 
     # Aggregate SubFrames to create a DataFrame.
     #
-    # This method will check if built-in aggregation function is used.
-    # @todo Support user-defined aggregation functions.
+    # This method creates a DataFrame with one row corresponding to one sub dataframe.
+    # @note This method does not check whether an aggregation function is used.
     #
+    # @overload aggregate(keys)
+    #
+    #   Aggregate SubFrames creating DataFrame with label `keys` and
+    #   its column values by block.
+    #
+    #   @param keys [Symbol, Array<Symbol>]
+    #     a key or keys of result. Key names may be renamed to new label.
+    #   @yieldparam dataframe [DataFrame]
+    #     passes each dataframe in self to the block. Block is called by instance_eval,
+    #     so inside of the block is the context of passed dataframe.
+    #   @yieldreturn [Array]
+    #     aggregated values from the columns of passed dataframe.
+    #   @return [DataFrame]
+    #     created DataFrame.
+    #   @example Aggregate by key labels in arguments and values from block.
+    #     subframes.aggregate(:y, :sum_x) { [y.first, x.sum] }
+    #
+    #     # =>
+    #     #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
+    #       y          sum_x
+    #       <string> <uint8>
+    #     0 A              3
+    #     1 B             12
+    #     2 C              6
+    #
+    #   @example Aggregate by key labels in an Array and values from block.
+    #     subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] }
+    #
+    #     # =>
+    #     #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
+    #       y          sum_x
+    #       <string> <uint8>
+    #     0 A              3
+    #     1 B             12
+    #     2 C              6
+    #
+    # @overload aggregate
+    #
+    #   Aggregate SubFrames creating DataFrame with pairs of key and aggregated value
+    #   in Hash from the block.
+    #
+    #   @yieldparam dataframe [DataFrame]
+    #     passes each dataframe in self to the block. Block is called by instance_eval,
+    #     so inside of the block is the context of passed dataframe.
+    #   @yieldreturn [Hash<key => aggregated_value>]
+    #     pairs of key name and aggregated values from the columns of passed dataframe.
+    #     Key names may be renamed to new label in the result.
+    #   @return [DataFrame]
+    #     created DataFrame.
+    #   @example Aggregate by key and value pairs from block.
+    #     subframes.aggregate do
+    #       { y: y.first, sum_x: x.sum }
+    #     end
+    #
+    #     # =>
+    #     #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
+    #       y          sum_x
+    #       <string> <uint8>
+    #     0 A              3
+    #     1 B             12
+    #     2 C              6
+    #
+    # @overload aggregate
+    #
+    #   Aggregate SubFrames creating DataFrame with an Array of key and aggregated value
+    #   from the block.
+    #
+    #   @yieldparam dataframe [DataFrame]
+    #     passes each dataframe in self to the block. Block is called by instance_eval,
+    #     so inside of the block is the context of passed dataframe.
+    #   @yieldreturn [Array<key, aggregated_value>]
+    #     pairs of key name and aggregated values from the columns of passed dataframe.
+    #     Key names may be renamed to new label in the result.
+    #   @return [DataFrame]
+    #     created DataFrame.
+    #   @example Aggregate by key and value arrays from block.
+    #     subframes.aggregate do
+    #       [[:y, y.first], [:sum_x, x.sum]]
+    #     end
+    #
+    #     # =>
+    #     #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
+    #       y          sum_x
+    #       <string> <uint8>
+    #     0 A              3
+    #     1 B             12
+    #     2 C              6
+    #
     # @overload aggregate(group_keys, aggregations)
     #
     #   Aggregate SubFrames for first values of the columns of
     #   `group_keys` and the aggregated results of key-function pairs.
+    #   [Experimental] This API may be changed in the future.
     #
     #   @param group_keys [Symbol, String, Array<Symbol, String>]
     #     group key name(s) to output values.
     #   @param aggregations [Hash<Array<Symbol, String> => Array<:Symbol>>]
     #     a Hash of variable (column) name and
     #     Vector aggregate function name to apply.
     #   @return [DataFrame]
     #     an aggregated DataFrame.
-    #   @example
-    #     subframes
+    #   @example Aggregate with a group key and key function pairs by a Hash.
+    #     subframes.aggregate(:y, { x: :sum, z: :count })
     #
     #     # =>
-    #     #<RedAmber::SubFrames : 0x0000000000003980>
-    #     @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003994>
-    #     3 SubFrames: [2, 3, 1] in sizes.
-    #     ---
-    #     #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000039a8>
-    #             x y        z
-    #       <uint8> <string> <boolean>
-    #     0       1 A        false
-    #     1       2 A        true
-    #     ---
-    #     #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000039bc>
-    #             x y        z
-    #       <uint8> <string> <boolean>
-    #     0       3 B        false
-    #     1       4 B        (nil)
-    #     2       5 B        true
-    #     ---
-    #     #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000039d0>
-    #             x y        z
-    #       <uint8> <string> <boolean>
-    #     0       6 C        false
-    #
-    #     subframes.aggregate(:y, { x: :sum })
-    #
-    #     # =>
     #     #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24>
-    #       y          sum_x
-    #       <string> <uint8>
-    #     0 A              3
-    #     1 B             12
-    #     2 C              6
+    #       y          sum_x count_z
+    #       <string> <uint8> <uint8>
+    #     0 A              3       2
+    #     1 B             12       2
+    #     2 C              6       1
     #
     # @overload aggregate(group_keys, aggregations)
     #
     #   Aggregate SubFrames for first values of the columns of
     #   `group_keys` and the aggregated results of all combinations
     #   of supplied keys and functions.
+    #   [Experimental] This API may be changed in the future.
     #
     #   @param group_keys [Symbol, String, Array<Symbol, String>]
     #     group key name(s) to output values.
     #   @param aggregations [Array[Array<Symbol, String>, Array<:Symbol>]]
     #     an Array of Array of variable (column) names and
     #     Array of Vector aggregate function names to apply.
     #   @return [DataFrame]
     #     an aggregated DataFrame.
-    #   @example
+    #   @example Aggregate with group keys and keys and functions by an Array.
     #     subframes.aggregate(:y, [[:x, :z], [:count, :sum]])
     #
     #     # =>
     #     #<RedAmber::DataFrame : 3 x 5 Vectors, 0x000000000000fcbc>
-    #       y        count_x count_z   sum_x   sum_z
+    #       y        count_x   sum_x count_z   sum_z
     #       <string> <uint8> <uint8> <uint8> <uint8>
-    #     0 A              2       2       3       1
-    #     1 B              3       2      12       1
-    #     2 C              1       1       6       0
+    #     0 A              2       3       2       1
+    #     1 B              3      12       2       1
+    #     2 C              1       6       1       0
     #
     # @since 0.4.0
     #
-    def aggregate(group_keys, aggregations)
+    def aggregate(*args, &block)
       aggregator =
-        case aggregations
-        in Hash
-          sf = self
-          aggregations.map do |key, func|
-            unless Vector.aggregate?(func)
-              raise SubFramesArgumentError, "not an aggregation function: #{func}"
+        if block
+          if args.empty?
+            # aggregate { {key => value} or [[key, value], ...] }
             # Evaluate the block in the context of each sub DataFrame and
             # collect the pairs into { key => [one value per sub DataFrame] }.
+            each_with_object(Hash.new { |h, k| h[k] = [] }) do |df, hash|
+              df.instance_eval(&block).to_h.each do |k, v|
+                hash[k] << v
+              end
             end
-
-            ["#{func}_#{key}", sf.each.map { |df| df[key].send(func) }]
+          else
+            # aggregate(keys) { values }
             # Each block result is one row; transpose the rows into columns
             # and pair them with the flattened key list.
+            values = each.map { |df| Array(df.instance_eval(&block)) }.transpose
+            args.flatten.zip(values)
           end
-        in [Array => keys, Array => functions]
-          functions.each do |func|
-            unless Vector.aggregate?(func)
-              raise SubFramesArgumentError, "not an aggregation function: #{func}"
-            end
+        else
+          # These functions may be removed in the future.
           # Without a block, build `ary` as a list of [function, key] pairs.
           # Group keys are entered with the function :first.
+          case args
+          in [group_keys1, Hash => h]
+            # aggregate(group_keys, { key => func })
+            ary = Array(group_keys1).map { |key| [:first, key] }
+            ary.concat(h.to_a.map { [_2, _1] }) # rubocop:disable Style/NumberedParametersLimit
+          in [group_keys2, [Array => keys, Array => funcs]]
+            # aggregate(group_keys, [keys, funcs])
+            ary = Array(group_keys2).map { |key| [:first, key] }
+            ary.concat(funcs.product(keys))
+          else
+            raise SubFramesArgumentError, "invalid argument: #{args}"
           end
           sf = self
-          functions.product(keys).map do |func, key|
-            ["#{func}_#{key}", sf.each.map { |df| df[key].send(func) }]
           # A :first entry keeps the plain key as its column label; every
           # other entry is labelled "<func>_<key>". The column values come
           # from sending `func` to column `key` of each sub DataFrame.
+          ary.map do |func, key|
+            label = func == :first ? key : "#{func}_#{key}"
+            [label, sf.each.map { |df| df[key].send(func) }]
           end
-        else
-          raise SubFramesArgumentError, "invalid argument: #{aggregations}"
         end
-
       # Old: with group keys, pick those columns from the baseframe at the
       # offset indices and assign the aggregated columns to them.
       # New: always build the result directly from the aggregator.
-      if group_keys.empty?
-        DataFrame.new(aggregator)
-      else
-        baseframe
-          .pick(group_keys)
-          .slice(offset_indices)
-          .assign(aggregator)
-      end
+      DataFrame.new(aggregator)
     end
 
     # Returns a SubFrames containing DataFrames returned by the block.
     #
     # @example Map as it is.
-    #   subframes
-    #
-    #   # =>
-    #   #<RedAmber::SubFrames : 0x000000000001359c>
-    #   @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000135b0>
-    #   3 SubFrames: [2, 3, 1] in sizes.
-    #   ---
-    #   #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000135c4>
-    #           x y        z
-    #     <uint8> <string> <boolean>
-    #   0       1 A        false
-    #   1       2 A        true
-    #   ---
-    #   #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000135d8>
-    #           x y        z
-    #     <uint8> <string> <boolean>
-    #   0       3 B        false
-    #   1       4 B        (nil)
-    #   2       5 B        true
-    #   ---
-    #   #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000135ec>
-    #           x y        z
-    #     <uint8> <string> <boolean>
-    #   0       6 C        false
-    #
     #   subframes.map { _1 }
     #
     #   # This will create a new SubFrames and a new baseframe,
     #   # but each element DataFrame is re-used.
     #   # =>
@@ -591,35 +645,10 @@
     #   @yieldreturn [Vector, Array, Arrow::Array]
     #     an updated column value which are overloaded.
     #   @return [SubFrames]
     #     a new SubFrames object with updated DataFrames.
     #   @example
-    #     subframes
-    #
-    #     # =>
-    #     #<RedAmber::SubFrames : 0x000000000000c33c>
-    #     @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000c350>
-    #     3 SubFrames: [2, 3, 1] in sizes.
-    #     ---
-    #     #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000c364>
-    #             x y        z
-    #       <uint8> <string> <boolean>
-    #     0       1 A        false
-    #     1       2 A        true
-    #     ---
-    #     #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378>
-    #             x y        z
-    #       <uint8> <string> <boolean>
-    #     0       3 B        false
-    #     1       4 B        (nil)
-    #     2       5 B        true
-    #     ---
-    #     #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000c38c>
-    #             x y        z
-    #       <uint8> <string> <boolean>
-    #     0       6 C        false
-    #
     #     subframes.assign(:x_plus1) { x + 1 }
     #
     #     # =>
     #     #<RedAmber::SubFrames : 0x000000000000c3a0>
     #     @baseframe=#<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000000c3b4>
@@ -910,11 +939,11 @@
     # Indices at the top of each sub DataFrame.
     #
     # @return [Array<Integer>]
     #   offset indices of each sub DataFrame.
     # @example When `sizes` is [2, 3, 1].
-    #   sf.offset_indices # => [0, 2, 5]
+    #   subframes.offset_indices # => [0, 2, 5]
     # @since 0.4.0
     #
     def offset_indices
       sum = 0
       sizes.map do |size|
@@ -1034,12 +1063,18 @@
     #   0       6 C        false
     #
     # @since 0.4.0
     #
     def inspect(limit: 16)
       # New: when @baseframe still holds an Enumerator, read the instance
       # variable directly and report only its size; otherwise use the
       # shape string of the `baseframe` DataFrame.
+      shape =
+        if @baseframe.is_a?(Enumerator)
+          "Enumerator::Lazy:size=#{@baseframe.size}"
+        else
+          baseframe.shape_str(with_id: true)
+        end
       # List at most `limit` sub DataFrame sizes, followed by '...' when
       # there are more than `limit` of them.
       sizes_truncated = (size > limit ? sizes.take(limit) << '...' : sizes).join(', ')
       "#<#{self.class} : #{format('0x%016x', object_id)}>\n" \
-        "@baseframe=#<#{baseframe.shape_str(with_id: true)}>\n" \
+        "@baseframe=#<#{shape}>\n" \
         "#{size} SubFrame#{pl(size)}: " \
         "[#{sizes_truncated}] in size#{pl(size)}.\n" \
         "---\n#{_to_s(limit: limit, with_id: true)}"
     end