lib/red_amber/subframes.rb in red_amber-0.4.0 vs lib/red_amber/subframes.rb in red_amber-0.4.1

- old
+ new

@@ -80,11 +80,11 @@ instance = allocate instance.instance_variable_set(:@baseframe, dataframe) enum = Enumerator.new(subset_indices.size) do |y| subset_indices.each do |i| - y.yield dataframe.take(i) + y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.take(i)) end end instance.instance_variable_set(:@enum, enum) instance end @@ -106,11 +106,11 @@ instance = allocate instance.instance_variable_set(:@baseframe, dataframe) enum = Enumerator.new(subset_filters.size) do |y| subset_filters.each do |i| - y.yield dataframe.filter(i) + y.yield DataFrame.new_dataframe_with_schema(dataframe, dataframe.filter(i)) end end instance.instance_variable_set(:@enum, enum) instance end @@ -137,11 +137,11 @@ Enumerator.new(dataframes.size) do |y| dataframes.each do |i| y.yield i end end - instance.instance_variable_set(:@baseframe, enum.reduce(&:concatenate)) + instance.instance_variable_set(:@baseframe, enum.lazy) end instance.instance_variable_set(:@enum, enum) instance end @@ -158,15 +158,17 @@ # @yieldreturn [Array<DataFrame>] # the block should return DataFrames with same schema. # @return [SubFrames] # a new SubFrames. # + # @since 0.4.0 + # def define_subframable_method(method) define_method(method) do |&block| return enum_for(:each) { size } unless block # rubocop:disable Lint/ToEnumArguments - self.class.by_dataframes(super(&block)) + SubFrames.by_dataframes(super(&block)) end end end # Create a new SubFrames object from a DataFrame and an array of indices or filters. @@ -193,29 +195,35 @@ # 2 3 B false # 3 4 B (nil) # 4 5 B true # 5 6 C false # - # SubFrames.new(dataframe, [[0, 2, 3], [4, 1]]) + # # --- This object is used as common source in this class --- + # subframes = SubFrames.new(dataframe, [[0 ,1], [2, 3, 4], [5]]) # # # => - # #<RedAmber::SubFrames : 0x0000000000003a34> - # @baseframe=#<RedAmber::DataFrame : 5 x 3 Vectors, 0x0000000000003a48> - # 2 SubFrames: [3, 2] in sizes. + # #<RedAmber::SubFrames : 0x000000000000cf6c> + # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000cf80> + # 3 SubFrames: [2, 3, 1] in sizes. # --- - # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x0000000000003a5c> + # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000cf94> # x y z # <uint8> <string> <boolean> # 0 1 A false - # 1 3 B false - # 2 4 B (nil) + # 1 2 A true # --- - # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x0000000000003a70> + # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000cfa8> # x y z # <uint8> <string> <boolean> - # 0 5 B true - # 1 2 A true + # 0 3 B false + # 1 4 B (nil) + # 2 5 B true + # --- + # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000cfbc> + # x y z + # <uint8> <string> <boolean> + # 0 6 C false # # @overload initialize(dataframe) # Create a new SubFrames object by block. # # @param dataframe [DataFrame] @@ -281,25 +289,29 @@ elsif i.boolean? dataframe.filter(i) else raise SubFramesArgumentError, "illegal type: #{i}" end - yielder.yield df + yielder.yield DataFrame.new_dataframe_with_schema(dataframe, df) end end end end - # Return concatenated SubFrames as a DataDrame. + # Return concatenated SubFrames as a DataFrame. # # Once evaluated, memorize it as @baseframe. # @return [DataFrame] # a concatenated DataFrame. # @since 0.4.0 # def baseframe - @baseframe ||= reduce(&:concatenate) + if @baseframe.nil? || @baseframe.is_a?(Enumerator) + @baseframe = reduce(&:concatenate) + else + @baseframe + end end alias_method :concatenate, :baseframe alias_method :concat, :baseframe # Iterates over sub DataFrames or returns an Enumerator. @@ -323,17 +335,17 @@ # evaluated result value from the block. # @return [self] # returns self. # # @example Returns Enumerator - # sf.each + # subframes.each # # # => # #<Enumerator: ...> # # @example `to_a` from Enumerable. - # sf.to_a + # subframes.to_a # # # => # [#<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000002a120> # x y z # <uint8> <string> <boolean> @@ -352,11 +364,11 @@ # <uint8> <string> <boolean> # 0 6 C false # ] # # @example Concatenate SubFrames. This example is used in #concatenate. - # sf.reduce(&:concatenate) + # subframes.reduce(&:concatenate) # # # => # #<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000004883c> # x y z # <uint8> <string> <boolean> @@ -376,151 +388,193 @@ nil end # Aggregate SubFrames to create a DataFrame. # - # This method will check if built-in aggregation function is used. - # @todo Support user-defined aggregation functions. + # This method creates a DataFrame with one row corresponding to one sub dataframe. + # @note This method does not check if aggregation function is used. # + # @overload aggregate(keys) + # + # Aggregate SubFrames creating DataFrame with label `keys` and + # its column values by block. + # + # @param keys [Symbol, Array<Symbol>] + # a key or keys of result. Key names may be renamed to new label. + # @yieldparam dataframe [DataFrame] + # passes each dataframe in self to the block. Block is called by instance_eval, + # so inside of the block is the context of passed dataframe. + # @yieldreturn [Array] + # aggregated values from the columns of passed dataframe. + # @return [DataFrame] + # created DataFrame. + # @example Aggregate by key labels in arguments and values from block. + # subframes.aggregate(:y, :sum_x) { [y.first, x.sum] } + # + # # => + # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24> + # y sum_x + # <string> <uint8> + # 0 A 3 + # 1 B 12 + # 2 C 6 + # + # @example Aggregate by key labels in an Array and values from block. + # subframes.aggregate([:y, :sum_x]) { [y.first, x.sum] } + # + # # => + # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24> + # y sum_x + # <string> <uint8> + # 0 A 3 + # 1 B 12 + # 2 C 6 + # + # @overload aggregate + # + # Aggregate SubFrames creating DataFrame with pairs of key and aggregated value + # in Hash from the block. + # + # @yieldparam dataframe [DataFrame] + # passes each dataframe in self to the block. Block is called by instance_eval, + # so inside of the block is the context of passed dataframe. + # @yieldreturn [Hash<key => aggregated_value>] + # pairs of key name and aggregated values from the columns of passed dataframe. + # Key names may be renamed to new label in the result. + # @return [DataFrame] + # created DataFrame. + # @example Aggregate by key and value pairs from block. + # subframes.aggregate do + # { y: y.first, sum_x: x.sum } + # end + # + # # => + # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24> + # y sum_x + # <string> <uint8> + # 0 A 3 + # 1 B 12 + # 2 C 6 + # + # @overload aggregate + # + # Aggregate SubFrames creating DataFrame with an Array of key and aggregated value + # from the block. + # + # @yieldparam dataframe [DataFrame] + # passes each dataframe in self to the block. Block is called by instance_eval, + # so inside of the block is the context of passed dataframe. + # @yieldreturn [Array<key, aggregated_value>] + # pairs of key name and aggregated values from the columns of passed dataframe. + # Key names may be renamed to new label in the result. + # @return [DataFrame] + # created DataFrame. + # @example Aggregate by key and value arrays from block. + # subframes.aggregate do + # [[:y, y.first], [:sum_x, x.sum]] + # end + # + # # => + # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24> + # y sum_x + # <string> <uint8> + # 0 A 3 + # 1 B 12 + # 2 C 6 + # # @overload aggregate(group_keys, aggregations) # # Aggregate SubFrames for first values of the columns of # `group_keys` and the aggregated results of key-function pairs. + # [Experimental] This API may be changed in the future. # # @param group_keys [Symbol, String, Array<Symbol, String>] # group key name(s) to output values. # @param aggregations [Hash<Array<Symbol, String> => Array<:Symbol>>] # a Hash of variable (column) name and # Vector aggregate function name to apply. # @return [DataFrame] # an aggregated DataFrame. - # @example - # subframes + # @example Aggregate with a group key and key function pairs by a Hash. + # subframes.aggregate(:y, { x: :sum, z: :count }) # # # => - # #<RedAmber::SubFrames : 0x0000000000003980> - # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x0000000000003994> - # 3 SubFrames: [2, 3, 1] in sizes. - # --- - # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000039a8> - # x y z - # <uint8> <string> <boolean> - # 0 1 A false - # 1 2 A true - # --- - # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000039bc> - # x y z - # <uint8> <string> <boolean> - # 0 3 B false - # 1 4 B (nil) - # 2 5 B true - # --- - # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000039d0> - # x y z - # <uint8> <string> <boolean> - # 0 6 C false - # - # subframes.aggregate(:y, { x: :sum }) - # - # # => # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x0000000000003b24> - # y sum_x - # <string> <uint8> - # 0 A 3 - # 1 B 12 - # 2 C 6 + # y sum_x count_z + # <string> <uint8> <uint8> + # 0 A 3 2 + # 1 B 12 2 + # 2 C 6 1 # # @overload aggregate(group_keys, aggregations) # # Aggregate SubFrames for first values of the columns of # `group_keys` and the aggregated results of all combinations # of supplied keys and functions. + # [Experimental] This API may be changed in the future. # # @param group_keys [Symbol, String, Array<Symbol, String>] # group key name(s) to output values. # @param aggregations [Array[Array<Symbol, String>, Array<:Symbol>]] # an Array of Array of variable (column) names and # Array of Vector aggregate function names to apply. # @return [DataFrame] # an aggregated DataFrame. - # @example + # @example Aggregate with group keys and keys and functions by an Array. # sf.aggregate(:y, [[:x, :z], [:count, :sum]]) # # # => # #<RedAmber::DataFrame : 3 x 5 Vectors, 0x000000000000fcbc> - # y count_x count_z sum_x sum_z + # y count_x sum_x count_z sum_z # <string> <uint8> <uint8> <uint8> <uint8> - # 0 A 2 2 3 1 - # 1 B 3 2 12 1 - # 2 C 1 1 6 0 + # 0 A 2 3 2 1 + # 1 B 3 12 2 1 + # 2 C 1 6 1 0 # # @since 0.4.0 # - def aggregate(group_keys, aggregations) + def aggregate(*args, &block) aggregator = - case aggregations - in Hash - sf = self - aggregations.map do |key, func| - unless Vector.aggregate?(func) - raise SubFramesArgumentError, "not an aggregation function: #{func}" + if block + if args.empty? + # aggregate { {key => value} or [[key, value], ...] } + each_with_object(Hash.new { |h, k| h[k] = [] }) do |df, hash| + df.instance_eval(&block).to_h.each do |k, v| + hash[k] << v + end end - - ["#{func}_#{key}", sf.each.map { |df| df[key].send(func) }] + else + # aggregate(keys) { values } + values = each.map { |df| Array(df.instance_eval(&block)) }.transpose + args.flatten.zip(values) end - in [Array => keys, Array => functions] - functions.each do |func| - unless Vector.aggregate?(func) - raise SubFramesArgumentError, "not an aggregation function: #{func}" - end + else + # These functions may be removed in the future. + case args + in [group_keys1, Hash => h] + # aggregate(group_keys, { key => func }) + ary = Array(group_keys1).map { |key| [:first, key] } + ary.concat(h.to_a.map { [_2, _1] }) # rubocop:disable Style/NumberedParametersLimit + in [group_keys2, [Array => keys, Array => funcs]] + # aggregate(group_keys, [keys, funcs]) + ary = Array(group_keys2).map { |key| [:first, key] } + ary.concat(funcs.product(keys)) + else + raise SubFramesArgumentError, "invalid argument: #{args}" end sf = self - functions.product(keys).map do |func, key| - ["#{func}_#{key}", sf.each.map { |df| df[key].send(func) }] + ary.map do |func, key| + label = func == :first ? key : "#{func}_#{key}" + [label, sf.each.map { |df| df[key].send(func) }] end - else - raise SubFramesArgumentError, "invalid argument: #{aggregations}" end - - if group_keys.empty? - DataFrame.new(aggregator) - else - baseframe - .pick(group_keys) - .slice(offset_indices) - .assign(aggregator) - end + DataFrame.new(aggregator) end # Returns a SubFrames containing DataFrames returned by the block. # # @example Map as it is. - # subframes - # - # # => - # #<RedAmber::SubFrames : 0x000000000001359c> - # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x00000000000135b0> - # 3 SubFrames: [2, 3, 1] in sizes. - # --- - # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x00000000000135c4> - # x y z - # <uint8> <string> <boolean> - # 0 1 A false - # 1 2 A true - # --- - # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x00000000000135d8> - # x y z - # <uint8> <string> <boolean> - # 0 3 B false - # 1 4 B (nil) - # 2 5 B true - # --- - # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x00000000000135ec> - # x y z - # <uint8> <string> <boolean> - # 0 6 C false - # # subframes.map { _1 } # # # This will create a new SubFrame and a new baseframe, # # But each element DataFrames are re-used. # # => @@ -591,35 +645,10 @@ # @yieldreturn [Vector, Array, Arrow::Array] # an updated column value which are overloaded. # @return [SubFrames] # a new SubFrames object with updated DataFrames. # @example - # subframes - # - # # => - # #<RedAmber::SubFrames : 0x000000000000c33c> - # @baseframe=#<RedAmber::DataFrame : 6 x 3 Vectors, 0x000000000000c350> - # 3 SubFrames: [2, 3, 1] in sizes. - # --- - # #<RedAmber::DataFrame : 2 x 3 Vectors, 0x000000000000c364> - # x y z - # <uint8> <string> <boolean> - # 0 1 A false - # 1 2 A true - # --- - # #<RedAmber::DataFrame : 3 x 3 Vectors, 0x000000000000c378> - # x y z - # <uint8> <string> <boolean> - # 0 3 B false - # 1 4 B (nil) - # 2 5 B true - # --- - # #<RedAmber::DataFrame : 1 x 3 Vectors, 0x000000000000c38c> - # x y z - # <uint8> <string> <boolean> - # 0 6 C false - # # subframes.assign(:x_plus1) { x + 1 } # # # => # #<RedAmber::SubFrames : 0x000000000000c3a0> # @baseframe=#<RedAmber::DataFrame : 6 x 4 Vectors, 0x000000000000c3b4> @@ -910,11 +939,11 @@ # Indices at the top of each sub DataFrames. # # @return [Array<Integer>] # indices of offset of each sub DataFrames. # @example When `sizes` is [2, 3, 1]. - # sf.offset_indices # => [0, 2, 5] + # subframes.offset_indices # => [0, 2, 5] # @since 0.4.0 # def offset_indices sum = 0 sizes.map do |size| @@ -1034,12 +1063,18 @@ # 0 6 C false # # @since 0.4.0 # def inspect(limit: 16) + shape = + if @baseframe.is_a?(Enumerator) + "Enumerator::Lazy:size=#{@baseframe.size}" + else + baseframe.shape_str(with_id: true) + end sizes_truncated = (size > limit ? sizes.take(limit) << '...' : sizes).join(', ') "#<#{self.class} : #{format('0x%016x', object_id)}>\n" \ - "@baseframe=#<#{baseframe.shape_str(with_id: true)}>\n" \ + "@baseframe=#<#{shape}>\n" \ "#{size} SubFrame#{pl(size)}: " \ "[#{sizes_truncated}] in size#{pl(size)}.\n" \ "---\n#{_to_s(limit: limit, with_id: true)}" end