lib/arrow/table.rb in red-arrow-0.13.0 vs lib/arrow/table.rb in red-arrow-0.14.0

- old
+ new

@@ -28,31 +28,158 @@ end end alias_method :initialize_raw, :initialize private :initialize_raw - def initialize(schema_or_raw_table_or_columns, columns=nil) - if columns.nil? - if schema_or_raw_table_or_columns[0].is_a?(Column) - columns = schema_or_raw_table_or_columns - fields = columns.collect(&:field) + + # Creates a new {Arrow::Table}. + # + # @overload initialize(columns) + # + # @param columns [::Array<Arrow::Column>] The columns of the table. + # + # @example Create a table from columns + # count_field = Arrow::Field.new("count", :uint32) + # count_array = Arrow::UInt32Array.new([0, 2, nil, 4]) + # count_column = Arrow::Column.new(count_field, count_array) + # visible_field = Arrow::Field.new("visible", :boolean) + # visible_array = Arrow::BooleanArray.new([true, nil, nil, false]) + # visible_column = Arrow::Column.new(visible_field, visible_array) + # Arrow::Table.new([count_column, visible_column]) + # + # @overload initialize(raw_table) + # + # @param raw_table [Hash<String, Arrow::Array>] + # The pairs of column name and values of the table. Column values is + # `Arrow::Array`. + # + # @example Create a table from column name and values + # Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]), + # "visible" => Arrow::BooleanArray.new([true, nil, nil, false])) + # + # @overload initialize(raw_table) + # + # @param raw_table [Hash<String, Arrow::ChunkedArray>] + # The pairs of column name and values of the table. Column values is + # `Arrow::ChunkedArray`. + # + # @example Create a table from column name and values + # count_chunks = [ + # Arrow::UInt32Array.new([0, 2]), + # Arrow::UInt32Array.new([nil, 4]), + # ] + # visible_chunks = [ + # Arrow::BooleanArray.new([true]), + # Arrow::BooleanArray.new([nil, nil, false]), + # ] + # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks), + # "visible" => Arrow::ChunkedArray.new(visible_chunks)) + # + # @overload initialize(schema, columns) + # + # @param schema [Arrow::Schema] The schema of the table. + # You can also specify schema as primitive Ruby objects. + # See {Arrow::Schema#initialize} for details. + # + # @param columns [::Array<Arrow::Column>] The data of the table. + # + # @example Create a table from schema and columns + # count_field = Arrow::Field.new("count", :uint32) + # count_array = Arrow::UInt32Array.new([0, 2, nil, 4]) + # count_column = Arrow::Column.new(count_field, count_array) + # visible_field = Arrow::Field.new("visible", :boolean) + # visible_array = Arrow::BooleanArray.new([true, nil, nil, false]) + # visible_column = Arrow::Column.new(visible_field, visible_array) + # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]), + # [count_column, visible_column]) + # + # @overload initialize(schema, arrays) + # + # @param schema [Arrow::Schema] The schema of the table. + # You can also specify schema as primitive Ruby objects. + # See {Arrow::Schema#initialize} for details. + # + # @param arrays [::Array<Arrow::Array>] The data of the table. + # + # @example Create a table from schema and arrays + # count_field = Arrow::Field.new("count", :uint32) + # count_array = Arrow::UInt32Array.new([0, 2, nil, 4]) + # visible_field = Arrow::Field.new("visible", :boolean) + # visible_array = Arrow::BooleanArray.new([true, nil, nil, false]) + # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]), + # [count_array, visible_array]) + # + # @overload initialize(schema, record_batches) + # + # @param schema [Arrow::Schema] The schema of the table. + # You can also specify schema as primitive Ruby objects. + # See {Arrow::Schema#initialize} for details. + # + # @param arrays [::Array<Arrow::RecordBatch>] The data of the table. + # + # @example Create a table from schema and record batches + # count_field = Arrow::Field.new("count", :uint32) + # visible_field = Arrow::Field.new("visible", :boolean) + # schema = Arrow::Schema.new([count_field, visible_field]) + # record_batches = [ + # Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]), + # Arrow::RecordBatch.new(schema, [[4, false]]), + # ] + # Arrow::Table.new(schema, record_batches) + # + # @overload initialize(schema, raw_records) + # + # @param schema [Arrow::Schema] The schema of the table. + # You can also specify schema as primitive Ruby objects. + # See {Arrow::Schema#initialize} for details. + # + # @param arrays [::Array<::Array>] The data of the table as primitive + # Ruby objects. + # + # @example Create a table from schema and raw records + # schema = { + # count: :uint32, + # visible: :boolean, + # } + # raw_records = [ + # [0, true], + # [2, nil], + # [nil, nil], + # [4, false], + # ] + # Arrow::Table.new(schema, raw_records) + def initialize(*args) + n_args = args.size + case n_args + when 1 + if args[0][0].is_a?(Column) + values = args[0] + fields = values.collect(&:field) schema = Schema.new(fields) else - raw_table = schema_or_raw_table_or_columns + raw_table = args[0] fields = [] - columns = [] + values = [] raw_table.each do |name, array| field = Field.new(name.to_s, array.value_data_type) fields << field - columns << Column.new(field, array) + values << Column.new(field, array) end schema = Schema.new(fields) end + when 2 + schema = args[0] + schema = Schema.new(schema) unless schema.is_a?(Schema) + values = args[1] + if values[0].is_a?(::Array) + values = [RecordBatch.new(schema, values)] + end else - schema = schema_or_raw_table_or_columns + message = "wrong number of arguments (given, #{n_args}, expected 1..2)" + raise ArgumentError, message end - initialize_raw(schema, columns) + initialize_raw(schema, values) end def columns @columns ||= n_columns.times.collect {|i| get_column(i)} end @@ -69,61 +196,116 @@ alias_method :size, :n_rows alias_method :length, :n_rows alias_method :[], :find_column - # TODO + alias_method :slice_raw, :slice + + # @overload slice(offset, length) # - # @return [Arrow::Table] + # @param offset [Integer] The offset of sub Arrow::Table. + # @param length [Integer] The length of sub Arrow::Table. + # @return [Arrow::Table] + # The sub `Arrow::Table` that covers only from + # `offset` to `offset + length` range. + # + # @overload slice(index) + # + # @param index [Integer] The index in this table. + # @return [Arrow::Record] + # The `Arrow::Record` corresponding to index of + # the table. + # + # @overload slice(booleans) + # + # @param booleans [::Array<Boolean>] + # The values indicating the target rows. + # @return [Arrow::Table] + # The sub `Arrow::Table` that covers only rows of indices + # the values of `booleans` is true. + # + # @overload slice(boolean_array) + # + # @param boolean_array [::Array<Arrow::BooleanArray>] + # The values indicating the target rows. + # @return [Arrow::Table] + # The sub `Arrow::Table` that covers only rows of indices + # the values of `boolean_array` is true. + # + # @overload slice(range) + # + # @param range_included_end [Range] The range indicating the target rows. + # @return [Arrow::Table] + # The sub `Arrow::Table` that covers only rows of the range of indices. + # + # @overload slice + # + # @yield [slicer] Gives slicer that constructs condition to select records. + # @yieldparam slicer [Arrow::Slicer] The slicer that helps us to + # build condition. + # @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>] + # The condition to select records. + # @return [Arrow::Table] + # The sub `Arrow::Table` that covers only rows matched by condition + # specified by slicer. def slice(*args) slicers = [] - expected_n_args = nil - case args.size - when 0 - expected_n_args = "1..2" unless block_given? - when 1 - slicers << args[0] - when 2 - from, to = args - slicers << (from...(from + to)) - else - if block_given? - expected_n_args = "0..2" - else - expected_n_args = "1..2" - end - end - if expected_n_args - message = "wrong number of arguments " + - "(given #{args.size}, expected #{expected_n_args})" - raise ArgumentError, message - end - if block_given? + unless args.empty? + raise ArgumentError, "must not specify both arguments and block" + end block_slicer = yield(Slicer.new(self)) case block_slicer - when nil - # Ignore when ::Array slicers.concat(block_slicer) else slicers << block_slicer end + else + expected_n_args = nil + case args.size + when 1 + if args[0].is_a?(Integer) + index = args[0] + index += n_rows if index < 0 + return nil if index < 0 + return nil if index >= n_rows + return Record.new(self, index) + else + slicers << args[0] + end + when 2 + offset, length = args + slicers << (offset...(offset + length)) + else + expected_n_args = "1..2" + end + if expected_n_args + message = "wrong number of arguments " + + "(given #{args.size}, expected #{expected_n_args})" + raise ArgumentError, message + end end ranges = [] slicers.each do |slicer| slicer = slicer.evaluate if slicer.respond_to?(:evaluate) case slicer when Integer slicer += n_rows if slicer < 0 - ranges << [slicer, slicer] + ranges << [slicer, n_rows - 1] when Range - from = slicer.first + original_from = from = slicer.first to = slicer.last to -= 1 if slicer.exclude_end? from += n_rows if from < 0 + if from < 0 or from >= n_rows + message = + "offset is out of range (-#{n_rows + 1},#{n_rows}): " + + "#{original_from}" + raise ArgumentError, message + end to += n_rows if to < 0 ranges << [from, to] when ::Array boolean_array_to_slice_ranges(slicer, 0, ranges) when ChunkedArray @@ -328,50 +510,19 @@ if in_target ranges << [target_start, offset + array.length - 1] end end - # TODO: Almost codes should be implemented in Apache Arrow C++. def slice_by_ranges(ranges) - sliced_columns = columns.collect do |column| - chunks = [] - arrays = column.data.each_chunk.to_a - offset = 0 - offset_in_array = 0 - ranges.each do |from, to| - range_size = to - from + 1 - while range_size > 0 - while offset + arrays.first.length - offset_in_array < from - offset += arrays.first.length - offset_in_array - arrays.shift - offset_in_array = 0 - end - if offset < from - skipped_size = from - offset - offset += skipped_size - offset_in_array += skipped_size - end - array = arrays.first - array_length = array.length - rest_length = array_length - offset_in_array - if rest_length <= range_size - chunks << array.slice(offset_in_array, array_length) - offset += rest_length - range_size -= rest_length - offset_in_array = 0 - arrays.shift - else - chunks << array.slice(offset_in_array, range_size) - offset += range_size - offset_in_array += range_size - range_size = 0 - end - end - end - Column.new(column.field, ChunkedArray.new(chunks)) + sliced_table = [] + ranges.each do |from, to| + sliced_table << slice_raw(from, to - from + 1) end - - self.class.new(schema, sliced_columns) + if sliced_table.size > 1 + sliced_table[0].concatenate(sliced_table[1..-1]) + else + sliced_table[0] + end end def ensure_column(name, data) case data when Array