lib/arrow/table.rb in red-arrow-0.13.0 vs lib/arrow/table.rb in red-arrow-0.14.0
- old
+ new
@@ -28,31 +28,158 @@
end
end
alias_method :initialize_raw, :initialize
private :initialize_raw
- def initialize(schema_or_raw_table_or_columns, columns=nil)
- if columns.nil?
- if schema_or_raw_table_or_columns[0].is_a?(Column)
- columns = schema_or_raw_table_or_columns
- fields = columns.collect(&:field)
+
+ # Creates a new {Arrow::Table}.
+ #
+ # @overload initialize(columns)
+ #
+ # @param columns [::Array<Arrow::Column>] The columns of the table.
+ #
+ # @example Create a table from columns
+ # count_field = Arrow::Field.new("count", :uint32)
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+ # count_column = Arrow::Column.new(count_field, count_array)
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+ # visible_column = Arrow::Column.new(visible_field, visible_array)
+ # Arrow::Table.new([count_column, visible_column])
+ #
+ # @overload initialize(raw_table)
+ #
+ # @param raw_table [Hash<String, Arrow::Array>]
+ # The pairs of column name and values of the table. Column values is
+ # `Arrow::Array`.
+ #
+ # @example Create a table from column name and values
+ # Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
+ # "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
+ #
+ # @overload initialize(raw_table)
+ #
+ # @param raw_table [Hash<String, Arrow::ChunkedArray>]
+ # The pairs of column name and values of the table. Column values is
+ # `Arrow::ChunkedArray`.
+ #
+ # @example Create a table from column name and values
+ # count_chunks = [
+ # Arrow::UInt32Array.new([0, 2]),
+ # Arrow::UInt32Array.new([nil, 4]),
+ # ]
+ # visible_chunks = [
+ # Arrow::BooleanArray.new([true]),
+ # Arrow::BooleanArray.new([nil, nil, false]),
+ # ]
+ # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
+ # "visible" => Arrow::ChunkedArray.new(visible_chunks))
+ #
+ # @overload initialize(schema, columns)
+ #
+ # @param schema [Arrow::Schema] The schema of the table.
+ # You can also specify schema as primitive Ruby objects.
+ # See {Arrow::Schema#initialize} for details.
+ #
+ # @param columns [::Array<Arrow::Column>] The data of the table.
+ #
+ # @example Create a table from schema and columns
+ # count_field = Arrow::Field.new("count", :uint32)
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+ # count_column = Arrow::Column.new(count_field, count_array)
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+ # visible_column = Arrow::Column.new(visible_field, visible_array)
+ # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
+ # [count_column, visible_column])
+ #
+ # @overload initialize(schema, arrays)
+ #
+ # @param schema [Arrow::Schema] The schema of the table.
+ # You can also specify schema as primitive Ruby objects.
+ # See {Arrow::Schema#initialize} for details.
+ #
+ # @param arrays [::Array<Arrow::Array>] The data of the table.
+ #
+ # @example Create a table from schema and arrays
+ # count_field = Arrow::Field.new("count", :uint32)
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+ # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
+ # [count_array, visible_array])
+ #
+ # @overload initialize(schema, record_batches)
+ #
+ # @param schema [Arrow::Schema] The schema of the table.
+ # You can also specify schema as primitive Ruby objects.
+ # See {Arrow::Schema#initialize} for details.
+ #
+ # @param arrays [::Array<Arrow::RecordBatch>] The data of the table.
+ #
+ # @example Create a table from schema and record batches
+ # count_field = Arrow::Field.new("count", :uint32)
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # schema = Arrow::Schema.new([count_field, visible_field])
+ # record_batches = [
+ # Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
+ # Arrow::RecordBatch.new(schema, [[4, false]]),
+ # ]
+ # Arrow::Table.new(schema, record_batches)
+ #
+ # @overload initialize(schema, raw_records)
+ #
+ # @param schema [Arrow::Schema] The schema of the table.
+ # You can also specify schema as primitive Ruby objects.
+ # See {Arrow::Schema#initialize} for details.
+ #
+ # @param arrays [::Array<::Array>] The data of the table as primitive
+ # Ruby objects.
+ #
+ # @example Create a table from schema and raw records
+ # schema = {
+ # count: :uint32,
+ # visible: :boolean,
+ # }
+ # raw_records = [
+ # [0, true],
+ # [2, nil],
+ # [nil, nil],
+ # [4, false],
+ # ]
+ # Arrow::Table.new(schema, raw_records)
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ if args[0][0].is_a?(Column)
+ values = args[0]
+ fields = values.collect(&:field)
schema = Schema.new(fields)
else
- raw_table = schema_or_raw_table_or_columns
+ raw_table = args[0]
fields = []
- columns = []
+ values = []
raw_table.each do |name, array|
field = Field.new(name.to_s, array.value_data_type)
fields << field
- columns << Column.new(field, array)
+ values << Column.new(field, array)
end
schema = Schema.new(fields)
end
+ when 2
+ schema = args[0]
+ schema = Schema.new(schema) unless schema.is_a?(Schema)
+ values = args[1]
+ if values[0].is_a?(::Array)
+ values = [RecordBatch.new(schema, values)]
+ end
else
- schema = schema_or_raw_table_or_columns
+ message = "wrong number of arguments (given, #{n_args}, expected 1..2)"
+ raise ArgumentError, message
end
- initialize_raw(schema, columns)
+ initialize_raw(schema, values)
end
def columns
@columns ||= n_columns.times.collect {|i| get_column(i)}
end
@@ -69,61 +196,116 @@
alias_method :size, :n_rows
alias_method :length, :n_rows
alias_method :[], :find_column
- # TODO
+ alias_method :slice_raw, :slice
+
+ # @overload slice(offset, length)
#
- # @return [Arrow::Table]
+ # @param offset [Integer] The offset of sub Arrow::Table.
+ # @param length [Integer] The length of sub Arrow::Table.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only from
+ # `offset` to `offset + length` range.
+ #
+ # @overload slice(index)
+ #
+ # @param index [Integer] The index in this table.
+ # @return [Arrow::Record]
+ # The `Arrow::Record` corresponding to index of
+ # the table.
+ #
+ # @overload slice(booleans)
+ #
+ # @param booleans [::Array<Boolean>]
+ # The values indicating the target rows.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows of indices
+ # the values of `booleans` is true.
+ #
+ # @overload slice(boolean_array)
+ #
+ # @param boolean_array [::Array<Arrow::BooleanArray>]
+ # The values indicating the target rows.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows of indices
+ # the values of `boolean_array` is true.
+ #
+ # @overload slice(range)
+ #
+ # @param range_included_end [Range] The range indicating the target rows.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows of the range of indices.
+ #
+ # @overload slice
+ #
+ # @yield [slicer] Gives slicer that constructs condition to select records.
+ # @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
+ # build condition.
+ # @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
+ # The condition to select records.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows matched by condition
+ # specified by slicer.
def slice(*args)
slicers = []
- expected_n_args = nil
- case args.size
- when 0
- expected_n_args = "1..2" unless block_given?
- when 1
- slicers << args[0]
- when 2
- from, to = args
- slicers << (from...(from + to))
- else
- if block_given?
- expected_n_args = "0..2"
- else
- expected_n_args = "1..2"
- end
- end
- if expected_n_args
- message = "wrong number of arguments " +
- "(given #{args.size}, expected #{expected_n_args})"
- raise ArgumentError, message
- end
-
if block_given?
+ unless args.empty?
+ raise ArgumentError, "must not specify both arguments and block"
+ end
block_slicer = yield(Slicer.new(self))
case block_slicer
- when nil
- # Ignore
when ::Array
slicers.concat(block_slicer)
else
slicers << block_slicer
end
+ else
+ expected_n_args = nil
+ case args.size
+ when 1
+ if args[0].is_a?(Integer)
+ index = args[0]
+ index += n_rows if index < 0
+ return nil if index < 0
+ return nil if index >= n_rows
+ return Record.new(self, index)
+ else
+ slicers << args[0]
+ end
+ when 2
+ offset, length = args
+ slicers << (offset...(offset + length))
+ else
+ expected_n_args = "1..2"
+ end
+ if expected_n_args
+ message = "wrong number of arguments " +
+ "(given #{args.size}, expected #{expected_n_args})"
+ raise ArgumentError, message
+ end
end
ranges = []
slicers.each do |slicer|
slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
case slicer
when Integer
slicer += n_rows if slicer < 0
- ranges << [slicer, slicer]
+ ranges << [slicer, n_rows - 1]
when Range
- from = slicer.first
+ original_from = from = slicer.first
to = slicer.last
to -= 1 if slicer.exclude_end?
from += n_rows if from < 0
+ if from < 0 or from >= n_rows
+ message =
+ "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
+ "#{original_from}"
+ raise ArgumentError, message
+ end
to += n_rows if to < 0
ranges << [from, to]
when ::Array
boolean_array_to_slice_ranges(slicer, 0, ranges)
when ChunkedArray
@@ -328,50 +510,19 @@
if in_target
ranges << [target_start, offset + array.length - 1]
end
end
- # TODO: Almost codes should be implemented in Apache Arrow C++.
def slice_by_ranges(ranges)
- sliced_columns = columns.collect do |column|
- chunks = []
- arrays = column.data.each_chunk.to_a
- offset = 0
- offset_in_array = 0
- ranges.each do |from, to|
- range_size = to - from + 1
- while range_size > 0
- while offset + arrays.first.length - offset_in_array < from
- offset += arrays.first.length - offset_in_array
- arrays.shift
- offset_in_array = 0
- end
- if offset < from
- skipped_size = from - offset
- offset += skipped_size
- offset_in_array += skipped_size
- end
- array = arrays.first
- array_length = array.length
- rest_length = array_length - offset_in_array
- if rest_length <= range_size
- chunks << array.slice(offset_in_array, array_length)
- offset += rest_length
- range_size -= rest_length
- offset_in_array = 0
- arrays.shift
- else
- chunks << array.slice(offset_in_array, range_size)
- offset += range_size
- offset_in_array += range_size
- range_size = 0
- end
- end
- end
- Column.new(column.field, ChunkedArray.new(chunks))
+ sliced_table = []
+ ranges.each do |from, to|
+ sliced_table << slice_raw(from, to - from + 1)
end
-
- self.class.new(schema, sliced_columns)
+ if sliced_table.size > 1
+ sliced_table[0].concatenate(sliced_table[1..-1])
+ else
+ sliced_table[0]
+ end
end
def ensure_column(name, data)
case data
when Array