lib/polars/data_frame.rb in polars-df-0.1.1 vs lib/polars/data_frame.rb in polars-df-0.1.2

- old
+ new

@@ -1,94 +1,338 @@
 module Polars
+  # Two-dimensional data structure representing data as a table with rows and columns.
   class DataFrame
+    # @private
     attr_accessor :_df

-    def initialize(data = nil)
+    # Create a new DataFrame.
+    #
+    # @param data [Hash, Array, Series, nil]
+    #   Two-dimensional data in various forms. Hash must contain Arrays.
+    #   Array may contain Series.
+    # @param columns [Array, Hash, nil]
+    #   Column labels to use for resulting DataFrame. If specified, overrides any
+    #   labels already present in the data. Must match data dimensions.
+    # @param orient ["col", "row", nil]
+    #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
+    #   the orientation is inferred by matching the columns and data dimensions. If
+    #   this does not yield conclusive results, column orientation is used.
+    def initialize(data = nil, columns: nil, orient: nil)
       if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
         result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
         data = {}
         result.columns.each_with_index do |k, i|
           data[k] = result.rows.map { |r| r[i] }
         end
       end

       if data.nil?
-        self._df = hash_to_rbdf({})
+        self._df = hash_to_rbdf({}, columns: columns)
       elsif data.is_a?(Hash)
         data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-        self._df = hash_to_rbdf(data)
+        self._df = hash_to_rbdf(data, columns: columns)
       elsif data.is_a?(Array)
-        self._df = sequence_to_rbdf(data)
+        self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
       elsif data.is_a?(Series)
-        self._df = series_to_rbdf(data)
+        self._df = series_to_rbdf(data, columns: columns)
       else
         raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
       end
     end

+    # @private
     def self._from_rbdf(rb_df)
       df = DataFrame.allocate
       df._df = rb_df
       df
     end

-    def self._read_csv(file, has_header: true)
+    # def self._from_hashes
+    # end
+
+    # def self._from_hash
+    # end
+
+    # def self._from_records
+    # end
+
+    # def self._from_numo
+    # end
+
+    # no self._from_arrow
+
+    # no self._from_pandas
+
+    # @private
+    def self._read_csv(
+      file,
+      has_header: true,
+      columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 8192,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
       if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
-        file = Utils.format_path(file)
+        path = Utils.format_path(file)
+      else
+        path = nil
+        # if defined?(StringIO) && file.is_a?(StringIO)
+        #   file = file.string
+        # end
       end

-      _from_rbdf(RbDataFrame.read_csv(file, has_header))
+      dtype_list = nil
+      dtype_slice = nil
+      if !dtypes.nil?
+        if dtypes.is_a?(Hash)
+          dtype_list = []
+          dtypes.each do |k, v|
+            dtype_list << [k, Utils.rb_type_to_dtype(v)]
+          end
+        elsif dtypes.is_a?(Array)
+          dtype_slice = dtypes
+        else
+          raise ArgumentError, "dtypes arg should be an Array or Hash"
+        end
+      end
+
+      processed_null_values = Utils._process_null_values(null_values)
+
+      if columns.is_a?(String)
+        columns = [columns]
+      end
+      if file.is_a?(String) && file.include?("*")
+        raise Todo
+      end
+
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      _from_rbdf(
+        RbDataFrame.read_csv(
+          file,
+          infer_schema_length,
+          batch_size,
+          has_header,
+          ignore_errors,
+          n_rows,
+          skip_rows,
+          projection,
+          sep,
+          rechunk,
+          columns,
+          encoding,
+          n_threads,
+          path,
+          dtype_list,
+          dtype_slice,
+          low_memory,
+          comment_char,
+          quote_char,
+          processed_null_values,
+          parse_dates,
+          skip_rows_after_header,
+          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          sample_size,
+          eol_char
+        )
+      )
     end
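
The private `_read_csv` above now accepts the full set of CSV options. A minimal usage sketch, assuming the public `Polars.read_csv` wrapper forwards these keywords to it (as in upstream polars); the file name is illustrative:

    # Treat "n/a" as null, stop after 1000 rows, and pin one column's dtype
    # instead of relying on schema inference.
    df = Polars.read_csv(
      "data.csv",
      null_values: "n/a",
      n_rows: 1000,
      dtypes: {"id" => :i64}
    )
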

+    # @private
     def self._read_parquet(file)
       if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
         file = Utils.format_path(file)
       end
       _from_rbdf(RbDataFrame.read_parquet(file))
     end

+    # def self._read_avro
+    # end
+
+    # @private
+    def self._read_ipc(
+      file,
+      columns: nil,
+      n_rows: nil,
+      row_count_name: nil,
+      row_count_offset: 0,
+      rechunk: true,
+      memory_map: true
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+      if columns.is_a?(String)
+        columns = [columns]
+      end
+
+      if file.is_a?(String) && file.include?("*")
+        raise Todo
+      end
+
+      projection, columns = Utils.handle_projection_columns(columns)
+      _from_rbdf(
+        RbDataFrame.read_ipc(
+          file,
+          columns,
+          projection,
+          n_rows,
+          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          memory_map
+        )
+      )
+    end
+
+    # @private
     def self._read_json(file)
       if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
         file = Utils.format_path(file)
       end
       _from_rbdf(RbDataFrame.read_json(file))
     end

+    # @private
     def self._read_ndjson(file)
       if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
         file = Utils.format_path(file)
       end
       _from_rbdf(RbDataFrame.read_ndjson(file))
     end

+    # Get the shape of the DataFrame.
+    #
+    # @return [Array]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+    #   df.shape
+    #   # => [5, 1]
     def shape
       _df.shape
     end

+    # Get the height of the DataFrame.
+    #
+    # @return [Integer]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+    #   df.height
+    #   # => 5
     def height
       _df.height
     end

+    # Get the width of the DataFrame.
+    #
+    # @return [Integer]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
+    #   df.width
+    #   # => 1
     def width
       _df.width
     end

+    # Get column names.
+    #
+    # @return [Array]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "foo" => [1, 2, 3],
+    #     "bar" => [6, 7, 8],
+    #     "ham" => ["a", "b", "c"]
+    #   })
+    #   df.columns
+    #   # => ["foo", "bar", "ham"]
     def columns
       _df.columns
     end
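
`_read_ipc` is likewise private; a usage sketch, assuming a `Polars.read_ipc` helper forwards to it (helper and file name are assumptions, not confirmed by this diff):

    # Read two columns from an Arrow IPC/Feather file and prepend a
    # row-count column named "idx" (offset defaults to 0).
    df = Polars.read_ipc(
      "data.arrow",
      columns: ["foo", "bar"],
      row_count_name: "idx"
    )
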

+    # Change the column names of the DataFrame.
+    #
+    # @param columns [Array]
+    #   A list with new names for the DataFrame.
+    #   The length of the list should be equal to the width of the DataFrame.
+    #
+    # @return [Object]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "foo" => [1, 2, 3],
+    #     "bar" => [6, 7, 8],
+    #     "ham" => ["a", "b", "c"]
+    #   })
+    #   df.columns = ["apple", "banana", "orange"]
+    #   df
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌───────┬────────┬────────┐
+    #   # │ apple ┆ banana ┆ orange │
+    #   # │ ---   ┆ ---    ┆ ---    │
+    #   # │ i64   ┆ i64    ┆ str    │
+    #   # ╞═══════╪════════╪════════╡
+    #   # │ 1     ┆ 6      ┆ a      │
+    #   # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+    #   # │ 2     ┆ 7      ┆ b      │
+    #   # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+    #   # │ 3     ┆ 8      ┆ c      │
+    #   # └───────┴────────┴────────┘
     def columns=(columns)
       _df.set_column_names(columns)
     end

+    # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
+    #
+    # @return [Array]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "foo" => [1, 2, 3],
+    #     "bar" => [6.0, 7.0, 8.0],
+    #     "ham" => ["a", "b", "c"]
+    #   })
+    #   df.dtypes
+    #   # => [:i64, :f64, :str]
     def dtypes
-      _df.dtypes.map(&:to_sym)
+      _df.dtypes
     end

+    # Get the schema.
+    #
+    # @return [Hash]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "foo" => [1, 2, 3],
+    #     "bar" => [6.0, 7.0, 8.0],
+    #     "ham" => ["a", "b", "c"]
+    #   })
+    #   df.schema
+    #   # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
     def schema
       columns.zip(dtypes).to_h
     end

     # def ==(other)
@@ -122,49 +366,97 @@
     # end

     # def %(other)
     # end

+    #
     def to_s
       _df.to_s
     end
     alias_method :inspect, :to_s

     def include?(name)
       columns.include?(name)
     end

+    # def each
+    # end
+
+    # def _pos_idx
+    # end
+
+    # def _pos_idxs
+    # end
+
+    #
     def [](name)
       Utils.wrap_s(_df.column(name))
     end

     # def []=(key, value)
     # end

+    # no to_arrow
+
+    #
     def to_h(as_series: true)
       if as_series
         get_columns.to_h { |s| [s.name, s] }
       else
         get_columns.to_h { |s| [s.name, s.to_a] }
       end
     end

-    # def to_hs / to_a
+    # def to_hashes / to_a
     # end

     # def to_numo
     # end

     # no to_pandas

+    # Select column as Series at index location.
+    #
+    # @param index [Integer]
+    #   Location of selection.
+    #
+    # @return [Series]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "foo" => [1, 2, 3],
+    #     "bar" => [6, 7, 8],
+    #     "ham" => ["a", "b", "c"]
+    #   })
+    #   df.to_series(1)
+    #   # =>
+    #   # shape: (3,)
+    #   # Series: 'bar' [i64]
+    #   # [
+    #   #         6
+    #   #         7
+    #   #         8
+    #   # ]
     def to_series(index = 0)
       if index < 0
         index = columns.length + index
       end
       Utils.wrap_s(_df.select_at_idx(index))
     end

+    # Serialize to JSON representation.
+    #
+    # @param file [String]
+    #   File path to which the result should be written.
+    # @param pretty [Boolean]
+    #   Pretty serialize json.
+    # @param row_oriented [Boolean]
+    #   Write to row-oriented json. This is slower, but more common.
+    #
+    # @return [nil]
+    #
+    # @see #write_ndjson
     def write_json(
       file,
       pretty: false,
       row_oriented: false
     )
@@ -174,19 +466,67 @@
       _df.write_json(file, pretty, row_oriented)
       nil
     end

+    # Serialize to newline delimited JSON representation.
+    #
+    # @param file [String]
+    #   File path to which the result should be written.
+    #
+    # @return [nil]
     def write_ndjson(file)
       if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
         file = Utils.format_path(file)
       end
       _df.write_ndjson(file)
       nil
     end
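
The two JSON writers above differ only in layout; a short sketch of both (file names are illustrative):

    df = Polars::DataFrame.new({"a" => [1, 2], "b" => ["x", "y"]})

    # Row-oriented JSON: [{"a":1,"b":"x"},{"a":2,"b":"y"}]
    df.write_json("rows.json", row_oriented: true)

    # Newline-delimited JSON: one object per line.
    df.write_ndjson("rows.ndjson")
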

+    # Write to comma-separated values (CSV) file.
+    #
+    # @param file [String, nil]
+    #   File path to which the result should be written. If set to `nil`
+    #   (default), the output is returned as a string instead.
+    # @param has_header [Boolean]
+    #   Whether to include header in the CSV output.
+    # @param sep [String]
+    #   Separate CSV fields with this symbol.
+    # @param quote [String]
+    #   Byte to use as quoting character.
+    # @param batch_size [Integer]
+    #   Number of rows that will be processed per thread.
+    # @param datetime_format [String, nil]
+    #   A format string, with the specifiers defined by the
+    #   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+    #   Rust crate. If no format is specified, the default fractional-second
+    #   precision is inferred from the maximum timeunit found in the frame's
+    #   Datetime columns (if any).
+    # @param date_format [String, nil]
+    #   A format string, with the specifiers defined by the
+    #   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+    #   Rust crate.
+    # @param time_format [String, nil]
+    #   A format string, with the specifiers defined by the
+    #   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+    #   Rust crate.
+    # @param float_precision [Integer, nil]
+    #   Number of decimal places to write, applied to both `:f32` and
+    #   `:f64` datatypes.
+    # @param null_value [String, nil]
+    #   A string representing null values (defaulting to the empty string).
+    #
+    # @return [String, nil]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "foo" => [1, 2, 3, 4, 5],
+    #     "bar" => [6, 7, 8, 9, 10],
+    #     "ham" => ["a", "b", "c", "d", "e"]
+    #   })
+    #   df.write_csv("file.csv")
     def write_csv(
       file = nil,
       has_header: true,
       sep: ",",
       quote: '"',
@@ -218,12 +558,11 @@
         date_format,
         time_format,
         float_precision,
         null_value
       )
-        buffer.rewind
-        return buffer.read.force_encoding(Encoding::UTF_8)
+        return buffer.string.force_encoding(Encoding::UTF_8)
       end

       if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
         file = Utils.format_path(file)
       end
@@ -244,13 +583,54 @@
     end

     # def write_avro
     # end

-    # def write_ipc
-    # end
+    # Write to Arrow IPC binary stream or Feather file.
+    #
+    # @param file [String]
+    #   File path to which the file should be written.
+    # @param compression ["uncompressed", "lz4", "zstd"]
+    #   Compression method. Defaults to "uncompressed".
+    #
+    # @return [nil]
+    def write_ipc(file, compression: "uncompressed")
+      if compression.nil?
+        compression = "uncompressed"
+      end
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+      _df.write_ipc(file, compression)
+    end
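
The `buffer.string` change above reads the in-memory CSV straight out of the `StringIO` without rewinding; that path is taken whenever `write_csv` is called without a file argument:

    df = Polars::DataFrame.new({"foo" => [1, 2, 3], "ham" => ["a", "b", "c"]})

    # With no file argument, the CSV is rendered into a StringIO and
    # returned as a UTF-8 encoded String.
    csv = df.write_csv
    # => "foo,ham\n1,a\n2,b\n3,c\n"
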

+    # Write to Apache Parquet file.
+    #
+    # @param file [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    #   Choose "snappy" for more backwards compatibility guarantees
+    #   when you deal with older parquet readers.
+    # @param compression_level [Integer, nil]
+    #   The level of compression to use. Higher compression means smaller files on
+    #   disk.
+    #
+    #   - "gzip" : min-level: 0, max-level: 10.
+    #   - "brotli" : min-level: 0, max-level: 11.
+    #   - "zstd" : min-level: 1, max-level: 22.
+    # @param statistics [Boolean]
+    #   Write statistics to the parquet headers. This requires extra compute.
+    # @param row_group_size [Integer, nil]
+    #   Size of the row groups in number of rows.
+    #   If `nil` (default), the chunks of the DataFrame are
+    #   used. Writing in smaller chunks may reduce memory pressure and improve
+    #   writing speeds.
+    #
+    # @return [nil]
     def write_parquet(
       file,
       compression: "zstd",
       compression_level: nil,
       statistics: false,
@@ -266,26 +646,159 @@
       _df.write_parquet(
         file, compression, compression_level, statistics, row_group_size
       )
     end

+    # Return an estimation of the total (heap) allocated size of the DataFrame.
+    #
+    # Estimated size is given in the specified unit (bytes by default).
+    #
+    # This estimation is the sum of the size of its buffers, validity, including
+    # nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
+    # size of 2 arrays is not the sum of the sizes computed from this function. In
+    # particular, StructArray's size is an upper bound.
+    #
+    # When an array is sliced, its allocated size remains constant because the buffer
+    # is unchanged. However, this function will yield a smaller number. This is because
+    # this function returns the visible size of the buffer, not its total capacity.
+    #
+    # FFI buffers are included in this estimation.
+    #
+    # @param unit ["b", "kb", "mb", "gb", "tb"]
+    #   Scale the returned size to the given unit.
+    #
+    # @return [Numeric]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "x" => 1_000_000.times.to_a.reverse,
+    #       "y" => 1_000_000.times.map { |v| v / 1000.0 },
+    #       "z" => 1_000_000.times.map(&:to_s)
+    #     },
+    #     columns: {"x" => :u32, "y" => :f64, "z" => :str}
+    #   )
+    #   df.estimated_size
+    #   # => 25888898
+    #   df.estimated_size("mb")
+    #   # => 24.689577102661133
     def estimated_size(unit = "b")
       sz = _df.estimated_size
       Utils.scale_bytes(sz, to: unit)
     end

     # def transpose
     # end

+    # Reverse the DataFrame.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "key" => ["a", "b", "c"],
+    #     "val" => [1, 2, 3]
+    #   })
+    #   df.reverse
+    #   # =>
+    #   # shape: (3, 2)
+    #   # ┌─────┬─────┐
+    #   # │ key ┆ val │
+    #   # │ --- ┆ --- │
+    #   # │ str ┆ i64 │
+    #   # ╞═════╪═════╡
+    #   # │ c   ┆ 3   │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ b   ┆ 2   │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ a   ┆ 1   │
+    #   # └─────┴─────┘
     def reverse
       select(Polars.col("*").reverse)
     end

+    # Rename column names.
+    #
+    # @param mapping [Hash]
+    #   Key value pairs that map from old name to new name.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "foo" => [1, 2, 3],
+    #     "bar" => [6, 7, 8],
+    #     "ham" => ["a", "b", "c"]
+    #   })
+    #   df.rename({"foo" => "apple"})
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌───────┬─────┬─────┐
+    #   # │ apple ┆ bar ┆ ham │
+    #   # │ ---   ┆ --- ┆ --- │
+    #   # │ i64   ┆ i64 ┆ str │
+    #   # ╞═══════╪═════╪═════╡
+    #   # │ 1     ┆ 6   ┆ a   │
+    #   # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2     ┆ 7   ┆ b   │
+    #   # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 3     ┆ 8   ┆ c   │
+    #   # └───────┴─────┴─────┘
     def rename(mapping)
       lazy.rename(mapping).collect(no_optimization: true)
     end
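
A short sketch of the Parquet writer documented above (file name illustrative); higher zstd levels trade write time for smaller files:

    df = Polars::DataFrame.new({"foo" => [1, 2, 3]})

    # "zstd" accepts levels 1..22; statistics cost extra compute at write
    # time but let readers skip row groups.
    df.write_parquet(
      "data.parquet",
      compression: "zstd",
      compression_level: 10,
      statistics: true
    )
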

+    # Insert a Series at a certain column index. This operation is in place.
+    #
+    # @param index [Integer]
+    #   Column index at which to insert the new `Series`.
+    # @param series [Series]
+    #   `Series` to insert.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+    #   s = Polars::Series.new("baz", [97, 98, 99])
+    #   df.insert_at_idx(1, s)
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌─────┬─────┬─────┐
+    #   # │ foo ┆ baz ┆ bar │
+    #   # │ --- ┆ --- ┆ --- │
+    #   # │ i64 ┆ i64 ┆ i64 │
+    #   # ╞═════╪═════╪═════╡
+    #   # │ 1   ┆ 97  ┆ 4   │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 2   ┆ 98  ┆ 5   │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
+    #   # │ 3   ┆ 99  ┆ 6   │
+    #   # └─────┴─────┴─────┘
+    #
+    # @example
+    #   df = Polars::DataFrame.new({
+    #     "a" => [1, 2, 3, 4],
+    #     "b" => [0.5, 4, 10, 13],
+    #     "c" => [true, true, false, true]
+    #   })
+    #   s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
+    #   df.insert_at_idx(3, s)
+    #   # =>
+    #   # shape: (4, 4)
+    #   # ┌─────┬──────┬───────┬──────┐
+    #   # │ a   ┆ b    ┆ c     ┆ d    │
+    #   # │ --- ┆ ---  ┆ ---   ┆ ---  │
+    #   # │ i64 ┆ f64  ┆ bool  ┆ f64  │
+    #   # ╞═════╪══════╪═══════╪══════╡
+    #   # │ 1   ┆ 0.5  ┆ true  ┆ -2.5 │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+    #   # │ 2   ┆ 4.0  ┆ true  ┆ 15.0 │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+    #   # │ 3   ┆ 10.0 ┆ false ┆ 20.5 │
+    #   # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+    #   # │ 4   ┆ 13.0 ┆ true  ┆ 0.0  │
+    #   # └─────┴──────┴───────┴──────┘
     def insert_at_idx(index, series)
       if index < 0
         index = columns.length + index
       end
       _df.insert_at_idx(index, series._s)
@@ -303,10 +816,11 @@
     # end

     # def replace_at_idx
     # end

+    #
     def sort(by, reverse: false, nulls_last: false)
       _from_rbdf(_df.sort(by, reverse, nulls_last))
     end

     def frame_equal(other, null_equal: true)
@@ -314,10 +828,11 @@
     end

     # def replace
     # end

+    #
     def slice(offset, length = nil)
       if !length.nil? && length < 0
         length = height - offset + length
       end
       _from_rbdf(_df.slice(offset, length))
@@ -342,10 +857,11 @@
     # end

     # def with_row_count
     # end

+    #
     def groupby(by, maintain_order: false)
       lazy.groupby(by, maintain_order: maintain_order)
     end

     # def groupby_rolling
@@ -358,10 +874,11 @@
     # end

     # def join_asof
     # end

+    #
     def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
       lazy
         .join(
           other.lazy,
           left_on: left_on,
@@ -374,10 +891,11 @@
     end

     # def apply
     # end

+    #
     def with_column(column)
       lazy
         .with_column(column)
         .collect(no_optimization: true, string_cache: false)
     end
@@ -386,12 +904,15 @@
     # end

     # def vstack
     # end

-    # def extend
-    # end
+    #
+    def extend(other)
+      _df.extend(other._df)
+      self
+    end

     # def drop
     # end

     # def drop_in_place
@@ -400,10 +921,11 @@

     # def cleared
     # end

     # clone handled by initialize_copy

+    #
     def get_columns
       _df.get_columns.map { |s| Utils.wrap_s(s) }
     end

     def get_column(name)
@@ -411,10 +933,11 @@
     end

     # def fill_null
     # end

+    #
     def fill_nan(fill_value)
       lazy.fill_nan(fill_value).collect(no_optimization: true)
     end

     # def explode
@@ -436,10 +959,11 @@
     # end

     # def shift_and_fill
     # end

+    #
     def is_duplicated
       Utils.wrap_s(_df.is_duplicated)
     end

     def is_unique
@@ -545,10 +1069,11 @@
     # end

     # def n_unique
     # end

+    #
     def rechunk
       _from_rbdf(_df.rechunk)
     end

     def null_count
@@ -577,10 +1102,11 @@
     # end

     # def interpolate
     # end

+    #
     def is_empty
       height == 0
     end
     alias_method :empty?, :is_empty

@@ -595,18 +1121,58 @@

     def initialize_copy(other)
       super
       self._df = _df._clone
     end
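
Of the methods above, `extend` is newly implemented: it appends the chunks of `other` to `self` in place and returns `self`, so calls can be chained:

    df1 = Polars::DataFrame.new({"a" => [1, 2], "b" => ["x", "y"]})
    df2 = Polars::DataFrame.new({"a" => [3], "b" => ["z"]})

    df1.extend(df2)  # mutates df1
    df1.height       # => 3
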

-    def hash_to_rbdf(data)
+    def hash_to_rbdf(data, columns: nil)
+      if !columns.nil?
+        columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
+
+        if data.empty? && dtypes
+          data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
+        else
+          data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
+        end
+        data_series = _handle_columns_arg(data_series, columns: columns)
+        return RbDataFrame.new(data_series)
+      end
+
       RbDataFrame.read_hash(data)
     end

-    def sequence_to_rbdf(data)
+    def _unpack_columns(columns, lookup_names: nil)
+      [columns.keys, columns]
+    end
+
+    def _handle_columns_arg(data, columns: nil)
+      if columns.nil?
+        data
+      else
+        if data.nil? || data.empty?
+          columns.map { |c| Series.new(c, nil)._s }
+        elsif data.length == columns.length
+          columns.each_with_index do |c, i|
+            # not in-place?
+            data[i].rename(c)
+          end
+          data
+        else
+          raise ArgumentError, "Dimensions of columns arg must match data dimensions."
+        end
+      end
+    end
+
+    def sequence_to_rbdf(data, columns: nil, orient: nil)
+      if columns || orient
+        raise Todo
+      end
       RbDataFrame.new(data.map(&:_s))
     end

-    def series_to_rbdf(data)
+    def series_to_rbdf(data, columns: nil)
+      if columns
+        raise Todo
+      end
       RbDataFrame.new([data._s])
     end

     def wrap_ldf(ldf)
       LazyFrame._from_rbldf(ldf)
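
For reference, the only sequence form `sequence_to_rbdf` handles in this version is an Array of Series; the `columns:` and `orient:` branches still raise `Todo`:

    s1 = Polars::Series.new("a", [1, 2, 3])
    s2 = Polars::Series.new("b", ["x", "y", "z"])

    # Each Series becomes one column; names come from the Series themselves.
    df = Polars::DataFrame.new([s1, s2])
    df.columns  # => ["a", "b"]
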