lib/polars/lazy_frame.rb in polars-df-0.1.1 vs lib/polars/lazy_frame.rb in polars-df-0.1.2

- old
+ new

@@ -1,15 +1,159 @@ module Polars + # Representation of a Lazy computation graph/query againat a DataFrame. class LazyFrame + # @private attr_accessor :_ldf + # @private def self._from_rbldf(rb_ldf) ldf = LazyFrame.allocate ldf._ldf = rb_ldf ldf end + # @private + def self._scan_csv( + file, + has_header: true, + sep: ",", + comment_char: nil, + quote_char: '"', + skip_rows: 0, + dtypes: nil, + null_values: nil, + ignore_errors: false, + cache: true, + with_column_names: nil, + infer_schema_length: 100, + n_rows: nil, + encoding: "utf8", + low_memory: false, + rechunk: true, + skip_rows_after_header: 0, + row_count_name: nil, + row_count_offset: 0, + parse_dates: false, + eol_char: "\n" + ) + dtype_list = nil + if !dtypes.nil? + dtype_list = [] + dtypes.each do |k, v| + dtype_list << [k, Utils.rb_type_to_dtype(v)] + end + end + processed_null_values = Utils._process_null_values(null_values) + + _from_rbldf( + RbLazyFrame.new_from_csv( + file, + sep, + has_header, + ignore_errors, + skip_rows, + n_rows, + cache, + dtype_list, + low_memory, + comment_char, + quote_char, + processed_null_values, + infer_schema_length, + with_column_names, + rechunk, + skip_rows_after_header, + encoding, + Utils._prepare_row_count_args(row_count_name, row_count_offset), + parse_dates, + eol_char + ) + ) + end + + # @private + def self._scan_parquet( + file, + n_rows: nil, + cache: true, + parallel: "auto", + rechunk: true, + row_count_name: nil, + row_count_offset: 0, + storage_options: nil, + low_memory: false + ) + _from_rbldf( + RbLazyFrame.new_from_parquet( + file, + n_rows, + cache, + parallel, + rechunk, + Utils._prepare_row_count_args(row_count_name, row_count_offset), + low_memory + ) + ) + end + + # @private + def self._scan_ipc( + file, + n_rows: nil, + cache: true, + rechunk: true, + row_count_name: nil, + row_count_offset: 0, + storage_options: nil, + memory_map: true + ) + if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname)) + file = Utils.format_path(file) + end + + _from_rbldf( + RbLazyFrame.new_from_ipc( + file, + n_rows, + cache, + rechunk, + Utils._prepare_row_count_args(row_count_name, row_count_offset), + memory_map + ) + ) + end + + # @private + def self._scan_ndjson( + file, + infer_schema_length: nil, + batch_size: nil, + n_rows: nil, + low_memory: false, + rechunk: true, + row_count_name: nil, + row_count_offset: 0 + ) + _from_rbldf( + RbLazyFrame.new_from_ndjson( + file, + infer_schema_length, + batch_size, + n_rows, + low_memory, + rechunk, + Utils._prepare_row_count_args(row_count_name, row_count_offset) + ) + ) + end + + # def self.from_json + # end + + # def self.read_json + # end + # def columns # end # def dtypes # end @@ -51,10 +195,11 @@ # end # def profile # end + # def collect( type_coercion: true, predicate_pushdown: true, projection_pushdown: true, simplify_expression: true, @@ -88,20 +233,22 @@ end # def fetch # end + # def lazy self end # def cache # end # def cleared # end + # def filter(predicate) _from_rbldf( _ldf.filter( Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr ) @@ -126,10 +273,11 @@ # end # def join_asof # end + # def join( other, left_on: nil, right_on: nil, on: nil, @@ -200,17 +348,19 @@ end # def with_context # end + # def with_column(column) with_columns([column]) end # def drop # end + # def rename(mapping) existing = mapping.keys _new = mapping.values _from_rbldf(_ldf.rename(existing, _new)) end @@ -249,10 +399,11 @@ # end # def fill_null # end + # def fill_nan(fill_value) if !fill_value.is_a?(Expr) fill_value = Utils.lit(fill_value) end _from_rbldf(_ldf.fill_nan(fill_value._rbexpr)) @@ -280,11 +431,14 @@ # end # def quantile # end - # def explode - # end + # + def explode(columns) + columns = Utils.selection_to_rbexpr_list(columns) + _from_rbldf(_ldf.explode(columns)) + end # def unique # end # def drop_nulls