use magnus::{class, r_hash::ForEach, Module, RArray, RHash, Symbol, TryConvert, Value, QNIL}; use polars::chunked_array::object::PolarsObjectSafe; use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy}; use polars::datatypes::AnyValue; use polars::frame::row::Row; use polars::frame::NullStrategy; use polars::io::avro::AvroCompression; use polars::prelude::*; use polars::series::ops::NullBehavior; use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbValueError}; pub(crate) fn slice_to_wrapped(slice: &[T]) -> &[Wrap] { // Safety: // Wrap is transparent. unsafe { std::mem::transmute(slice) } } pub(crate) fn vec_extract_wrapped(buf: Vec>) -> Vec { // Safety: // Wrap is transparent. unsafe { std::mem::transmute(buf) } } #[repr(transparent)] pub struct Wrap(pub T); impl Clone for Wrap where T: Clone, { fn clone(&self) -> Self { Wrap(self.0.clone()) } } impl From for Wrap { fn from(t: T) -> Self { Wrap(t) } } pub(crate) fn get_rbseq(obj: Value) -> RbResult<(RArray, usize)> { let seq: RArray = obj.try_convert()?; let len = seq.len(); Ok((seq, len)) } pub(crate) fn get_df(obj: Value) -> RbResult { let rbdf = obj.funcall::<_, _, &RbDataFrame>("_df", ())?; Ok(rbdf.df.borrow().clone()) } pub(crate) fn get_lf(obj: Value) -> RbResult { let rbdf = obj.funcall::<_, _, &RbLazyFrame>("_ldf", ())?; Ok(rbdf.ldf.clone()) } pub(crate) fn get_series(obj: Value) -> RbResult { let rbs = obj.funcall::<_, _, &RbSeries>("_s", ())?; Ok(rbs.series.borrow().clone()) } impl TryConvert for Wrap { fn try_convert(obj: Value) -> RbResult { let (seq, len) = get_rbseq(obj)?; let mut builder = Utf8ChunkedBuilder::new("", len, len * 25); for res in seq.each() { let item = res?; match item.try_convert::() { Ok(val) => builder.append_value(&val), Err(_) => builder.append_null(), } } Ok(Wrap(builder.finish())) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { if let Ok(s) = ob.try_convert::() { Ok(Wrap(NullValues::AllColumnsSingle(s))) } else if let Ok(s) = ob.try_convert::>() { Ok(Wrap(NullValues::AllColumns(s))) } else if let Ok(s) = ob.try_convert::>() { Ok(Wrap(NullValues::Named(s))) } else { Err(RbPolarsErr::other( "could not extract value from null_values argument".into(), )) } } } impl From>> for Value { fn from(w: Wrap>) -> Self { match w.0 { AnyValue::UInt8(v) => Value::from(v), AnyValue::UInt16(v) => Value::from(v), AnyValue::UInt32(v) => Value::from(v), AnyValue::UInt64(v) => Value::from(v), AnyValue::Int8(v) => Value::from(v), AnyValue::Int16(v) => Value::from(v), AnyValue::Int32(v) => Value::from(v), AnyValue::Int64(v) => Value::from(v), AnyValue::Float32(v) => Value::from(v), AnyValue::Float64(v) => Value::from(v), AnyValue::Null => *QNIL, AnyValue::Boolean(v) => Value::from(v), AnyValue::Utf8(v) => Value::from(v), AnyValue::Date(v) => class::time() .funcall::<_, _, Value>("at", (v * 86400,)) .unwrap() .funcall::<_, _, Value>("utc", ()) .unwrap() .funcall::<_, _, Value>("to_date", ()) .unwrap(), AnyValue::Datetime(v, tu, tz) => { let t = match tu { TimeUnit::Nanoseconds => todo!(), TimeUnit::Microseconds => { let sec = v / 1000000; let subsec = v % 1000000; class::time() .funcall::<_, _, Value>("at", (sec, subsec, Symbol::new("usec"))) .unwrap() } TimeUnit::Milliseconds => todo!(), }; if tz.is_some() { todo!(); } else { t.funcall::<_, _, Value>("utc", ()).unwrap() } } _ => todo!(), } } } impl From> for Value { fn from(w: Wrap) -> Self { let pl = crate::rb_modules::polars(); match &w.0 { DataType::Int8 => pl.const_get::<_, Value>("Int8").unwrap(), DataType::Int16 => pl.const_get::<_, Value>("Int16").unwrap(), DataType::Int32 => pl.const_get::<_, Value>("Int32").unwrap(), DataType::Int64 => pl.const_get::<_, Value>("Int64").unwrap(), DataType::UInt8 => pl.const_get::<_, Value>("UInt8").unwrap(), DataType::UInt16 => pl.const_get::<_, Value>("UInt16").unwrap(), DataType::UInt32 => pl.const_get::<_, Value>("UInt32").unwrap(), DataType::UInt64 => pl.const_get::<_, Value>("UInt64").unwrap(), DataType::Float32 => pl.const_get::<_, Value>("Float32").unwrap(), DataType::Float64 => pl.const_get::<_, Value>("Float64").unwrap(), DataType::Boolean => pl.const_get::<_, Value>("Boolean").unwrap(), DataType::Utf8 => pl.const_get::<_, Value>("Utf8").unwrap(), DataType::Binary => pl.const_get::<_, Value>("Binary").unwrap(), DataType::List(inner) => { let inner = Wrap(*inner.clone()); let list_class = pl.const_get::<_, Value>("List").unwrap(); list_class.funcall::<_, _, Value>("new", (inner,)).unwrap() } DataType::Date => pl.const_get::<_, Value>("Date").unwrap(), DataType::Datetime(tu, tz) => { let datetime_class = pl.const_get::<_, Value>("Datetime").unwrap(); datetime_class .funcall::<_, _, Value>("new", (tu.to_ascii(), tz.clone())) .unwrap() } DataType::Duration(tu) => { let duration_class = pl.const_get::<_, Value>("Duration").unwrap(); duration_class .funcall::<_, _, Value>("new", (tu.to_ascii(),)) .unwrap() } DataType::Object(_) => pl.const_get::<_, Value>("Object").unwrap(), DataType::Categorical(_) => pl.const_get::<_, Value>("Categorical").unwrap(), DataType::Time => pl.const_get::<_, Value>("Time").unwrap(), DataType::Struct(fields) => { let field_class = pl.const_get::<_, Value>("Field").unwrap(); let iter = fields.iter().map(|fld| { let name = fld.name().clone(); let dtype = Wrap(fld.data_type().clone()); field_class .funcall::<_, _, Value>("new", (name, dtype)) .unwrap() }); let fields = RArray::from_iter(iter); let struct_class = pl.const_get::<_, Value>("Struct").unwrap(); struct_class .funcall::<_, _, Value>("new", (fields,)) .unwrap() } DataType::Null => pl.const_get::<_, Value>("Null").unwrap(), DataType::Unknown => pl.const_get::<_, Value>("Unknown").unwrap(), } } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let dtype = if ob.is_kind_of(class::class()) { let name = ob.funcall::<_, _, String>("name", ())?; match name.as_str() { "Polars::UInt8" => DataType::UInt8, "Polars::UInt16" => DataType::UInt16, "Polars::UInt32" => DataType::UInt32, "Polars::UInt64" => DataType::UInt64, "Polars::Int8" => DataType::Int8, "Polars::Int16" => DataType::Int16, "Polars::Int32" => DataType::Int32, "Polars::Int64" => DataType::Int64, "Polars::Utf8" => DataType::Utf8, "Polars::Binary" => DataType::Binary, "Polars::Boolean" => DataType::Boolean, "Polars::Categorical" => DataType::Categorical(None), "Polars::Date" => DataType::Date, "Polars::Datetime" => DataType::Datetime(TimeUnit::Microseconds, None), "Polars::Time" => DataType::Time, "Polars::Duration" => DataType::Duration(TimeUnit::Microseconds), "Polars::Float32" => DataType::Float32, "Polars::Float64" => DataType::Float64, // "Polars::Object" => DataType::Object(OBJECT_NAME), "Polars::List" => DataType::List(Box::new(DataType::Boolean)), "Polars::Null" => DataType::Null, "Polars::Unknown" => DataType::Unknown, dt => { return Err(RbValueError::new_err(format!( "{dt} is not a correct polars DataType.", ))) } } } else { match ob.try_convert::()?.as_str() { "u8" => DataType::UInt8, "u16" => DataType::UInt16, "u32" => DataType::UInt32, "u64" => DataType::UInt64, "i8" => DataType::Int8, "i16" => DataType::Int16, "i32" => DataType::Int32, "i64" => DataType::Int64, "str" => DataType::Utf8, "bin" => DataType::Binary, "bool" => DataType::Boolean, "cat" => DataType::Categorical(None), "date" => DataType::Date, "datetime" => DataType::Datetime(TimeUnit::Microseconds, None), "f32" => DataType::Float32, "time" => DataType::Time, "dur" => DataType::Duration(TimeUnit::Microseconds), "f64" => DataType::Float64, // "obj" => DataType::Object(OBJECT_NAME), "list" => DataType::List(Box::new(DataType::Boolean)), "null" => DataType::Null, "unk" => DataType::Unknown, _ => { return Err(RbValueError::new_err(format!( "{} is not a supported DataType.", ob ))) } } }; Ok(Wrap(dtype)) } } impl<'s> TryConvert for Wrap> { fn try_convert(ob: Value) -> RbResult { // TODO improve if let Ok(v) = ob.try_convert::() { Ok(AnyValue::Int64(v).into()) } else if let Ok(v) = ob.try_convert::() { Ok(AnyValue::Float64(v).into()) } else if ob.is_nil() { Ok(AnyValue::Null.into()) } else if ob.is_kind_of(class::hash()) { let dict = ob.try_convert::().unwrap(); let len = dict.len(); let mut keys = Vec::with_capacity(len); let mut vals = Vec::with_capacity(len); dict.foreach(|k: Value, v: Value| { let key = k.try_convert::()?; let val = v.try_convert::>()?.0; let dtype = DataType::from(&val); keys.push(Field::new(&key, dtype)); vals.push(val); Ok(ForEach::Continue) })?; Ok(Wrap(AnyValue::StructOwned(Box::new((vals, keys))))) } else { Err(RbPolarsErr::other(format!( "object type not supported {:?}", ob ))) } } } impl<'s> TryConvert for Wrap> { fn try_convert(ob: Value) -> RbResult { let mut vals: Vec>> = Vec::new(); for item in ob.try_convert::()?.each() { vals.push(item?.try_convert::>>()?); } let vals: Vec = unsafe { std::mem::transmute(vals) }; Ok(Wrap(Row(vals))) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let dict = ob.try_convert::()?; let mut schema = Vec::new(); dict.foreach(|key: String, val: Wrap| { schema.push(Field::new(&key, val.0)); Ok(ForEach::Continue) }) .unwrap(); Ok(Wrap(schema.into_iter().into())) } } #[derive(Clone, Debug)] pub struct ObjectValue { pub inner: Value, } impl Hash for ObjectValue { fn hash(&self, state: &mut H) { let h = self .inner .funcall::<_, _, isize>("hash", ()) .expect("should be hashable"); state.write_isize(h) } } impl Eq for ObjectValue {} impl PartialEq for ObjectValue { fn eq(&self, other: &Self) -> bool { self.inner.eql(&other.inner).unwrap_or(false) } } impl Display for ObjectValue { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.inner) } } impl PolarsObject for ObjectValue { fn type_name() -> &'static str { "object" } } impl From for ObjectValue { fn from(v: Value) -> Self { Self { inner: v } } } impl TryConvert for ObjectValue { fn try_convert(ob: Value) -> RbResult { Ok(ObjectValue { inner: ob }) } } impl From<&dyn PolarsObjectSafe> for &ObjectValue { fn from(val: &dyn PolarsObjectSafe) -> Self { unsafe { &*(val as *const dyn PolarsObjectSafe as *const ObjectValue) } } } // TODO remove impl ObjectValue { pub fn to_object(&self) -> Value { self.inner } } impl From for Value { fn from(val: ObjectValue) -> Self { val.inner } } impl Default for ObjectValue { fn default() -> Self { ObjectValue { inner: *QNIL } } } pub(crate) fn dicts_to_rows( records: &Value, infer_schema_len: usize, ) -> RbResult<(Vec, Vec)> { let (dicts, len) = get_rbseq(*records)?; let mut key_names = PlIndexSet::new(); for d in dicts.each().take(infer_schema_len) { let d = d?; let d = d.try_convert::()?; d.foreach(|name: String, _value: Value| { key_names.insert(name); Ok(ForEach::Continue) })?; } let mut rows = Vec::with_capacity(len); for d in dicts.each() { let d = d?; let d = d.try_convert::()?; let mut row = Vec::with_capacity(key_names.len()); for k in key_names.iter() { let val = match d.get(k.clone()) { None => AnyValue::Null, Some(val) => val.try_convert::>()?.0, }; row.push(val) } rows.push(Row(row)) } Ok((rows, key_names.into_iter().collect())) } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "backward" => AsofStrategy::Backward, "forward" => AsofStrategy::Forward, v => { return Err(RbValueError::new_err(format!( "strategy must be one of {{'backward', 'forward'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "linear" => InterpolationMethod::Linear, "nearest" => InterpolationMethod::Nearest, v => { return Err(RbValueError::new_err(format!( "method must be one of {{'linear', 'nearest'}}, got {v}", ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap> { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "uncompressed" => None, "snappy" => Some(AvroCompression::Snappy), "deflate" => Some(AvroCompression::Deflate), v => { return Err(RbValueError::new_err(format!( "compression must be one of {{'uncompressed', 'snappy', 'deflate'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "physical" => CategoricalOrdering::Physical, "lexical" => CategoricalOrdering::Lexical, v => { return Err(RbValueError::new_err(format!( "ordering must be one of {{'physical', 'lexical'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "window" => StartBy::WindowBound, "datapoint" => StartBy::DataPoint, "monday" => StartBy::Monday, v => { return Err(RbValueError::new_err(format!( "closed must be one of {{'window', 'datapoint', 'monday'}}, got {v}", ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "left" => ClosedWindow::Left, "right" => ClosedWindow::Right, "both" => ClosedWindow::Both, "none" => ClosedWindow::None, v => { return Err(RbValueError::new_err(format!( "closed must be one of {{'left', 'right', 'both', 'none'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "utf8" => CsvEncoding::Utf8, "utf8-lossy" => CsvEncoding::LossyUtf8, v => { return Err(RbValueError::new_err(format!( "encoding must be one of {{'utf8', 'utf8-lossy'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap> { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "uncompressed" => None, "lz4" => Some(IpcCompression::LZ4), "zstd" => Some(IpcCompression::ZSTD), v => { return Err(RbValueError::new_err(format!( "compression must be one of {{'uncompressed', 'lz4', 'zstd'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "inner" => JoinType::Inner, "left" => JoinType::Left, "outer" => JoinType::Outer, "semi" => JoinType::Semi, "anti" => JoinType::Anti, // #[cfg(feature = "cross_join")] // "cross" => JoinType::Cross, v => { return Err(RbValueError::new_err(format!( "how must be one of {{'inner', 'left', 'outer', 'semi', 'anti', 'cross'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "first_non_null" => ListToStructWidthStrategy::FirstNonNull, "max_width" => ListToStructWidthStrategy::MaxWidth, v => { return Err(RbValueError::new_err(format!( "n_field_strategy must be one of {{'first_non_null', 'max_width'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "drop" => NullBehavior::Drop, "ignore" => NullBehavior::Ignore, v => { return Err(RbValueError::new_err(format!( "null behavior must be one of {{'drop', 'ignore'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "ignore" => NullStrategy::Ignore, "propagate" => NullStrategy::Propagate, v => { return Err(RbValueError::new_err(format!( "null strategy must be one of {{'ignore', 'propagate'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "auto" => ParallelStrategy::Auto, "columns" => ParallelStrategy::Columns, "row_groups" => ParallelStrategy::RowGroups, "none" => ParallelStrategy::None, v => { return Err(RbValueError::new_err(format!( "parallel must be one of {{'auto', 'columns', 'row_groups', 'none'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "lower" => QuantileInterpolOptions::Lower, "higher" => QuantileInterpolOptions::Higher, "nearest" => QuantileInterpolOptions::Nearest, "linear" => QuantileInterpolOptions::Linear, "midpoint" => QuantileInterpolOptions::Midpoint, v => { return Err(RbValueError::new_err(format!( "interpolation must be one of {{'lower', 'higher', 'nearest', 'linear', 'midpoint'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "min" => RankMethod::Min, "max" => RankMethod::Max, "average" => RankMethod::Average, "dense" => RankMethod::Dense, "ordinal" => RankMethod::Ordinal, "random" => RankMethod::Random, v => { return Err(RbValueError::new_err(format!( "method must be one of {{'min', 'max', 'average', 'dense', 'ordinal', 'random'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "ns" => TimeUnit::Nanoseconds, "us" => TimeUnit::Microseconds, "ms" => TimeUnit::Milliseconds, v => { return Err(RbValueError::new_err(format!( "time unit must be one of {{'ns', 'us', 'ms'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } impl TryConvert for Wrap { fn try_convert(ob: Value) -> RbResult { let parsed = match ob.try_convert::()?.as_str() { "first" => UniqueKeepStrategy::First, "last" => UniqueKeepStrategy::Last, v => { return Err(RbValueError::new_err(format!( "keep must be one of {{'first', 'last'}}, got {}", v ))) } }; Ok(Wrap(parsed)) } } pub fn parse_fill_null_strategy( strategy: &str, limit: FillNullLimit, ) -> RbResult { let parsed = match strategy { "forward" => FillNullStrategy::Forward(limit), "backward" => FillNullStrategy::Backward(limit), "min" => FillNullStrategy::Min, "max" => FillNullStrategy::Max, "mean" => FillNullStrategy::Mean, "zero" => FillNullStrategy::Zero, "one" => FillNullStrategy::One, e => { return Err(magnus::Error::runtime_error(format!( "strategy must be one of {{'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}}, got {}", e, ))) } }; Ok(parsed) } pub fn parse_parquet_compression( compression: &str, compression_level: Option, ) -> RbResult { let parsed = match compression { "uncompressed" => ParquetCompression::Uncompressed, "snappy" => ParquetCompression::Snappy, "gzip" => ParquetCompression::Gzip( compression_level .map(|lvl| { GzipLevel::try_new(lvl as u8) .map_err(|e| RbValueError::new_err(format!("{:?}", e))) }) .transpose()?, ), "lzo" => ParquetCompression::Lzo, "brotli" => ParquetCompression::Brotli( compression_level .map(|lvl| { BrotliLevel::try_new(lvl as u32) .map_err(|e| RbValueError::new_err(format!("{:?}", e))) }) .transpose()?, ), "lz4" => ParquetCompression::Lz4Raw, "zstd" => ParquetCompression::Zstd( compression_level .map(|lvl| { ZstdLevel::try_new(lvl) .map_err(|e| RbValueError::new_err(format!("{:?}", e))) }) .transpose()?, ), e => { return Err(RbValueError::new_err(format!( "compression must be one of {{'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'lz4', 'zstd'}}, got {}", e ))) } }; Ok(parsed) }