lib/polars/string_expr.rb in polars-df-0.1.3 vs lib/polars/string_expr.rb in polars-df-0.1.4

- old
+ new

@@ -7,13 +7,88 @@ # @private def initialize(expr) self._rbexpr = expr._rbexpr end - # def strptime - # end + # Parse a Utf8 expression to a Date/Datetime/Time type. + # + # @param datatype [Symbol] + # `:date`, `:datetime`, or `:time`. + # @param fmt [String] + # Format to use, refer to the + # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) + # for specification. Example: `"%y-%m-%d"`. + # @param strict [Boolean] + # Raise an error if any conversion fails. + # @param exact [Boolean] + # - If true, require an exact format match. + # - If false, allow the format to match anywhere in the target string. + # + # @return [Expr] + # + # @note + # When parsing a Datetime the column precision will be inferred from + # the format string, if given, e.g.: "%F %T%.3f" => Datetime("ms"). If + # no fractional second component is found then the default is "us". + # + # @example + # s = Polars::Series.new( + # "date", + # [ + # "2021-04-22", + # "2022-01-04 00:00:00", + # "01/31/22", + # "Sun Jul 8 00:34:60 2001" + # ] + # ) + # s.to_frame.with_column( + # Polars.col("date") + # .str.strptime(:date, "%F", strict: false) + # .fill_null( + # Polars.col("date").str.strptime(:date, "%F %T", strict: false) + # ) + # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false)) + # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false)) + # ) + # # => + # # shape: (4, 1) + # # ┌────────────┐ + # # │ date │ + # # │ --- │ + # # │ date │ + # # ╞════════════╡ + # # │ 2021-04-22 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2022-01-04 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2022-01-31 │ + # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2001-07-08 │ + # # └────────────┘ + def strptime(datatype, fmt = nil, strict: true, exact: true) + if !Utils.is_polars_dtype(datatype) + raise ArgumentError, "expected: {DataType} got: #{datatype}" + end + if datatype == :date + Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact)) + elsif datatype == :datetime + # TODO fix: tu is hard-coded to nil below, so cast_time_unit is never reached + 
tu = nil # datatype.tu + dtcol = Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, strict, exact)) + if tu.nil? + dtcol + else + dtcol.dt.cast_time_unit(tu) + end + elsif datatype == :time + Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact)) + else + raise ArgumentError, "dtype should be of type :date, :datetime, or :time" + end + end + # Get length of the strings as `:u32` (as number of bytes). # # @return [Expr] # # @note @@ -289,11 +364,11 @@ Utils.wrap_expr(_rbexpr.str_zfill(alignment)) end # Return the string left justified in a string of length `width`. # - # Padding is done using the specified `fillcha``. + # Padding is done using the specified `fillchar`. # The original string is returned if `width` is less than or equal to # `s.length`. # # @param width [Integer] # Justify left to this length. @@ -322,11 +397,11 @@ # # └──────────────┘ def ljust(width, fillchar = " ") Utils.wrap_expr(_rbexpr.str_ljust(width, fillchar)) end - # Return the string right justified in a string of length ``width``. + # Return the string right justified in a string of length `width`. # # Padding is done using the specified `fillchar`. # The original string is returned if `width` is less than or equal to # `s.length`. # @@ -476,18 +551,119 @@ # # └────────┘ def starts_with(sub) Utils.wrap_expr(_rbexpr.str_starts_with(sub)) end - # def json_path_match - # end + # Extract the first match from a JSON string using the provided JSONPath expression. + # + # Raises an error when an invalid JSON string is encountered. + # All returned values are cast to Utf8 regardless of their original type. + # + # Documentation on the JSONPath standard can be found + # [here](https://goessner.net/articles/JsonPath/). + # + # @param json_path [String] + # A valid JSON path query string. 
+ # + # @return [Expr] + # + # @example + # df = Polars::DataFrame.new( + # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']} + # ) + # df.select(Polars.col("json_val").str.json_path_match("$.a")) + # # => + # # shape: (5, 1) + # # ┌──────────┐ + # # │ json_val │ + # # │ --- │ + # # │ str │ + # # ╞══════════╡ + # # │ 1 │ + # # ├╌╌╌╌╌╌╌╌╌╌┤ + # # │ null │ + # # ├╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2 │ + # # ├╌╌╌╌╌╌╌╌╌╌┤ + # # │ 2.1 │ + # # ├╌╌╌╌╌╌╌╌╌╌┤ + # # │ true │ + # # └──────────┘ + def json_path_match(json_path) + Utils.wrap_expr(_rbexpr.str_json_path_match(json_path)) + end - # def decode - # end + # Decode a value using the provided encoding. + # + # @param encoding ["hex", "base64"] + # The encoding to use. + # @param strict [Boolean] + # How to handle invalid inputs: + # + # - `true`: An error will be thrown if unable to decode a value. + # - `false`: Unhandled values will be replaced with `nil`. + # + # @return [Expr] + # + # @example + # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]}) + # df.select(Polars.col("encoded").str.decode("hex")) + # # => + # # shape: (3, 1) + # # ┌─────────┐ + # # │ encoded │ + # # │ --- │ + # # │ str │ + # # ╞═════════╡ + # # │ foo │ + # # ├╌╌╌╌╌╌╌╌╌┤ + # # │ bar │ + # # ├╌╌╌╌╌╌╌╌╌┤ + # # │ null │ + # # └─────────┘ + def decode(encoding, strict: false) + if encoding == "hex" + Utils.wrap_expr(_rbexpr.str_hex_decode(strict)) + elsif encoding == "base64" + Utils.wrap_expr(_rbexpr.str_base64_decode(strict)) + else + raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}" + end + end - # def encode - # end + # Encode a value using the provided encoding. + # + # @param encoding ["hex", "base64"] + # The encoding to use. 
+ # + # @return [Expr] + # + # @example + # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]}) + # df.select(Polars.col("strings").str.encode("hex")) + # # => + # # shape: (3, 1) + # # ┌─────────┐ + # # │ strings │ + # # │ --- │ + # # │ str │ + # # ╞═════════╡ + # # │ 666f6f │ + # # ├╌╌╌╌╌╌╌╌╌┤ + # # │ 626172 │ + # # ├╌╌╌╌╌╌╌╌╌┤ + # # │ null │ + # # └─────────┘ + def encode(encoding) + if encoding == "hex" + Utils.wrap_expr(_rbexpr.str_hex_encode) + elsif encoding == "base64" + Utils.wrap_expr(_rbexpr.str_base64_encode) + else + raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}" + end + end # Extract the target capture group from provided patterns. # # @param pattern [String] # A valid regex pattern @@ -657,13 +833,13 @@ else Utils.wrap_expr(_rbexpr.str_split_exact(by, n)) end end - # Split the string by a substring, restricted to returning at most ``n`` items. + # Split the string by a substring, restricted to returning at most `n` items. # - # If the number of possible splits is less than ``n-1``, the remaining field - # elements will be null. If the number of possible splits is ``n-1`` or greater, + # If the number of possible splits is less than `n-1`, the remaining field + # elements will be null. If the number of possible splits is `n-1` or greater, # the last (nth) substring will contain the remainder of the string. # # @param by [String] # Substring to split by. # @param n [Integer]