lib/polars/string_expr.rb in polars-df-0.1.3 vs lib/polars/string_expr.rb in polars-df-0.1.4
- old
+ new
@@ -7,13 +7,88 @@
# @private
def initialize(expr)
# Reuse the low-level expression handle carried by the given expression.
inner = expr._rbexpr
self._rbexpr = inner
end
- # def strptime
- # end
+ # Parse a Utf8 expression to a Date/Datetime/Time type.
+ #
+ # @param datatype [Symbol]
+ # `:date`, `:datetime`, or `:time`.
+ # @param fmt [String]
+ # Format to use, refer to the
+ # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+ # for specification. Example: `"%y-%m-%d"`.
+ # @param strict [Boolean]
+ # Raise an error if any conversion fails.
+ # @param exact [Boolean]
+ # - If true, require an exact format match.
+ # - If false, allow the format to match anywhere in the target string.
+ #
+ # @return [Expr]
+ #
+ # @note
+ # When parsing a Datetime the column precision will be inferred from
+ # the format string, if given, e.g. "%F %T%.3f" => Datetime("ms"). If
+ # no fractional second component is found then the default is "us".
+ #
+ # @example
+ # s = Polars::Series.new(
+ # "date",
+ # [
+ # "2021-04-22",
+ # "2022-01-04 00:00:00",
+ # "01/31/22",
+ # "Sun Jul 8 00:34:60 2001"
+ # ]
+ # )
+ # s.to_frame.with_column(
+ # Polars.col("date")
+ # .str.strptime(:date, "%F", strict: false)
+ # .fill_null(
+ # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
+ # )
+ # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
+ # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
+ # )
+ # # =>
+ # # shape: (4, 1)
+ # # ┌────────────┐
+ # # │ date │
+ # # │ --- │
+ # # │ date │
+ # # ╞════════════╡
+ # # │ 2021-04-22 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2022-01-04 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2022-01-31 │
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2001-07-08 │
+ # # └────────────┘
+ def strptime(datatype, fmt = nil, strict: true, exact: true)
+ if !Utils.is_polars_dtype(datatype)
+ raise ArgumentError, "expected: {DataType} got: #{datatype}"
+ end
+ if datatype == :date
+ Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact))
+ elsif datatype == :datetime
+ # TODO fix
+ tu = nil # datatype.tu
+ dtcol = Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, strict, exact))
+ if tu.nil?
+ dtcol
+ else
+ dtcol.dt.cast_time_unit(tu)
+ end
+ elsif datatype == :time
+ Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact))
+ else
+ raise ArgumentError, "dtype should be of type :date, :datetime, or :time"
+ end
+ end
+
# Get length of the strings as `:u32` (as number of bytes).
#
# @return [Expr]
#
# @note
@@ -289,11 +364,11 @@
Utils.wrap_expr(_rbexpr.str_zfill(alignment))
end
# Return the string left justified in a string of length `width`.
#
- # Padding is done using the specified `fillcha``.
+ # Padding is done using the specified `fillchar`.
# The original string is returned if `width` is less than or equal to
# `s.length`.
#
# @param width [Integer]
# Justify left to this length.
@@ -322,11 +397,11 @@
# # └──────────────┘
def ljust(width, fillchar = " ")
# Build the left-justify expression first, then hand it to Utils.wrap_expr.
padded = _rbexpr.str_ljust(width, fillchar)
Utils.wrap_expr(padded)
end
- # Return the string right justified in a string of length ``width``.
+ # Return the string right justified in a string of length `width`.
#
# Padding is done using the specified `fillchar`.
# The original string is returned if `width` is less than or equal to
# `s.length`.
#
@@ -476,18 +551,119 @@
# # └────────┘
def starts_with(sub)
# Build the prefix-check expression first, then hand it to Utils.wrap_expr.
prefix_check = _rbexpr.str_starts_with(sub)
Utils.wrap_expr(prefix_check)
end
- # def json_path_match
- # end
+ # Extract the first match of a JSON string with the provided JSONPath expression.
+ #
+ # Throws an error if an invalid JSON string is encountered.
+ # All return values are cast to Utf8 regardless of the original value.
+ #
+ # Documentation on JSONPath standard can be found
+ # [here](https://goessner.net/articles/JsonPath/).
+ #
+ # @param json_path [String]
+ # A valid JSON path query string.
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
+ # )
+ # df.select(Polars.col("json_val").str.json_path_match("$.a"))
+ # # =>
+ # # shape: (5, 1)
+ # # ┌──────────┐
+ # # │ json_val │
+ # # │ --- │
+ # # │ str │
+ # # ╞══════════╡
+ # # │ 1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
+ # # │ null │
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
+ # # │ 2.1 │
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
+ # # │ true │
+ # # └──────────┘
+ def json_path_match(json_path)
+ Utils.wrap_expr(_rbexpr.str_json_path_match(json_path))
+ end
- # def decode
- # end
+ # Decode a value using the provided encoding.
+ #
+ # @param encoding ["hex", "base64"]
+ # The encoding to use.
+ # @param strict [Boolean]
+ # How to handle invalid inputs:
+ #
+ # - `true`: An error will be thrown if unable to decode a value.
+ # - `false`: Unhandled values will be replaced with `nil`.
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
+ # df.select(Polars.col("encoded").str.decode("hex"))
+ # # =>
+ # # shape: (3, 1)
+ # # ┌─────────┐
+ # # │ encoded │
+ # # │ --- │
+ # # │ str │
+ # # ╞═════════╡
+ # # │ foo │
+ # # ├╌╌╌╌╌╌╌╌╌┤
+ # # │ bar │
+ # # ├╌╌╌╌╌╌╌╌╌┤
+ # # │ null │
+ # # └─────────┘
+ def decode(encoding, strict: false)
+ if encoding == "hex"
+ Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
+ elsif encoding == "base64"
+ Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
+ else
+ raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
+ end
+ end
- # def encode
- # end
+ # Encode a value using the provided encoding.
+ #
+ # @param encoding ["hex", "base64"]
+ # The encoding to use.
+ #
+ # @return [Expr]
+ #
+ # @example
+ # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
+ # df.select(Polars.col("strings").str.encode("hex"))
+ # # =>
+ # # shape: (3, 1)
+ # # ┌─────────┐
+ # # │ strings │
+ # # │ --- │
+ # # │ str │
+ # # ╞═════════╡
+ # # │ 666f6f │
+ # # ├╌╌╌╌╌╌╌╌╌┤
+ # # │ 626172 │
+ # # ├╌╌╌╌╌╌╌╌╌┤
+ # # │ null │
+ # # └─────────┘
+ def encode(encoding)
+ if encoding == "hex"
+ Utils.wrap_expr(_rbexpr.str_hex_encode)
+ elsif encoding == "base64"
+ Utils.wrap_expr(_rbexpr.str_base64_encode)
+ else
+ raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
+ end
+ end
# Extract the target capture group from provided patterns.
#
# @param pattern [String]
# A valid regex pattern
@@ -657,13 +833,13 @@
else
Utils.wrap_expr(_rbexpr.str_split_exact(by, n))
end
end
- # Split the string by a substring, restricted to returning at most ``n`` items.
+ # Split the string by a substring, restricted to returning at most `n` items.
#
- # If the number of possible splits is less than ``n-1``, the remaining field
- # elements will be null. If the number of possible splits is ``n-1`` or greater,
+ # If the number of possible splits is less than `n-1`, the remaining field
+ # elements will be null. If the number of possible splits is `n-1` or greater,
# the last (nth) substring will contain the remainder of the string.
#
# @param by [String]
# Substring to split by.
# @param n [Integer]