lib/polars/io.rb in polars-df-0.1.2 vs lib/polars/io.rb in polars-df-0.1.3
- old
+ new
@@ -1,7 +1,95 @@
module Polars
module IO
+ # Read a CSV file into a DataFrame.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting
+ # at zero) or a list of column names.
+ # @param new_columns [Object]
+ # Rename columns right after parsing the CSV file. If the given
+ # list is shorter than the width of the DataFrame the remaining
+ # columns will have their original name.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ # Number of threads to use in csv parsing.
+ # Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:utf8`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ # Number of lines to read into the buffer at once.
+ # Modify this to change performance.
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # During multi-threaded parsing, an upper bound of `n_rows`
+ # rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters. When using other encodings than `utf8` or
+ # `utf8-lossy`, the input is first decoded in memory with
+ # Ruby.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at expense of performance.
+ # @param rechunk [Boolean]
+ # Make sure that all columns are contiguous in memory by
+ # aggregating the chunks into a single array.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a
+ # particular storage connection.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ # Set the sample size. This is used to sample statistics to estimate the
+ # allocation needed.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This operation defaults to a `rechunk` operation at the end, meaning that
+ # all data will be stored contiguously in memory.
+ # Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
+ # an expensive operation.
def read_csv(
file,
has_header: true,
columns: nil,
new_columns: nil,
@@ -82,10 +170,79 @@
else
df
end
end
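For reference, a minimal usage sketch of the newly documented reader (the file path and the null-value list are illustrative, not taken from the gem; every keyword used is documented above):

    # Hypothetical CSV path; only documented keyword arguments are used.
    df = Polars.read_csv(
      "data/users.csv",
      sep: ",",
      has_header: true,
      null_values: ["", "NA"],
      parse_dates: true,
      n_rows: 1_000
    )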
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
+ #
+ # @param file [Object]
+ # Path to a file.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines. The header will be parsed at this
+ # offset.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param with_column_names [Object]
+ # Apply a function over the column names.
+ # This can be used to update a schema just in time, thus before
+ # scanning.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ #
+ # @return [LazyFrame]
def scan_csv(
file,
has_header: true,
sep: ",",
comment_char: nil,
@@ -138,10 +295,36 @@
parse_dates: parse_dates,
eol_char: eol_char,
)
end
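A hedged sketch of the lazy pattern the docs describe (the glob pattern and column names are hypothetical, and the `filter`/`select`/`collect` calls assume the usual polars-df lazy API):

    # Illustrative glob; predicates and projections are pushed down to the scan.
    lf = Polars.scan_csv("data/events-*.csv", parse_dates: true)
    df = lf.filter(Polars.col("status") == "ok").select(["id", "ts"]).collect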
+ # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ # Path to an IPC file.
+ # @param n_rows [Integer]
+ # Stop reading from IPC file after reading `n_rows`.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param memory_map [Boolean]
+ # Try to memory map the file. This can greatly improve performance on repeated
+ # queries as the OS may cache pages.
+ # Only uncompressed IPC files can be memory mapped.
+ #
+ # @return [LazyFrame]
def scan_ipc(
file,
n_rows: nil,
cache: true,
rechunk: true,
@@ -160,10 +343,38 @@
storage_options: storage_options,
memory_map: memory_map
)
end
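A brief illustrative sketch (hypothetical path; per the note above, `memory_map` only helps for uncompressed IPC files):

    lf = Polars.scan_ipc("data/table.arrow", memory_map: true, rechunk: false)
    df = lf.collect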
+ # Lazily read from a parquet file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ # Path to a file.
+ # @param n_rows [Integer]
+ # Stop reading from parquet file after reading `n_rows`.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ # This determines the direction of parallelism. 'auto' will try to determine the
+ # optimal direction.
+ # @param rechunk [Boolean]
+ # In case of reading multiple files via a glob pattern, rechunk the final DataFrame
+ # into contiguous memory chunks.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ #
+ # @return [LazyFrame]
def scan_parquet(
file,
n_rows: nil,
cache: true,
parallel: "auto",
@@ -188,10 +399,34 @@
storage_options: storage_options,
low_memory: low_memory
)
end
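An illustrative sketch with a hypothetical glob pattern, using only the documented keywords:

    lf = Polars.scan_parquet("data/part-*.parquet", parallel: "auto", low_memory: false)
    df = lf.collect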
+ # Lazily read from a newline delimited JSON file.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ # Path to a file.
+ # @param infer_schema_length [Integer]
+ # Infer the schema length from the first `infer_schema_length` rows.
+ # @param batch_size [Integer]
+ # Number of rows to read in each batch.
+ # @param n_rows [Integer]
+ # Stop reading from JSON file after reading `n_rows`.
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ #
+ # @return [LazyFrame]
def scan_ndjson(
file,
infer_schema_length: 100,
batch_size: 1024,
n_rows: nil,
@@ -217,10 +452,34 @@
end
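An illustrative sketch with a hypothetical path:

    lf = Polars.scan_ndjson("data/logs.ndjson", infer_schema_length: 200, batch_size: 2048)
    df = lf.collect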
# def read_avro
# end
+ # Read into a DataFrame from Arrow IPC (Feather v2) file.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
+ # of column names.
+ # @param n_rows [Integer]
+ # Stop reading from IPC file after reading `n_rows`.
+ # @param memory_map [Boolean]
+ # Try to memory map the file. This can greatly improve performance on repeated
+ # queries as the OS may cache pages.
+ # Only uncompressed IPC files can be memory mapped.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param rechunk [Boolean]
+ # Make sure that all data is contiguous.
+ #
+ # @return [DataFrame]
def read_ipc(
file,
columns: nil,
n_rows: nil,
memory_map: true,
@@ -241,30 +500,178 @@
memory_map: memory_map
)
end
end
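An illustrative sketch (hypothetical path and column names):

    df = Polars.read_ipc("data/table.arrow", columns: ["id", "name"], memory_map: true)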
- def read_parquet(file)
+ # Read into a DataFrame from a parquet file.
+ #
+ # @param file [Object]
+ # Path to a file, or a file-like object.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
+ # of column names.
+ # @param n_rows [Integer]
+ # Stop reading from parquet file after reading `n_rows`.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ # This determines the direction of parallelism. 'auto' will try to determine the
+ # optimal direction.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This operation defaults to a `rechunk` operation at the end, meaning that
+ # all data will be stored contiguously in memory.
+ # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
+ # an expensive operation.
+ def read_parquet(
+ file,
+ columns: nil,
+ n_rows: nil,
+ storage_options: nil,
+ parallel: "auto",
+ row_count_name: nil,
+ row_count_offset: 0,
+ low_memory: false
+ )
_prepare_file_arg(file) do |data|
- DataFrame._read_parquet(data)
+ DataFrame._read_parquet(
+ data,
+ columns: columns,
+ n_rows: n_rows,
+ parallel: parallel,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ low_memory: low_memory
+ )
end
end
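An illustrative sketch of the new keyword arguments (hypothetical path and column names):

    df = Polars.read_parquet(
      "data/table.parquet",
      columns: ["id", "value"],
      row_count_name: "row_nr",   # adds a row count column, as documented above
      row_count_offset: 0
    )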
+ # Read into a DataFrame from a JSON file.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
def read_json(file)
DataFrame._read_json(file)
end
+ # Read into a DataFrame from a newline delimited JSON file.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
def read_ndjson(file)
DataFrame._read_ndjson(file)
end
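Illustrative sketches with hypothetical paths (per the docs above, `read_ndjson` expects newline delimited JSON):

    df = Polars.read_json("data/records.json")
    df = Polars.read_ndjson("data/records.ndjson")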
# def read_sql
# end
# def read_excel
# end
+ # Read a CSV file in batches.
+ #
+ # Upon creation of the `BatchedCsvReader`,
+ # Polars will gather statistics and determine the
+ # file chunks. After that, work will only be done
+ # if `next_batches` is called.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting
+ # at zero) or a list of column names.
+ # @param new_columns [Object]
+ # Rename columns right after parsing the CSV file. If the given
+ # list is shorter than the width of the DataFrame the remaining
+ # columns will have their original name.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting, default = `"`.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ # Number of threads to use in csv parsing.
+ # Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ # Number of lines to read into the buffer at once.
+ # Modify this to change performance.
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # During multi-threaded parsing, an upper bound of `n_rows`
+ # rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters. When using other encodings than `utf8` or
+ # `utf8-lossy`, the input is first decoded in memory with
+ # Ruby. Defaults to `utf8`.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at expense of performance.
+ # @param rechunk [Boolean]
+ # Make sure that all columns are contiguous in memory by
+ # aggregating the chunks into a single array.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ # Set the sample size. This is used to sample statistics to estimate the
+ # allocation needed.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ #
+ # @return [BatchedCsvReader]
+ #
+ # @example
+ # reader = Polars.read_csv_batched(
+ # "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+ # )
+ # reader.next_batches(5)
def read_csv_batched(
file,
has_header: true,
columns: nil,
new_columns: nil,
@@ -329,17 +736,29 @@
eol_char: eol_char,
new_columns: new_columns
)
end
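A hedged consumption sketch building on the `@example` in the docs above; the path is hypothetical, and it assumes `next_batches` returns nil once the file is exhausted, which the docs here do not state explicitly:

    reader = Polars.read_csv_batched("data/big.csv", sep: "|", batch_size: 50_000)
    while (batches = reader.next_batches(5))
      batches.each { |df| puts df.shape.inspect }  # placeholder for real per-batch work
    end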
+ # Get a schema of the IPC file without reading data.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [Hash]
def read_ipc_schema(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
_ipc_schema(file)
end
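An illustrative sketch with a hypothetical path; `read_parquet_schema` below is used the same way for Parquet files:

    schema = Polars.read_ipc_schema("data/table.arrow")  # Hash of column name to dtype
    schema.each { |name, dtype| puts "#{name}: #{dtype}" }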
+ # Get a schema of the Parquet file without reading data.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [Hash]
def read_parquet_schema(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end