lib/polars/io.rb in polars-df-0.1.2 vs lib/polars/io.rb in polars-df-0.1.3

- old
+ new

@@ -1,7 +1,95 @@
module Polars
module IO
+ # Read a CSV file into a DataFrame.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ #   Indicate if the first row of the dataset is a header or not.
+ #   If set to false, column names will be autogenerated in the
+ #   following format: `column_x`, with `x` being an
+ #   enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ #   Columns to select. Accepts a list of column indices (starting
+ #   at zero) or a list of column names.
+ # @param new_columns [Object]
+ #   Rename columns right after parsing the CSV file. If the given
+ #   list is shorter than the width of the DataFrame, the remaining
+ #   columns will have their original name.
+ # @param sep [String]
+ #   Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ #   Single byte character that indicates the start of a comment line,
+ #   for instance `#`.
+ # @param quote_char [String]
+ #   Single byte character used for CSV quoting.
+ #   Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ #   Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ #   Overwrite dtypes during inference.
+ # @param null_values [Object]
+ #   Values to interpret as null values. You can provide a:
+ #
+ #   - `String`: All values equal to this string will be null.
+ #   - `Array`: All values equal to any string in this array will be null.
+ #   - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ #   Try to keep reading lines if some lines yield errors.
+ #   First try `infer_schema_length: 0` to read all columns as
+ #   `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ #   Try to automatically parse dates. If this does not succeed,
+ #   the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ #   Number of threads to use in CSV parsing.
+ #   Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ #   Maximum number of lines to read to infer schema.
+ #   If set to 0, all columns will be read as `:utf8`.
+ #   If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ #   Number of lines to read into the buffer at once.
+ #   Modify this to change performance.
+ # @param n_rows [Integer]
+ #   Stop reading from the CSV file after reading `n_rows`.
+ #   During multi-threaded parsing, an upper bound of `n_rows`
+ #   rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ #   Lossy means that invalid utf8 values are replaced with `�`
+ #   characters. When using other encodings than `utf8` or
+ #   `utf8-lossy`, the input is first decoded in memory with
+ #   Ruby.
+ # @param low_memory [Boolean]
+ #   Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ #   Make sure that all columns are contiguous in memory by
+ #   aggregating the chunks into a single array.
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a
+ #   particular storage connection.
+ # @param skip_rows_after_header [Integer]
+ #   Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into
+ #   the DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ #   Set the sample size. This is used to sample statistics to estimate the
+ #   allocation needed.
+ # @param eol_char [String]
+ #   Single byte end of line character.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ #   This operation defaults to a `rechunk` operation at the end, meaning that
+ #   all data will be stored contiguously in memory.
+ #   Set `rechunk: false` if you are benchmarking the CSV reader. A `rechunk` is
+ #   an expensive operation.
def read_csv(
file,
has_header: true,
columns: nil,
new_columns: nil,
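
The options above map directly onto keyword arguments. A minimal usage sketch, assuming the gem is loaded; the file name, separator, and null marker are illustrative, not taken from the diff:

    # Read a semicolon-delimited file, treating "NA" as null and parsing dates.
    df = Polars.read_csv(
      "measurements.csv",
      sep: ";",
      null_values: ["NA"],
      parse_dates: true,
      n_rows: 10_000
    )
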
@@ -82,10 +170,79 @@
else
df
end
end
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
+ #
+ # @param file [Object]
+ #   Path to a file.
+ # @param has_header [Boolean]
+ #   Indicate if the first row of the dataset is a header or not.
+ #   If set to false, column names will be autogenerated in the
+ #   following format: `column_x`, with `x` being an
+ #   enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ #   Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ #   Single byte character that indicates the start of a comment line,
+ #   for instance `#`.
+ # @param quote_char [String]
+ #   Single byte character used for CSV quoting.
+ #   Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ #   Start reading after `skip_rows` lines. The header will be parsed at this
+ #   offset.
+ # @param dtypes [Object]
+ #   Overwrite dtypes during inference.
+ # @param null_values [Object]
+ #   Values to interpret as null values. You can provide a:
+ #
+ #   - `String`: All values equal to this string will be null.
+ #   - `Array`: All values equal to any string in this array will be null.
+ #   - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ #   Try to keep reading lines if some lines yield errors.
+ #   First try `infer_schema_length: 0` to read all columns as
+ #   `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ #   Cache the result after reading.
+ # @param with_column_names [Object]
+ #   Apply a function over the column names.
+ #   This can be used to update a schema just in time, thus before
+ #   scanning.
+ # @param infer_schema_length [Integer]
+ #   Maximum number of lines to read to infer schema.
+ #   If set to 0, all columns will be read as `:str`.
+ #   If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ #   Stop reading from the CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ #   Lossy means that invalid utf8 values are replaced with `�`
+ #   characters.
+ # @param low_memory [Boolean]
+ #   Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ #   Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param skip_rows_after_header [Integer]
+ #   Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into
+ #   the DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ #   Try to automatically parse dates. If this does not succeed,
+ #   the column remains of data type `:str`.
+ # @param eol_char [String]
+ #   Single byte end of line character.
+ #
+ # @return [LazyFrame]
def scan_csv(
file,
has_header: true,
sep: ",",
comment_char: nil,
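
For contrast with the eager reader, a hedged sketch of how the lazy scan is typically used; the path is made up, and `select`/`collect` are assumed from the wider `LazyFrame` API rather than documented in this file:

    lf = Polars.scan_csv("events.csv", sep: "\t", skip_rows_after_header: 1)
    # Only the projected column needs to be read when the plan is collected.
    df = lf.select(["status"]).collect
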
@@ -138,10 +295,36 @@
parse_dates: parse_dates,
eol_char: eol_char,
)
end
+ # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ #   Path to an IPC file.
+ # @param n_rows [Integer]
+ #   Stop reading from the IPC file after reading `n_rows`.
+ # @param cache [Boolean]
+ #   Cache the result after reading.
+ # @param rechunk [Boolean]
+ #   Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a particular storage connection.
+ # @param memory_map [Boolean]
+ #   Try to memory map the file. This can greatly improve performance on repeated
+ #   queries as the OS may cache pages.
+ #   Only uncompressed IPC files can be memory mapped.
+ #
+ # @return [LazyFrame]
def scan_ipc(
file,
n_rows: nil,
cache: true,
rechunk: true,
@@ -160,10 +343,38 @@
storage_options: storage_options,
memory_map: memory_map
)
end
+ # Lazily read from a Parquet file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ #   Path to a file.
+ # @param n_rows [Integer]
+ #   Stop reading from the Parquet file after reading `n_rows`.
+ # @param cache [Boolean]
+ #   Cache the result after reading.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ #   This determines the direction of parallelism. 'auto' will try to determine the
+ #   optimal direction.
+ # @param rechunk [Boolean]
+ #   In case of reading multiple files via a glob pattern, rechunk the final DataFrame
+ #   into contiguous memory chunks.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a particular storage connection.
+ # @param low_memory [Boolean]
+ #   Reduce memory pressure at the expense of performance.
+ #
+ # @return [LazyFrame]
def scan_parquet(
file,
n_rows: nil,
cache: true,
parallel: "auto",
@@ -188,10 +399,34 @@
storage_options: storage_options,
low_memory: low_memory
)
end
+ # Lazily read from a newline delimited JSON file.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ #   Path to a file.
+ # @param infer_schema_length [Integer]
+ #   Infer the schema length from the first `infer_schema_length` rows.
+ # @param batch_size [Integer]
+ #   Number of rows to read in each batch.
+ # @param n_rows [Integer]
+ #   Stop reading from the JSON file after reading `n_rows`.
+ # @param low_memory [Boolean]
+ #   Reduce memory pressure at the expense of performance.
+ # @param rechunk [Boolean]
+ #   Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ #
+ # @return [LazyFrame]
def scan_ndjson(
file,
infer_schema_length: 100,
batch_size: 1024,
n_rows: nil,
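
The three scan methods share the same lazy pattern; here is a brief sketch for `scan_parquet` with a glob. The path is a placeholder, and the `filter`/`col`/`collect` calls are assumptions about the surrounding Polars API, not part of this diff:

    lf = Polars.scan_parquet("data/*.parquet", parallel: "auto", row_count_name: "row_nr")
    # The predicate can be pushed down to the scan before anything is materialized.
    df = lf.filter(Polars.col("row_nr") < 100).collect
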
@@ -217,10 +452,34 @@
end
# def read_avro
# end
+ # Read into a DataFrame from an Arrow IPC (Feather v2) file.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ # @param columns [Object]
+ #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+ #   of column names.
+ # @param n_rows [Integer]
+ #   Stop reading from the IPC file after reading `n_rows`.
+ # @param memory_map [Boolean]
+ #   Try to memory map the file. This can greatly improve performance on repeated
+ #   queries as the OS may cache pages.
+ #   Only uncompressed IPC files can be memory mapped.
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a particular storage connection.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param rechunk [Boolean]
+ #   Make sure that all data is contiguous.
+ #
+ # @return [DataFrame]
def read_ipc(
file,
columns: nil,
n_rows: nil,
memory_map: true,
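
A short, hedged example of the eager IPC reader with column projection; the file name and column names are placeholders:

    df = Polars.read_ipc(
      "table.arrow",
      columns: ["id", "value"],
      memory_map: true  # only effective for uncompressed IPC files, per the note above
    )
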
@@ -241,30 +500,178 @@
memory_map: memory_map
)
end
end
- def read_parquet(file)
+ # Read into a DataFrame from a Parquet file.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ # @param columns [Object]
+ #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+ #   of column names.
+ # @param n_rows [Integer]
+ #   Stop reading from the Parquet file after reading `n_rows`.
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a particular storage connection.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ #   This determines the direction of parallelism. 'auto' will try to determine the
+ #   optimal direction.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param low_memory [Boolean]
+ #   Reduce memory pressure at the expense of performance.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ #   This operation defaults to a `rechunk` operation at the end, meaning that
+ #   all data will be stored contiguously in memory.
+ #   Set `rechunk: false` if you are benchmarking the Parquet reader. A `rechunk` is
+ #   an expensive operation.
+ def read_parquet(
+   file,
+   columns: nil,
+   n_rows: nil,
+   storage_options: nil,
+   parallel: "auto",
+   row_count_name: nil,
+   row_count_offset: 0,
+   low_memory: false
+ )
_prepare_file_arg(file) do |data|
- DataFrame._read_parquet(data)
+ DataFrame._read_parquet(
+   data,
+   columns: columns,
+   n_rows: n_rows,
+   parallel: parallel,
+   row_count_name: row_count_name,
+   row_count_offset: row_count_offset,
+   low_memory: low_memory
+ )
end
end
+ # Read into a DataFrame from a JSON file.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
def read_json(file)
DataFrame._read_json(file)
end
+ # Read into a DataFrame from a newline delimited JSON file.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
def read_ndjson(file)
DataFrame._read_ndjson(file)
end
# def read_sql
# end
# def read_excel
# end
+ # Read a CSV file in batches.
+ #
+ # Upon creation of the `BatchedCsvReader`,
+ # Polars will gather statistics and determine the
+ # file chunks. After that, work will only be done
+ # if `next_batches` is called.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ #   Indicate if the first row of the dataset is a header or not.
+ #   If set to false, column names will be autogenerated in the
+ #   following format: `column_x`, with `x` being an
+ #   enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ #   Columns to select. Accepts a list of column indices (starting
+ #   at zero) or a list of column names.
+ # @param new_columns [Object]
+ #   Rename columns right after parsing the CSV file. If the given
+ #   list is shorter than the width of the DataFrame, the remaining
+ #   columns will have their original name.
+ # @param sep [String]
+ #   Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ #   Single byte character that indicates the start of a comment line,
+ #   for instance `#`.
+ # @param quote_char [String]
+ #   Single byte character used for CSV quoting, default = `"`.
+ #   Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ #   Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ #   Overwrite dtypes during inference.
+ # @param null_values [Object]
+ #   Values to interpret as null values. You can provide a:
+ #
+ #   - `String`: All values equal to this string will be null.
+ #   - `Array`: All values equal to any string in this array will be null.
+ #   - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ #   Try to keep reading lines if some lines yield errors.
+ #   First try `infer_schema_length: 0` to read all columns as
+ #   `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ #   Try to automatically parse dates. If this does not succeed,
+ #   the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ #   Number of threads to use in CSV parsing.
+ #   Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ #   Maximum number of lines to read to infer schema.
+ #   If set to 0, all columns will be read as `:str`.
+ #   If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ #   Number of lines to read into the buffer at once.
+ #   Modify this to change performance.
+ # @param n_rows [Integer]
+ #   Stop reading from the CSV file after reading `n_rows`.
+ #   During multi-threaded parsing, an upper bound of `n_rows`
+ #   rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ #   Lossy means that invalid utf8 values are replaced with `�`
+ #   characters. When using other encodings than `utf8` or
+ #   `utf8-lossy`, the input is first decoded in memory with
+ #   Ruby. Defaults to `utf8`.
+ # @param low_memory [Boolean]
+ #   Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ #   Make sure that all columns are contiguous in memory by
+ #   aggregating the chunks into a single array.
+ # @param skip_rows_after_header [Integer]
+ #   Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into
+ #   the DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ #   Set the sample size. This is used to sample statistics to estimate the
+ #   allocation needed.
+ # @param eol_char [String]
+ #   Single byte end of line character.
+ #
+ # @return [BatchedCsvReader]
+ #
+ # @example
+ #   reader = Polars.read_csv_batched(
+ #     "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+ #   )
+ #   reader.next_batches(5)
def read_csv_batched(
file,
has_header: true,
columns: nil,
new_columns: nil,
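
Since `read_parquet` now accepts the same projection and row-count options as the scan variant, here is a hedged sketch of the new signature in use; the path and column names are invented for illustration:

    df = Polars.read_parquet(
      "sales.parquet",
      columns: ["region", "amount"],
      n_rows: 50_000,
      row_count_name: "idx"
    )
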
@@ -329,17 +736,29 @@
eol_char: eol_char,
new_columns: new_columns
)
end
+ # Get a schema of the IPC file without reading data.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ #
+ # @return [Hash]
def read_ipc_schema(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
_ipc_schema(file)
end
+ # Get a schema of the Parquet file without reading data.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ #
+ # @return [Hash]
def read_parquet_schema(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
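
The schema helpers are handy for inspecting a file before deciding what to read; a small sketch, assuming a placeholder file name and an unwanted column:

    # Returns a hash mapping column names to dtypes without reading the data.
    schema = Polars.read_parquet_schema("sales.parquet")
    wanted = schema.keys.reject { |name| name == "internal_id" }
    df = Polars.read_parquet("sales.parquet", columns: wanted)
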