lib/polars/io.rb in polars-df-0.1.2 vs lib/polars/io.rb in polars-df-0.1.3

- old
+ new

@@ -1,7 +1,95 @@
module Polars
module IO
+ # Read a CSV file into a DataFrame.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ #   Indicate if the first row of the dataset is a header or not.
+ #   If set to false, column names will be autogenerated in the
+ #   following format: `column_x`, with `x` being an
+ #   enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ #   Columns to select. Accepts a list of column indices (starting
+ #   at zero) or a list of column names.
+ # @param new_columns [Object]
+ #   Rename columns right after parsing the CSV file. If the given
+ #   list is shorter than the width of the DataFrame, the remaining
+ #   columns will have their original name.
+ # @param sep [String]
+ #   Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ #   Single byte character that indicates the start of a comment line,
+ #   for instance `#`.
+ # @param quote_char [String]
+ #   Single byte character used for CSV quoting.
+ #   Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ #   Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ #   Overwrite dtypes during inference.
+ # @param null_values [Object]
+ #   Values to interpret as null values. You can provide a:
+ #
+ #   - `String`: All values equal to this string will be null.
+ #   - `Array`: All values equal to any string in this array will be null.
+ #   - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ #   Try to keep reading lines if some lines yield errors.
+ #   First try `infer_schema_length: 0` to read all columns as
+ #   `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ #   Try to automatically parse dates. If this does not succeed,
+ #   the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ #   Number of threads to use in CSV parsing.
+ #   Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ #   Maximum number of lines to read to infer schema.
+ #   If set to 0, all columns will be read as `:utf8`.
+ #   If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ #   Number of lines to read into the buffer at once.
+ #   Modify this to change performance.
+ # @param n_rows [Integer]
+ #   Stop reading from the CSV file after reading `n_rows`.
+ #   During multi-threaded parsing, an upper bound of `n_rows`
+ #   rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ #   Lossy means that invalid utf8 values are replaced with `�`
+ #   characters. When using other encodings than `utf8` or
+ #   `utf8-lossy`, the input is first decoded in memory with
+ #   Ruby.
+ # @param low_memory [Boolean]
+ #   Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ #   Make sure that all columns are contiguous in memory by
+ #   aggregating the chunks into a single array.
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a
+ #   particular storage connection.
+ # @param skip_rows_after_header [Integer]
+ #   Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into
+ #   the DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ #   Set the sample size. This is used to sample statistics to estimate the
+ #   allocation needed.
+ # @param eol_char [String]
+ #   Single byte end of line character.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ #   This operation defaults to a `rechunk` operation at the end, meaning that
+ #   all data will be stored contiguously in memory.
+ #   Set `rechunk: false` if you are benchmarking the CSV reader. A `rechunk` is
+ #   an expensive operation.
def read_csv(
file,
has_header: true,
columns: nil,
new_columns: nil,
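
The options above map directly onto keyword arguments. A minimal usage sketch, assuming the gem is loaded; the file name, separator, and null marker are illustrative, not taken from the diff:

    # Read a semicolon-delimited file, treating "NA" as null and parsing dates.
    df = Polars.read_csv(
      "measurements.csv",
      sep: ";",
      null_values: ["NA"],
      parse_dates: true,
      n_rows: 10_000
    )
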
@@ -82,10 +170,79 @@
else
df
end
end
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
+ #
+ # @param file [Object]
+ #   Path to a file.
+ # @param has_header [Boolean]
+ #   Indicate if the first row of the dataset is a header or not.
+ #   If set to false, column names will be autogenerated in the
+ #   following format: `column_x`, with `x` being an
+ #   enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ #   Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ #   Single byte character that indicates the start of a comment line,
+ #   for instance `#`.
+ # @param quote_char [String]
+ #   Single byte character used for CSV quoting.
+ #   Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ #   Start reading after `skip_rows` lines. The header will be parsed at this
+ #   offset.
+ # @param dtypes [Object]
+ #   Overwrite dtypes during inference.
+ # @param null_values [Object]
+ #   Values to interpret as null values. You can provide a:
+ #
+ #   - `String`: All values equal to this string will be null.
+ #   - `Array`: All values equal to any string in this array will be null.
+ #   - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ #   Try to keep reading lines if some lines yield errors.
+ #   First try `infer_schema_length: 0` to read all columns as
+ #   `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ #   Cache the result after reading.
+ # @param with_column_names [Object]
+ #   Apply a function over the column names.
+ #   This can be used to update a schema just in time, thus before
+ #   scanning.
+ # @param infer_schema_length [Integer]
+ #   Maximum number of lines to read to infer schema.
+ #   If set to 0, all columns will be read as `:str`.
+ #   If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ #   Stop reading from the CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ #   Lossy means that invalid utf8 values are replaced with `�`
+ #   characters.
+ # @param low_memory [Boolean]
+ #   Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ #   Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param skip_rows_after_header [Integer]
+ #   Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into
+ #   the DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ #   Try to automatically parse dates. If this does not succeed,
+ #   the column remains of data type `:str`.
+ # @param eol_char [String]
+ #   Single byte end of line character.
+ #
+ # @return [LazyFrame]
def scan_csv(
file,
has_header: true,
sep: ",",
comment_char: nil,
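
For contrast with the eager reader, a hedged sketch of how the lazy scan is typically used; the path is made up, and `select`/`collect` are assumed from the wider `LazyFrame` API rather than documented in this file:

    lf = Polars.scan_csv("events.csv", sep: "\t", skip_rows_after_header: 1)
    # Only the projected column needs to be read when the plan is collected.
    df = lf.select(["status"]).collect
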
@@ -138,10 +295,36 @@
parse_dates: parse_dates,
eol_char: eol_char,
)
end
+ # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ #   Path to an IPC file.
+ # @param n_rows [Integer]
+ #   Stop reading from the IPC file after reading `n_rows`.
+ # @param cache [Boolean]
+ #   Cache the result after reading.
+ # @param rechunk [Boolean]
+ #   Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a particular storage connection.
+ # @param memory_map [Boolean]
+ #   Try to memory map the file. This can greatly improve performance on repeated
+ #   queries as the OS may cache pages.
+ #   Only uncompressed IPC files can be memory mapped.
+ #
+ # @return [LazyFrame]
def scan_ipc(
file,
n_rows: nil,
cache: true,
rechunk: true,
@@ -160,10 +343,38 @@
storage_options: storage_options,
memory_map: memory_map
)
end
+ # Lazily read from a Parquet file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ #   Path to a file.
+ # @param n_rows [Integer]
+ #   Stop reading from the Parquet file after reading `n_rows`.
+ # @param cache [Boolean]
+ #   Cache the result after reading.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ #   This determines the direction of parallelism. 'auto' will try to determine the
+ #   optimal direction.
+ # @param rechunk [Boolean]
+ #   In case of reading multiple files via a glob pattern, rechunk the final DataFrame
+ #   into contiguous memory chunks.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a particular storage connection.
+ # @param low_memory [Boolean]
+ #   Reduce memory pressure at the expense of performance.
+ #
+ # @return [LazyFrame]
def scan_parquet(
file,
n_rows: nil,
cache: true,
parallel: "auto",
@@ -188,10 +399,34 @@
storage_options: storage_options,
low_memory: low_memory
)
end
+ # Lazily read from a newline delimited JSON file.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ #   Path to a file.
+ # @param infer_schema_length [Integer]
+ #   Infer the schema length from the first `infer_schema_length` rows.
+ # @param batch_size [Integer]
+ #   Number of rows to read in each batch.
+ # @param n_rows [Integer]
+ #   Stop reading from the JSON file after reading `n_rows`.
+ # @param low_memory [Boolean]
+ #   Reduce memory pressure at the expense of performance.
+ # @param rechunk [Boolean]
+ #   Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ #
+ # @return [LazyFrame]
def scan_ndjson(
file,
infer_schema_length: 100,
batch_size: 1024,
n_rows: nil,
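
The three scan methods share the same lazy pattern; here is a brief sketch for `scan_parquet` with a glob. The path is a placeholder, and the `filter`/`col`/`collect` calls are assumptions about the surrounding Polars API, not part of this diff:

    lf = Polars.scan_parquet("data/*.parquet", parallel: "auto", row_count_name: "row_nr")
    # The predicate can be pushed down to the scan before anything is materialized.
    df = lf.filter(Polars.col("row_nr") < 100).collect
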
@@ -217,10 +452,34 @@
end
# def read_avro
# end
+ # Read into a DataFrame from an Arrow IPC (Feather v2) file.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ # @param columns [Object]
+ #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+ #   of column names.
+ # @param n_rows [Integer]
+ #   Stop reading from the IPC file after reading `n_rows`.
+ # @param memory_map [Boolean]
+ #   Try to memory map the file. This can greatly improve performance on repeated
+ #   queries as the OS may cache pages.
+ #   Only uncompressed IPC files can be memory mapped.
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a particular storage connection.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param rechunk [Boolean]
+ #   Make sure that all data is contiguous.
+ #
+ # @return [DataFrame]
def read_ipc(
file,
columns: nil,
n_rows: nil,
memory_map: true,
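
A short, hedged example of the eager IPC reader with column projection; the file name and column names are placeholders:

    df = Polars.read_ipc(
      "table.arrow",
      columns: ["id", "value"],
      memory_map: true  # only effective for uncompressed IPC files, per the note above
    )
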
@@ -241,30 +500,178 @@
memory_map: memory_map
)
end
end
- def read_parquet(file)
+ # Read into a DataFrame from a Parquet file.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ # @param columns [Object]
+ #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+ #   of column names.
+ # @param n_rows [Integer]
+ #   Stop reading from the Parquet file after reading `n_rows`.
+ # @param storage_options [Hash]
+ #   Extra options that make sense for a particular storage connection.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ #   This determines the direction of parallelism. 'auto' will try to determine the
+ #   optimal direction.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into the
+ #   DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param low_memory [Boolean]
+ #   Reduce memory pressure at the expense of performance.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ #   This operation defaults to a `rechunk` operation at the end, meaning that
+ #   all data will be stored contiguously in memory.
+ #   Set `rechunk: false` if you are benchmarking the Parquet reader. A `rechunk` is
+ #   an expensive operation.
+ def read_parquet(
+   file,
+   columns: nil,
+   n_rows: nil,
+   storage_options: nil,
+   parallel: "auto",
+   row_count_name: nil,
+   row_count_offset: 0,
+   low_memory: false
+ )
_prepare_file_arg(file) do |data|
- DataFrame._read_parquet(data)
+ DataFrame._read_parquet(
+   data,
+   columns: columns,
+   n_rows: n_rows,
+   parallel: parallel,
+   row_count_name: row_count_name,
+   row_count_offset: row_count_offset,
+   low_memory: low_memory
+ )
end
end
+ # Read into a DataFrame from a JSON file.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
def read_json(file)
DataFrame._read_json(file)
end
+ # Read into a DataFrame from a newline delimited JSON file.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
def read_ndjson(file)
DataFrame._read_ndjson(file)
end
# def read_sql
# end
# def read_excel
# end
+ # Read a CSV file in batches.
+ #
+ # Upon creation of the `BatchedCsvReader`,
+ # Polars will gather statistics and determine the
+ # file chunks. After that, work will only be done
+ # if `next_batches` is called.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ #   Indicate if the first row of the dataset is a header or not.
+ #   If set to false, column names will be autogenerated in the
+ #   following format: `column_x`, with `x` being an
+ #   enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ #   Columns to select. Accepts a list of column indices (starting
+ #   at zero) or a list of column names.
+ # @param new_columns [Object]
+ #   Rename columns right after parsing the CSV file. If the given
+ #   list is shorter than the width of the DataFrame, the remaining
+ #   columns will have their original name.
+ # @param sep [String]
+ #   Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ #   Single byte character that indicates the start of a comment line,
+ #   for instance `#`.
+ # @param quote_char [String]
+ #   Single byte character used for CSV quoting, default = `"`.
+ #   Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ #   Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ #   Overwrite dtypes during inference.
+ # @param null_values [Object]
+ #   Values to interpret as null values. You can provide a:
+ #
+ #   - `String`: All values equal to this string will be null.
+ #   - `Array`: All values equal to any string in this array will be null.
+ #   - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ #   Try to keep reading lines if some lines yield errors.
+ #   First try `infer_schema_length: 0` to read all columns as
+ #   `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ #   Try to automatically parse dates. If this does not succeed,
+ #   the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ #   Number of threads to use in CSV parsing.
+ #   Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ #   Maximum number of lines to read to infer schema.
+ #   If set to 0, all columns will be read as `:str`.
+ #   If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ #   Number of lines to read into the buffer at once.
+ #   Modify this to change performance.
+ # @param n_rows [Integer]
+ #   Stop reading from the CSV file after reading `n_rows`.
+ #   During multi-threaded parsing, an upper bound of `n_rows`
+ #   rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ #   Lossy means that invalid utf8 values are replaced with `�`
+ #   characters. When using other encodings than `utf8` or
+ #   `utf8-lossy`, the input is first decoded in memory with
+ #   Ruby. Defaults to `utf8`.
+ # @param low_memory [Boolean]
+ #   Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ #   Make sure that all columns are contiguous in memory by
+ #   aggregating the chunks into a single array.
+ # @param skip_rows_after_header [Integer]
+ #   Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ #   If not nil, this will insert a row count column with the given name into
+ #   the DataFrame.
+ # @param row_count_offset [Integer]
+ #   Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ #   Set the sample size. This is used to sample statistics to estimate the
+ #   allocation needed.
+ # @param eol_char [String]
+ #   Single byte end of line character.
+ #
+ # @return [BatchedCsvReader]
+ #
+ # @example
+ #   reader = Polars.read_csv_batched(
+ #     "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+ #   )
+ #   reader.next_batches(5)
def read_csv_batched(
file,
has_header: true,
columns: nil,
new_columns: nil,
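
Since `read_parquet` now accepts the same projection and row-count options as the scan variant, here is a hedged sketch of the new signature in use; the path and column names are invented for illustration:

    df = Polars.read_parquet(
      "sales.parquet",
      columns: ["region", "amount"],
      n_rows: 50_000,
      row_count_name: "idx"
    )
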
@@ -329,17 +736,29 @@
eol_char: eol_char,
new_columns: new_columns
)
end
+ # Get a schema of the IPC file without reading data.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ #
+ # @return [Hash]
def read_ipc_schema(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
_ipc_schema(file)
end
+ # Get a schema of the Parquet file without reading data.
+ #
+ # @param file [Object]
+ #   Path to a file or a file-like object.
+ #
+ # @return [Hash]
def read_parquet_schema(file)
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
file = Utils.format_path(file)
end
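
The schema helpers are handy for inspecting a file before deciding what to read; a small sketch, assuming a placeholder file name and an unwanted column:

    # Returns a hash mapping column names to dtypes without reading the data.
    schema = Polars.read_parquet_schema("sales.parquet")
    wanted = schema.keys.reject { |name| name == "internal_id" }
    df = Polars.read_parquet("sales.parquet", columns: wanted)
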