lib/polars/io.rb in polars-df-0.1.1 vs lib/polars/io.rb in polars-df-0.1.2

- old
+ new

@@ -1,13 +1,250 @@
 module Polars
   module IO
-    def read_csv(file, has_header: true)
+    def read_csv(
+      file,
+      has_header: true,
+      columns: nil,
+      new_columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 8192,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      storage_options: nil,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
+      _check_arg_is_1byte("sep", sep, false)
+      _check_arg_is_1byte("comment_char", comment_char, false)
+      _check_arg_is_1byte("quote_char", quote_char, true)
+      _check_arg_is_1byte("eol_char", eol_char, false)
+
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      storage_options ||= {}
+
+      if columns && !has_header
+        columns.each do |column|
+          if !column.start_with?("column_")
+            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+          end
+        end
+      end
+
+      if projection || new_columns
+        raise Todo
+      end
+
+      df = nil
       _prepare_file_arg(file) do |data|
-        DataFrame._read_csv(data, has_header: has_header)
+        df = DataFrame._read_csv(
+          data,
+          has_header: has_header,
+          columns: columns || projection,
+          sep: sep,
+          comment_char: comment_char,
+          quote_char: quote_char,
+          skip_rows: skip_rows,
+          dtypes: dtypes,
+          null_values: null_values,
+          ignore_errors: ignore_errors,
+          parse_dates: parse_dates,
+          n_threads: n_threads,
+          infer_schema_length: infer_schema_length,
+          batch_size: batch_size,
+          n_rows: n_rows,
+          encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+          low_memory: low_memory,
+          rechunk: rechunk,
+          skip_rows_after_header: skip_rows_after_header,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          sample_size: sample_size,
+          eol_char: eol_char
+        )
       end
+
+      if new_columns
+        Utils._update_columns(df, new_columns)
+      else
+        df
+      end
     end
 
+    def scan_csv(
+      file,
+      has_header: true,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      cache: true,
+      with_column_names: nil,
+      infer_schema_length: 100,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      parse_dates: false,
+      eol_char: "\n"
+    )
+      _check_arg_is_1byte("sep", sep, false)
+      _check_arg_is_1byte("comment_char", comment_char, false)
+      _check_arg_is_1byte("quote_char", quote_char, true)
+
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_csv(
+        file,
+        has_header: has_header,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        cache: cache,
+        with_column_names: with_column_names,
+        infer_schema_length: infer_schema_length,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        encoding: encoding,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        parse_dates: parse_dates,
+        eol_char: eol_char,
+      )
+    end
+
+    def scan_ipc(
+      file,
+      n_rows: nil,
+      cache: true,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      memory_map: true
+    )
+      LazyFrame._scan_ipc(
+        file,
+        n_rows: n_rows,
+        cache: cache,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        memory_map: memory_map
+      )
+    end
+
+    def scan_parquet(
+      file,
+      n_rows: nil,
+      cache: true,
+      parallel: "auto",
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      low_memory: false
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_parquet(
+        file,
+        n_rows: n_rows,
+        cache: cache,
+        parallel: parallel,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        low_memory: low_memory
+      )
+    end
+
+    def scan_ndjson(
+      file,
+      infer_schema_length: 100,
+      batch_size: 1024,
+      n_rows: nil,
+      low_memory: false,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_ndjson(
+        file,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+      )
+    end
+
+    # def read_avro
+    # end
+
+    def read_ipc(
+      file,
+      columns: nil,
+      n_rows: nil,
+      memory_map: true,
+      storage_options: nil,
+      row_count_name: nil,
+      row_count_offset: 0,
+      rechunk: true
+    )
+      storage_options ||= {}
+      _prepare_file_arg(file, **storage_options) do |data|
+        DataFrame._read_ipc(
+          data,
+          columns: columns,
+          n_rows: n_rows,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          rechunk: rechunk,
+          memory_map: memory_map
+        )
+      end
+    end
+
     def read_parquet(file)
       _prepare_file_arg(file) do |data|
         DataFrame._read_parquet(data)
       end
     end
@@ -18,10 +255,100 @@
     def read_ndjson(file)
       DataFrame._read_ndjson(file)
     end
 
+    # def read_sql
+    # end
+
+    # def read_excel
+    # end
+
+    def read_csv_batched(
+      file,
+      has_header: true,
+      columns: nil,
+      new_columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 50_000,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      if columns && !has_header
+        columns.each do |column|
+          if !column.start_with?("column_")
+            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+          end
+        end
+      end
+
+      if projection || new_columns
+        raise Todo
+      end
+
+      BatchedCsvReader.new(
+        file,
+        has_header: has_header,
+        columns: columns || projection,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        parse_dates: parse_dates,
+        n_threads: n_threads,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        sample_size: sample_size,
+        eol_char: eol_char,
+        new_columns: new_columns
+      )
+    end
+
+    def read_ipc_schema(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      _ipc_schema(file)
+    end
+
+    def read_parquet_schema(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      _parquet_schema(file)
+    end
+
     private
 
     def _prepare_file_arg(file)
       if file.is_a?(String) && file =~ /\Ahttps?:\/\//
         raise ArgumentError, "use URI(...) for remote files"
@@ -32,8 +359,21 @@
         file = URI.open(file)
       end
 
       yield file
+    end
+
+    def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
+      if arg.is_a?(String)
+        arg_byte_length = arg.bytesize
+        if can_be_empty
+          if arg_byte_length > 1
+            raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
+          end
+        elsif arg_byte_length != 1
+          raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
+        end
+      end
     end
   end
 end