lib/smarter_csv.rb in smarter_csv-1.7.4 vs lib/smarter_csv.rb in smarter_csv-1.8.0
- old
+ new
@@ -1,12 +1,12 @@
# frozen_string_literal: true
require_relative "extensions/hash"
require_relative "smarter_csv/version"
-require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
-# require 'smarter_csv.bundle' unless ENV['CI'] # does not compile/link in CI?
+# require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
+require 'smarter_csv.bundle' unless ENV['CI'] # does not compile/link in CI?
module SmarterCSV
# Root of the library's exception hierarchy; inherits from StandardError.
class SmarterCSVException < StandardError; end
# Subclass of SmarterCSVException (presumably raised when a row's column count
# disagrees with the header -- the raise site is not visible in this hunk).
class HeaderSizeMismatch < SmarterCSVException; end
# Subclass of SmarterCSVException (presumably raised for invalid option values
# -- the raise site is not visible in this hunk).
class IncorrectOption < SmarterCSVException; end
@@ -37,15 +37,11 @@
if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
end
- if options[:skip_lines].to_i > 0
- options[:skip_lines].to_i.times do
- readline_with_counts(fh, options)
- end
- end
+ skip_lines(fh, options)
headerA, header_size = process_headers(fh, options)
# in case we use chunking.. we'll need to set it up..
if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
@@ -205,11 +201,11 @@
def default_options
{
acceleration: true,
auto_row_sep_chars: 500,
chunk_size: nil,
- col_sep: ',',
+ col_sep: :auto, # was: ',',
comment_regexp: nil, # was: /\A#/,
convert_values_to_numeric: true,
downcase_header: true,
duplicate_header_suffix: nil,
file_encoding: 'utf-8',
@@ -224,11 +220,11 @@
remove_empty_values: true,
remove_unmapped_keys: false,
remove_values_matching: nil,
remove_zero_values: false,
required_headers: nil,
- row_sep: $/,
+ row_sep: :auto, # was: $/,
silence_missing_keys: false,
skip_lines: nil,
strings_as_keys: false,
strip_chars_from_headers: nil,
strip_whitespace: true,
@@ -241,13 +237,28 @@
def readline_with_counts(filehandle, options)
line = filehandle.readline(options[:row_sep])
@file_line_count += 1
@csv_line_count += 1
+ line = remove_bom(line) if @csv_line_count == 1
line
end
# Consumes options[:skip_lines] lines from +filehandle+ and discards them.
#
# Lines are read through readline_with_counts, so every skipped line still
# advances @file_line_count and @csv_line_count.
# Does nothing when options[:skip_lines].to_i is not positive (nil.to_i is 0).
+def skip_lines(filehandle, options)
+return unless options[:skip_lines].to_i > 0
+
+options[:skip_lines].to_i.times do
+readline_with_counts(filehandle, options)
+end
+end
+
# Rewinds +filehandle+ and resets @file_line_count and @csv_line_count to 0,
# so lines read before the rewind are not counted twice when they are read again.
#
# Returns the result of filehandle.rewind.
+def rewind(filehandle)
+@file_line_count = 0
+@csv_line_count = 0
+filehandle.rewind
+end
+
###
### Thin wrapper around C-extension
###
def parse(line, options, header_size = nil)
# puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
@@ -376,10 +387,12 @@
# If file has headers, then guesses column separator from headers.
# Otherwise guesses column separator from contents.
# Raises exception if none is found.
def guess_column_separator(filehandle, options)
+ skip_lines(filehandle, options)
+
possible_delimiters = [',', "\t", ';', ':', '|']
candidates = if options.fetch(:headers_in_file)
candidated_column_separators_from_headers(filehandle, options, possible_delimiters)
else
@@ -415,11 +428,11 @@
end
last_char = c
lines += 1
break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
end
- filehandle.rewind
+ rewind(filehandle)
counts["\r"] += 1 if last_char == "\r"
# find the most frequent key/value pair:
k, _ = counts.max_by{|_, v| v}
return k
@@ -471,17 +484,17 @@
headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
key_mappingH = options[:key_mapping]
-
# do some key mapping on the keys in the file header
# if you want to completely delete a key, then map it to nil or to ''
if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
unless options[:silence_missing_keys]
# if silence_missing_keys are not set, raise error if missing header
missing_keys = key_mappingH.keys - headerA
+
puts "WARNING: missing header(s): #{missing_keys.join(",")}" unless missing_keys.empty?
end
headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
end
@@ -523,35 +536,54 @@
result
end
private
+ UTF_32_BOM = %w[0 0 fe ff].freeze
+ UTF_32LE_BOM = %w[ff fe 0 0].freeze
+ UTF_8_BOM = %w[ef bb bf].freeze
+ UTF_16_BOM = %w[fe ff].freeze
+ UTF_16LE_BOM = %w[ff fe].freeze
+
# Strips a leading byte-order mark from +str+.
#
# Returns +str+ without its first 4, 3 or 2 bytes when it starts with a
# UTF-32, UTF-8 or UTF-16 BOM respectively; otherwise returns +str+ unchanged.
# When the first byte looks like the start of a BOM but no known BOM matches,
# a message is printed to stdout and +str+ is returned unchanged.
+def remove_bom(str)
# NOTE(review): every byte of the line is converted to hex here although only
# the first four entries are inspected below.
+str_as_hex = str.bytes.map{|x| x.to_s(16)}
+# if string does not start with one of the bytes above, there is no BOM
+return str unless %w[ef fe ff 0].include?(str_as_hex[0])
+
# The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks because
# the UTF-32LE BOM (ff fe 0 0) begins with the UTF-16LE BOM (ff fe).
+return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
+return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
+return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
+
# NOTE(review): this point is also reached for ordinary data whose first byte
# is 0xEF, 0xFE, 0xFF or 0x00 without forming a BOM -- e.g. UTF-8 text that
# starts with a character in U+F000..U+FFFF (lead byte 0xEF). Confirm that
# printing the warning for such input is intended.
+puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
+str
+end
+
def candidated_column_separators_from_headers(filehandle, options, delimiters)
candidates = Hash.new(0)
- line = filehandle.readline(options[:row_sep])
+ line = readline_with_counts(filehandle, options.slice(:row_sep))
delimiters.each do |d|
candidates[d] += line.scan(d).count
end
- filehandle.rewind
+ rewind(filehandle)
candidates
end
def candidated_column_separators_from_contents(filehandle, options, delimiters)
candidates = Hash.new(0)
5.times do
- line = filehandle.readline(options[:row_sep])
+ line = readline_with_counts(filehandle, options.slice(:row_sep))
delimiters.each do |d|
candidates[d] += line.scan(d).count
end
rescue EOFError # short files
break
end
- filehandle.rewind
+ rewind(filehandle)
candidates
end
end
end