lib/smarter_csv.rb in smarter_csv-1.7.4 vs lib/smarter_csv.rb in smarter_csv-1.8.0

- old
+ new

@@ -1,12 +1,12 @@ # frozen_string_literal: true require_relative "extensions/hash" require_relative "smarter_csv/version" -require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI? -# require 'smarter_csv.bundle' unless ENV['CI'] # does not compile/link in CI? +# require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI? +require 'smarter_csv.bundle' unless ENV['CI'] # does not compile/link in CI? module SmarterCSV class SmarterCSVException < StandardError; end class HeaderSizeMismatch < SmarterCSVException; end class IncorrectOption < SmarterCSVException; end @@ -37,15 +37,11 @@ if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8')) puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".' end - if options[:skip_lines].to_i > 0 - options[:skip_lines].to_i.times do - readline_with_counts(fh, options) - end - end + skip_lines(fh, options) headerA, header_size = process_headers(fh, options) # in case we use chunking.. we'll need to set it up.. if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0 @@ -205,11 +201,11 @@ def default_options { acceleration: true, auto_row_sep_chars: 500, chunk_size: nil, - col_sep: ',', + col_sep: :auto, # was: ',', comment_regexp: nil, # was: /\A#/, convert_values_to_numeric: true, downcase_header: true, duplicate_header_suffix: nil, file_encoding: 'utf-8', @@ -224,11 +220,11 @@ remove_empty_values: true, remove_unmapped_keys: false, remove_values_matching: nil, remove_zero_values: false, required_headers: nil, - row_sep: $/, + row_sep: :auto, # was: $/, silence_missing_keys: false, skip_lines: nil, strings_as_keys: false, strip_chars_from_headers: nil, strip_whitespace: true, @@ -241,13 +237,28 @@ def readline_with_counts(filehandle, options) line = filehandle.readline(options[:row_sep]) @file_line_count += 1 @csv_line_count += 1 + line = remove_bom(line) if @csv_line_count == 1 line end + def skip_lines(filehandle, options) + return unless options[:skip_lines].to_i > 0 + + options[:skip_lines].to_i.times do + readline_with_counts(filehandle, options) + end + end + + def rewind(filehandle) + @file_line_count = 0 + @csv_line_count = 0 + filehandle.rewind + end + ### ### Thin wrapper around C-extension ### def parse(line, options, header_size = nil) # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose] @@ -376,10 +387,12 @@ # If file has headers, then guesses column separator from headers. # Otherwise guesses column separator from contents. # Raises exception if none is found. def guess_column_separator(filehandle, options) + skip_lines(filehandle, options) + possible_delimiters = [',', "\t", ';', ':', '|'] candidates = if options.fetch(:headers_in_file) candidated_column_separators_from_headers(filehandle, options, possible_delimiters) else @@ -415,11 +428,11 @@ end last_char = c lines += 1 break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars] end - filehandle.rewind + rewind(filehandle) counts["\r"] += 1 if last_char == "\r" # find the most frequent key/value pair: k, _ = counts.max_by{|_, v| v} return k @@ -471,17 +484,17 @@ headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers] unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers key_mappingH = options[:key_mapping] - # do some key mapping on the keys in the file header # if you want to completely delete a key, then map it to nil or to '' if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0 unless options[:silence_missing_keys] # if silence_missing_keys are not set, raise error if missing header missing_keys = key_mappingH.keys - headerA + puts "WARNING: missing header(s): #{missing_keys.join(",")}" unless missing_keys.empty? end headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)} end @@ -523,35 +536,54 @@ result end private + UTF_32_BOM = %w[0 0 fe ff].freeze + UTF_32LE_BOM = %w[ff fe 0 0].freeze + UTF_8_BOM = %w[ef bb bf].freeze + UTF_16_BOM = %w[fe ff].freeze + UTF_16LE_BOM = %w[ff fe].freeze + + def remove_bom(str) + str_as_hex = str.bytes.map{|x| x.to_s(16)} + # if string does not start with one of the bytes above, there is no BOM + return str unless %w[ef fe ff 0].include?(str_as_hex[0]) + + return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3]) + return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM + return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1]) + + puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}" + str + end + def candidated_column_separators_from_headers(filehandle, options, delimiters) candidates = Hash.new(0) - line = filehandle.readline(options[:row_sep]) + line = readline_with_counts(filehandle, options.slice(:row_sep)) delimiters.each do |d| candidates[d] += line.scan(d).count end - filehandle.rewind + rewind(filehandle) candidates end def candidated_column_separators_from_contents(filehandle, options, delimiters) candidates = Hash.new(0) 5.times do - line = filehandle.readline(options[:row_sep]) + line = readline_with_counts(filehandle, options.slice(:row_sep)) delimiters.each do |d| candidates[d] += line.scan(d).count end rescue EOFError # short files break end - filehandle.rewind + rewind(filehandle) candidates end end end