smarter_csv.rb in smarter_csv-1.8.0

- old
+ new

@@ -1,12 +1,12 @@
 # frozen_string_literal: true
 
 require_relative "extensions/hash"
 require_relative "smarter_csv/version"
 
-require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
-# require 'smarter_csv.bundle' unless ENV['CI'] # does not compile/link in CI?
+# require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
+require 'smarter_csv.bundle' unless ENV['CI'] # does not compile/link in CI?
 
 module SmarterCSV
   class SmarterCSVException < StandardError; end
   class HeaderSizeMismatch < SmarterCSVException; end
   class IncorrectOption < SmarterCSVException; end
@@ -37,15 +37,11 @@
 
       if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
         puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
       end
 
-      if options[:skip_lines].to_i > 0
-        options[:skip_lines].to_i.times do
-          readline_with_counts(fh, options)
-        end
-      end
+      skip_lines(fh, options)
 
       headerA, header_size = process_headers(fh, options)
 
       # in case we use chunking.. we'll need to set it up..
       if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
@@ -205,11 +201,11 @@
     def default_options
       {
         acceleration: true,
         auto_row_sep_chars: 500,
         chunk_size: nil,
-        col_sep: ',',
+        col_sep: :auto, # was: ',',
         comment_regexp: nil, # was: /\A#/,
         convert_values_to_numeric: true,
         downcase_header: true,
         duplicate_header_suffix: nil,
         file_encoding: 'utf-8',
@@ -224,11 +220,11 @@
         remove_empty_values: true,
         remove_unmapped_keys: false,
         remove_values_matching: nil,
         remove_zero_values: false,
         required_headers: nil,
-        row_sep: $/,
+        row_sep: :auto, # was: $/,
         silence_missing_keys: false,
         skip_lines: nil,
         strings_as_keys: false,
         strip_chars_from_headers: nil,
         strip_whitespace: true,
@@ -241,13 +237,28 @@
 
     # Reads the next line from filehandle, using options[:row_sep] as the line
     # separator, and bumps both @file_line_count and @csv_line_count by one.
     # The first counted line (@csv_line_count == 1) is passed through remove_bom
     # so that a leading byte-order mark does not end up in the data.
     # Returns the (possibly BOM-stripped) line.
     # NOTE(review): readline presumably raises EOFError at end of input, as the
     # `rescue EOFError` in candidated_column_separators_from_contents suggests.
     def readline_with_counts(filehandle, options)
       line = filehandle.readline(options[:row_sep])
       @file_line_count += 1
       @csv_line_count += 1
+      line = remove_bom(line) if @csv_line_count == 1
       line
     end
 
+    # Consumes the first options[:skip_lines] lines of filehandle through
+    # readline_with_counts, so the line counters stay in step with the input.
+    # Does nothing (and returns nil) when the option is nil, zero or negative.
+    def skip_lines(filehandle, options)
+      lines_to_skip = options[:skip_lines].to_i
+      return unless lines_to_skip > 0
+
+      lines_to_skip.times { readline_with_counts(filehandle, options) }
+    end
+
+    # Moves filehandle back to its start and resets both line counters to zero,
+    # so a subsequent readline_with_counts treats the next line as line 1 again.
+    def rewind(filehandle)
+      @file_line_count = @csv_line_count = 0
+      filehandle.rewind
+    end
+
     ###
     ### Thin wrapper around C-extension
     ###
     def parse(line, options, header_size = nil)
       # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
@@ -376,10 +387,12 @@
 
     # If file has headers, then guesses column separator from headers.
     # Otherwise guesses column separator from contents.
     # Raises exception if none is found.
     def guess_column_separator(filehandle, options)
+      skip_lines(filehandle, options)
+
       possible_delimiters = [',', "\t", ';', ':', '|']
 
       candidates = if options.fetch(:headers_in_file)
                      candidated_column_separators_from_headers(filehandle, options, possible_delimiters)
                    else
@@ -415,11 +428,11 @@
         end
         last_char = c
         lines += 1
         break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
       end
-      filehandle.rewind
+      rewind(filehandle)
 
       counts["\r"] += 1 if last_char == "\r"
       # find the most frequent key/value pair:
       k, _ = counts.max_by{|_, v| v}
       return k
@@ -471,17 +484,17 @@
 
       headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
 
       unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
         key_mappingH = options[:key_mapping]
-
         # do some key mapping on the keys in the file header
         #   if you want to completely delete a key, then map it to nil or to ''
         if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
           unless options[:silence_missing_keys]
             # if silence_missing_keys is not set, print a warning for key_mapping keys missing from the header
             missing_keys = key_mappingH.keys - headerA
+
             puts "WARNING: missing header(s): #{missing_keys.join(",")}" unless missing_keys.empty?
           end
 
           headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
         end
@@ -523,35 +536,54 @@
       result
     end
 
     private
 
+    # Byte-order marks, spelled the way Integer#to_s(16) renders each byte
+    # (so a zero byte is "0", not "00").
+    UTF_32_BOM = %w[0 0 fe ff].freeze
+    UTF_32LE_BOM = %w[ff fe 0 0].freeze
+    UTF_8_BOM = %w[ef bb bf].freeze
+    UTF_16_BOM = %w[fe ff].freeze
+    UTF_16LE_BOM = %w[ff fe].freeze
+
+    # Strips a leading UTF-8, UTF-16 (BE/LE) or UTF-32 (BE/LE) byte-order mark from str.
+    #
+    # @param str [String] a line of input
+    # @return [String] str without its BOM bytes, or str itself when no BOM is recognised
+    def remove_bom(str)
+      # A BOM is at most four bytes long, so only the head of the line is hex-encoded
+      # instead of every byte of a potentially very long line.
+      str_as_hex = str.byteslice(0, 4).bytes.map{|x| x.to_s(16)}
+      # if string does not start with one of the BOM lead bytes, there is no BOM
+      return str unless %w[ef fe ff 0].include?(str_as_hex[0])
+
+      # UTF-32 is tested before UTF-16 because the UTF-32LE BOM begins with the UTF-16LE BOM
+      return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
+      return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
+      return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
+
+      # the lead byte looked like a BOM but no known BOM matched: report it, leave str untouched
+      puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
+      str
+    end
+
     # Counts how often each candidate delimiter occurs in the next line read
     # from filehandle (the header line, per the caller's headers_in_file branch),
     # then rewinds the handle so later reads start from the beginning again.
     # In the new version the line is read via readline_with_counts (given only
     # the :row_sep option) and the rewind helper also resets the line counters.
     # Returns a Hash with default 0 mapping delimiter => number of occurrences.
     def candidated_column_separators_from_headers(filehandle, options, delimiters)
       candidates = Hash.new(0)
-      line = filehandle.readline(options[:row_sep])
+      line = readline_with_counts(filehandle, options.slice(:row_sep))
 
       delimiters.each do |d|
         candidates[d] += line.scan(d).count
       end
 
-      filehandle.rewind
+      rewind(filehandle)
 
       candidates
     end
 
     # Counts how often each candidate delimiter occurs across up to five lines
     # read from filehandle, then rewinds the handle.
     # In the new version lines are read via readline_with_counts (given only
     # the :row_sep option) and the rewind helper also resets the line counters.
     # Returns a Hash with default 0 mapping delimiter => total occurrences.
     def candidated_column_separators_from_contents(filehandle, options, delimiters)
       candidates = Hash.new(0)
 
       5.times do
-        line = filehandle.readline(options[:row_sep])
+        line = readline_with_counts(filehandle, options.slice(:row_sep))
         delimiters.each do |d|
           candidates[d] += line.scan(d).count
         end
       # fewer than five lines available: stop sampling and use what was counted
       rescue EOFError # short files
         break
       end
 
-      filehandle.rewind
+      rewind(filehandle)
 
       candidates
     end
   end
 end