# frozen_string_literal: true

require_relative "extensions/hash"
require_relative "smarter_csv/version"

if `uname -s`.chomp == 'Darwin'
  require 'smarter_csv.bundle' unless ENV['CI'] # local testing
else
  require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
end

module SmarterCSV
  class SmarterCSVException < StandardError; end
  class HeaderSizeMismatch < SmarterCSVException; end
  class IncorrectOption < SmarterCSVException; end
  class ValidationError < SmarterCSVException; end
  class DuplicateHeaders < SmarterCSVException; end
  class MissingHeaders < SmarterCSVException; end
  class NoColSepDetected < SmarterCSVException; end
  class KeyMappingError < SmarterCSVException; end # CURRENTLY UNUSED -> version 1.9.0
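
  # Illustrative example of rescuing these errors (a sketch; the file name and
  # the :required_keys value are assumed):
  #
  #   begin
  #     SmarterCSV.process('/tmp/users.csv', required_keys: [:email])
  #   rescue SmarterCSV::MissingHeaders => e
  #     warn e.message # e.g. "ERROR: missing attributes: email"
  #   rescue SmarterCSV::SmarterCSVException => e
  #     warn "CSV could not be processed: #{e.message}"
  #   end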

  # first parameter: filename or input object which responds to readline method
  def SmarterCSV.process(input, options = {}, &block)
    options = default_options.merge(options)
    options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
    puts "SmarterCSV OPTIONS: #{options.inspect}" if options[:verbose]

    validate_options!(options)

    headerA = []
    result = []
    @file_line_count = 0
    @csv_line_count = 0
    has_rails = !!defined?(Rails)
    begin
      fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

      # auto-detect the row separator
      options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
      # attempt to auto-detect column separator
      options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

      if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) &&
         (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') ||
          fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
        puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
      end

      skip_lines(fh, options)

      headerA, header_size = process_headers(fh, options)

      # in case we use chunking, we'll need to set it up:
      if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
        use_chunks = true
        chunk_size = options[:chunk_size].to_i
        chunk_count = 0
        chunk = []
      else
        use_chunks = false
      end

      # now on to processing all the rest of the lines in the CSV file:
      until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
        line = readline_with_counts(fh, options)

        # replace invalid byte sequence in UTF-8 with question mark to avoid errors
        line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

        print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

        next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

        # cater for quoted CSV data containing the row separator / carriage return characters,
        # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv).
        # We detect this by the line containing an uneven number of quote characters.
        multiline = line.count(options[:quote_char]).odd? # should handle quote_char nil
        while line.count(options[:quote_char]).odd? # should handle quote_char nil
          next_line = fh.readline(options[:row_sep])
          next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
          line += next_line
          @file_line_count += 1
        end
        print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

        line.chomp!(options[:row_sep])

        dataA, _data_size = parse(line, options, header_size)

        dataA.map!{|x| x.strip} if options[:strip_whitespace]

        # if all values are blank, then ignore this line
        next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

        hash = Hash.zip(headerA, dataA) # from Facets of Ruby library

        # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
        # Note: Ruby < 1.9 doesn't allow empty symbol literals!
        hash.delete(nil)
        hash.delete('')
        eval('hash.delete(:"")') if RUBY_VERSION.to_f > 1.8

        if options[:remove_empty_values] == true
          hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
        end

        hash.delete_if{|_k, v| !v.nil? && v =~ /^(\d+|\d+\.\d+)$/ && v.to_f == 0} if options[:remove_zero_values] # values are typically Strings!
        hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]

        if options[:convert_values_to_numeric]
          hash.each do |k, v|
            # deal with the :only / :except options to :convert_values_to_numeric
            next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)

            # convert if it's a numeric value:
            case v
            when /^[+-]?\d+\.\d+$/
              hash[k] = v.to_f
            when /^[+-]?\d+$/
              hash[k] = v.to_i
            end
          end
        end

        if options[:value_converters]
          hash.each do |k, v|
            converter = options[:value_converters][k]
            next unless converter

            hash[k] = converter.convert(v)
          end
        end

        next if options[:remove_empty_hashes] && hash.empty?

        hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

        if use_chunks
          chunk << hash # append temp result to chunk

          if chunk.size >= chunk_size || fh.eof? # if the chunk is full, or EOF is reached
            # do something with the chunk
            if block_given?
              yield chunk # do something with the hashes in the chunk in the block
            else
              result << chunk # not sure yet, why anybody would want to do this without a block
            end
            chunk_count += 1
            chunk = [] # initialize for next chunk of data
          else
            # the last chunk may contain partial data, which also needs to be returned (BUG / ISSUE-18)
          end

          # while a chunk is being filled up we don't need to do anything else here
        else # no chunk handling
          if block_given?
            yield [hash] # do something with the hash in the block (better to use chunking here)
          else
            result << hash
          end
        end
      end

      # print new line to retain last processing line message
      print "\n" if options[:verbose]

      # last chunk:
      if !chunk.nil? && chunk.size > 0
        # do something with the chunk
        if block_given?
          yield chunk # do something with the hashes in the chunk in the block
        else
          result << chunk # not sure yet, why anybody would want to do this without a block
        end
        chunk_count += 1
        chunk = [] # initialize for next chunk of data
      end
    ensure
      fh.close if fh.respond_to?(:close)
    end

    if block_given?
      return chunk_count # when we do processing through a block we only care how many chunks we processed
    else
      return result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
    end
  end
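
  # Illustrative usage of SmarterCSV.process (a sketch; file names, headers and
  # the Rails model are assumed). Without a block it returns an Array of Hashes;
  # with :chunk_size and a block it yields Arrays of Hashes and returns the
  # number of chunks processed:
  #
  #   rows = SmarterCSV.process('/tmp/users.csv')
  #   # => [{first_name: "Jane", email: "jane@example.com"}, ...]
  #
  #   n_chunks = SmarterCSV.process('/tmp/users.csv', chunk_size: 100) do |chunk|
  #     chunk.each { |row| User.create!(row) }
  #   end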

  class << self
    def has_acceleration?
      @has_acceleration ||= !!defined?(parse_csv_line_c)
    end

    def raw_header
      @raw_header
    end

    def headers
      @headers
    end

    protected

    # NOTE: this is not called when "parse" methods are tested by themselves
    def default_options
      {
        acceleration: true,
        auto_row_sep_chars: 500,
        chunk_size: nil,
        col_sep: :auto, # was: ',',
        comment_regexp: nil, # was: /\A#/,
        convert_values_to_numeric: true,
        downcase_header: true,
        duplicate_header_suffix: nil,
        file_encoding: 'utf-8',
        force_simple_split: false,
        force_utf8: false,
        headers_in_file: true,
        invalid_byte_sequence: '',
        keep_original_headers: false,
        key_mapping: nil,
        quote_char: '"',
        remove_empty_hashes: true,
        remove_empty_values: true,
        remove_unmapped_keys: false,
        remove_values_matching: nil,
        remove_zero_values: false,
        required_headers: nil,
        required_keys: nil,
        row_sep: :auto, # was: $/,
        silence_missing_keys: false,
        skip_lines: nil,
        strings_as_keys: false,
        strip_chars_from_headers: nil,
        strip_whitespace: true,
        user_provided_headers: nil,
        value_converters: nil,
        verbose: false,
        with_line_numbers: false,
      }
    end

    def readline_with_counts(filehandle, options)
      line = filehandle.readline(options[:row_sep])
      @file_line_count += 1
      @csv_line_count += 1
      line = remove_bom(line) if @csv_line_count == 1
      line
    end

    def skip_lines(filehandle, options)
      return unless options[:skip_lines].to_i > 0

      options[:skip_lines].to_i.times do
        readline_with_counts(filehandle, options)
      end
    end

    def rewind(filehandle)
      @file_line_count = 0
      @csv_line_count = 0
      filehandle.rewind
    end

    ###
    ### Thin wrapper around C-extension
    ###
    def parse(line, options, header_size = nil)
      # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]

      if options[:acceleration] && has_acceleration?
        # :nocov:
        has_quotes = line =~ /#{options[:quote_char]}/
        elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
        elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
        return [elements, elements.size]
        # :nocov:
      else
        # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
        return parse_csv_line_ruby(line, options, header_size)
      end
    end
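
    # Illustrative example of what parse returns (a sketch, assuming explicit
    # :col_sep / :quote_char values and that the accelerated and pure-Ruby paths
    # agree for this input):
    #
    #   parse('a,b,"c,d"', default_options.merge(col_sep: ',', quote_char: '"'))
    #   # => [["a", "b", "c,d"], 3]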

    # ------------------------------------------------------------------
    # Ruby equivalent of the C-extension for parse_line
    #
    # parses a single line: either a CSV header or a body line
    # - quoting rules compared to RFC-4180 are somewhat relaxed
    # - we are not assuming that quotes inside a field need to be doubled
    # - we are not assuming that all fields need to be quoted (0 is even)
    # - works with multi-char col_sep
    # - if header_size is given, only up to header_size fields are parsed
    #
    # We use header_size when parsing the body lines to make sure we always match the number of headers
    # in case there are trailing col_sep characters in the line.
    #
    # Our convention is that empty fields are returned as empty strings, not as nil.
    #
    # The purpose of the header_size parameter is to handle a corner case where
    # CSV lines contain more fields than the header,
    # in which case the remaining fields in the line are ignored.
    #
    def parse_csv_line_ruby(line, options, header_size = nil)
      return [] if line.nil?

      line_size = line.size
      col_sep = options[:col_sep]
      col_sep_size = col_sep.size
      quote = options[:quote_char]
      quote_count = 0
      elements = []
      start = 0
      i = 0
      while i < line_size
        if line[i...i+col_sep_size] == col_sep && quote_count.even?
          break if !header_size.nil? && elements.size >= header_size

          elements << cleanup_quotes(line[start...i], quote)
          i += col_sep.size
          start = i
        else
          quote_count += 1 if line[i] == quote
          i += 1
        end
      end
      elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
      [elements, elements.size]
    end

    def cleanup_quotes(field, quote)
      return field if field.nil?

      # return if field !~ /#{quote}/ # this check can probably be eliminated

      if field.start_with?(quote) && field.end_with?(quote)
        field.delete_prefix!(quote)
        field.delete_suffix!(quote)
      end

      field.gsub!("#{quote}#{quote}", quote)
      field
    end

    # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
    # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
    BLANK_RE = /\A\s*\z/.freeze

    def blank?(value)
      case value
      when String
        value.empty? || BLANK_RE.match?(value)
      when NilClass
        true
      when Array
        value.empty? || value.inject(true){|result, x| result &&= elem_blank?(x)}
      when Hash
        value.empty? || value.values.inject(true){|result, x| result &&= elem_blank?(x)}
      else
        false
      end
    end

    def elem_blank?(value)
      case value
      when String
        value.empty? || BLANK_RE.match?(value)
      when NilClass
        true
      else
        false
      end
    end

    # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
    def only_or_except_limit_execution(options, option_name, key)
      if options[option_name].is_a?(Hash)
        if options[option_name].has_key?(:except)
          return true if Array(options[option_name][:except]).include?(key)
        elsif options[option_name].has_key?(:only)
          return true unless Array(options[option_name][:only]).include?(key)
        end
      end
      return false
    end

    # If the file has headers, then guesses the column separator from the header line;
    # otherwise guesses the column separator from the contents.
    # Raises an exception if none is found.
    def guess_column_separator(filehandle, options)
      skip_lines(filehandle, options)

      delimiters = [',', "\t", ';', ':', '|']

      line = nil
      has_header = options[:headers_in_file]
      candidates = Hash.new(0)
      count = has_header ? 1 : 5
      count.times do
        line = readline_with_counts(filehandle, options)
        delimiters.each do |d|
          candidates[d] += line.scan(d).count
        end
      rescue EOFError # short files
        break
      end
      rewind(filehandle)

      if candidates.values.max == 0
        # if the line contains only a single word (no separator characters at all), default to a comma
        return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/

        raise SmarterCSV::NoColSepDetected
      end

      candidates.key(candidates.values.max)
    end

    # limitation: this currently reads the whole file in before making a decision
    def guess_line_ending(filehandle, options)
      counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
      quoted_char = false

      # count how many of the pre-defined line-endings we find,
      # ignoring those contained within quote characters
      last_char = nil
      lines = 0
      filehandle.each_char do |c|
        quoted_char = !quoted_char if c == options[:quote_char]
        next if quoted_char

        if last_char == "\r"
          if c == "\n"
            counts["\r\n"] += 1
          else
            counts["\r"] += 1 # \r are counted after they appeared
          end
        elsif c == "\n"
          counts["\n"] += 1
        end
        last_char = c
        lines += 1
        break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
      end
      rewind(filehandle)

      counts["\r"] += 1 if last_char == "\r"
      # find the most frequent key/value pair:
      k, _ = counts.max_by{|_, v| v}
      return k
    end
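
    # Illustrative example of the header transformation performed below (a sketch;
    # the raw header line is assumed). With default options, whitespace and dashes
    # become underscores, and headers are downcased and converted to symbols:
    #
    #   "First Name,Last-Name,Email\n"  =>  [:first_name, :last_name, :email]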

    def process_headers(filehandle, options)
      @raw_header = nil
      @headers = nil
      if options[:headers_in_file] # extract the header line
        # process the header line in the CSV file..
        # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
        header = readline_with_counts(filehandle, options)
        @raw_header = header

        header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
        header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
        header = header.chomp(options[:row_sep])

        header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]

        file_headerA, file_header_size = parse(header, options)

        file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
        file_headerA.map!{|x| x.strip} if options[:strip_whitespace]

        unless options[:keep_original_headers]
          file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
          file_headerA.map!{|x| x.downcase} if options[:downcase_header]
        end
      else
        raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
      end

      if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
        # use user-provided headers
        headerA = options[:user_provided_headers]
        if defined?(file_header_size) && !file_header_size.nil?
          if headerA.size != file_header_size
            raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
          else
            # we could print out the mapping of file_headerA to headerA here
          end
        end
      else
        headerA = file_headerA
      end

      # detect duplicate headers and disambiguate
      headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]

      header_size = headerA.size # used for splitting lines

      headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]

      unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
        key_mappingH = options[:key_mapping]

        # do some key mapping on the keys in the file header
        # if you want to completely delete a key, then map it to nil or to ''
        if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
          unless options[:silence_missing_keys]
            # if :silence_missing_keys is not set, print a warning for any mapped header that is missing
            missing_keys = key_mappingH.keys - headerA
            puts "WARNING: missing header(s): #{missing_keys.join(",")}" unless missing_keys.empty?
          end

          headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
        end
      end

      # header_validations
      duplicate_headers = []
      headerA.compact.each do |k|
        duplicate_headers << k if headerA.select{|x| x == k}.size > 1
      end

      unless options[:user_provided_headers] || duplicate_headers.empty?
        raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
      end

      # deprecate required_headers
      if !options[:required_headers].nil?
        puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
        if options[:required_keys].nil?
          options[:required_keys] = options[:required_headers]
          options[:required_headers] = nil
        end
      end

      if options[:required_keys] && options[:required_keys].is_a?(Array)
        missing_keys = []
        options[:required_keys].each do |k|
          missing_keys << k unless headerA.include?(k)
        end
        raise SmarterCSV::MissingHeaders, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
      end

      @headers = headerA
      [headerA, header_size]
    end
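
    # Illustrative example of duplicate header disambiguation (a sketch): with
    # duplicate_header_suffix: '_', repeated headers get a numeric suffix starting
    # at the second occurrence:
    #
    #   process_duplicate_headers(%w[name name name], duplicate_header_suffix: '_')
    #   # => ["name", "name_2", "name_3"]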

    def process_duplicate_headers(headers, options)
      counts = Hash.new(0)
      result = []
      headers.each do |key|
        counts[key] += 1
        if counts[key] == 1
          result << key
        else
          result << [key, options[:duplicate_header_suffix], counts[key]].join
        end
      end
      result
    end

    private

    UTF_32_BOM = %w[0 0 fe ff].freeze
    UTF_32LE_BOM = %w[ff fe 0 0].freeze
    UTF_8_BOM = %w[ef bb bf].freeze
    UTF_16_BOM = %w[fe ff].freeze
    UTF_16LE_BOM = %w[ff fe].freeze

    def remove_bom(str)
      str_as_hex = str.bytes.map{|x| x.to_s(16)}

      # if string does not start with one of the bytes, there is no BOM
      return str unless %w[ef fe ff 0].include?(str_as_hex[0])

      return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
      return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
      return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])

      puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
      str
    end

    def validate_options!(options)
      keys = options.keys
      errors = []
      errors << "invalid row_sep" if keys.include?(:row_sep) && !option_valid?(options[:row_sep])
      errors << "invalid col_sep" if keys.include?(:col_sep) && !option_valid?(options[:col_sep])
      errors << "invalid quote_char" if keys.include?(:quote_char) && !option_valid?(options[:quote_char])
      raise SmarterCSV::ValidationError, errors.inspect if errors.any?
    end

    def option_valid?(str)
      return true if str.is_a?(Symbol) && str == :auto
      return true if str.is_a?(String) && !str.empty?

      false
    end
  end
end
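
# Illustrative example of a :value_converters entry (a sketch; the converter class,
# CSV file and :date header are assumed). A converter is any object that responds
# to +convert+ and is looked up by the already-transformed header key:
#
#   require 'date'
#
#   class DateConverter
#     def self.convert(value)
#       Date.strptime(value, '%Y-%m-%d')
#     end
#   end
#
#   SmarterCSV.process('/tmp/events.csv', value_converters: { date: DateConverter })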