module SmarterCSV
  # Common ancestor of every SmarterCSV error class declared below.
  class SmarterCSVException < StandardError
  end

  # Raised when :user_provided_headers and the header line in the file differ in size.
  class HeaderSizeMismatch < SmarterCSVException
  end

  # Raised when :headers_in_file is false and no :user_provided_headers are given.
  class IncorrectOption < SmarterCSVException
  end

  # Raised when the final list of headers contains the same key more than once.
  class DuplicateHeaders < SmarterCSVException
  end

  # Raised when one of the :required_headers is not among the processed headers.
  class MissingHeaders < SmarterCSVException
  end

  # Raised when column-separator auto-detection finds none of its candidates.
  class NoColSepDetected < SmarterCSVException
  end

  # Raised when the key mapping names a header that does not exist.
  class KeyMappingError < SmarterCSVException
  end

  # Reads CSV data from +input+ and turns every data row into a Hash keyed by
  # the processed headers.
  #
  # input   - a filename (opened with "r:<options[:file_encoding]>"), or any object
  #           that responds to +readline+; it is also asked for +eof?+ while reading.
  # options - Hash merged over default_options.
  # block   - optional; receives an Array of row Hashes: one chunk at a time when
  #           :chunk_size is set, otherwise a one-element Array per row.
  #
  # Returns, without a block, an Array of Hashes (an Array of Arrays of Hashes
  # when :chunk_size is set). With a block it returns the chunk counter.
  # NOTE(review): that counter is only initialised in chunk mode, so a block
  # without :chunk_size gets nil back.
  #
  # Raises CSV::MalformedCSVError (message extended with the csv line number) when
  # a quoted line cannot be parsed; errors raised by the header processing and by
  # +readline+ propagate unchanged.
  #
  # The file handle is closed on exit, including one passed in by the caller.
  def SmarterCSV.process(input, options={}, &block)
    options = default_options.merge(options)
    options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?

    result = []
    @file_line_count = 0
    @csv_line_count = 0
    has_rails = !! defined?(Rails)
    begin
      fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

      # auto-detect the row separator
      options[:row_sep] = SmarterCSV.guess_line_ending(fh, options) if options[:row_sep].to_sym == :auto
      # attempt to auto-detect column separator
      options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep].to_sym == :auto
      # preserve options, in case we need to call the CSV class
      csv_options = options.select{|k,v| [:col_sep, :row_sep, :quote_char].include?(k)} # options.slice(:col_sep, :row_sep, :quote_char)
      csv_options.delete(:row_sep) if [nil, :auto].include?( options[:row_sep].to_sym )
      csv_options.delete(:col_sep) if [nil, :auto].include?( options[:col_sep].to_sym )

      # Loop invariants, computed once instead of once per line / per field.
      # Both separators are escaped so that regex meta characters such as '|' or '.'
      # are matched literally.
      reencode = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
      quote_regexp = /#{Regexp.escape(options[:quote_char].to_s)}/
      trailing_col_sep = /(#{Regexp.escape(options[:col_sep])})+\z/

      if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && ( fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8') )
        puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
      end

      if options[:skip_lines].to_i > 0
        options[:skip_lines].to_i.times do
          readline_with_counts(fh, options)
        end
      end

      headerA, header_size = process_headers(fh, options, csv_options)

      # in case we use chunking.. we'll need to set it up..
      if ! options[:chunk_size].nil? && options[:chunk_size].to_i > 0
        use_chunks = true
        chunk_size = options[:chunk_size].to_i
        chunk_count = 0
        chunk = []
      else
        use_chunks = false
      end

      # now on to processing all the rest of the lines in the CSV file:
      while ! fh.eof?    # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
        line = readline_with_counts(fh, options)

        # replace invalid byte sequences with options[:invalid_byte_sequence] to avoid errors
        line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if reencode

        print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

        next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

        # Quoted data may contain the row separator, in which case one CSV row is
        # split across several file lines. An odd number of quote characters means
        # the row is not complete yet, so keep appending file lines until it is.
        multiline = line.count(options[:quote_char])%2 == 1 # should handle quote_char nil
        while line.count(options[:quote_char])%2 == 1 # should handle quote_char nil
          next_line = fh.readline(options[:row_sep])
          next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if reencode
          line += next_line
          @file_line_count += 1
        end
        print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

        line.chomp!(options[:row_sep])

        if (line =~ quote_regexp) && ! options[:force_simple_split]
          dataA = begin
            CSV.parse( line, **csv_options ).flatten.collect!{|x| x.nil? ? '' : x} # to deal with nil values from CSV.parse
          rescue CSV::MalformedCSVError => e
            raise e, "#{e} [SmarterCSV: csv line #{@csv_line_count}]", e.backtrace
          end
        else
          # the limit keeps surplus columns inside the last field
          dataA = line.split(options[:col_sep], header_size)
        end
        dataA.map!{|x| x.sub(trailing_col_sep, '')} # remove any unwanted trailing col_sep characters at the end
        dataA.map!{|x| x.strip} if options[:strip_whitespace]

        # if all values are blank, then ignore this line
        next if options[:remove_empty_hashes] && blank?(dataA)

        hash = Hash.zip(headerA,dataA)  # from Facets of Ruby library

        # delete the key/value pairs whose header was mapped to nil or to an empty key
        hash.delete(nil)
        hash.delete('')
        hash.delete(:"")

        if options[:remove_empty_values] == true
          if has_rails
            hash.delete_if{|k,v| v.blank?}
          else
            hash.delete_if{|k,v| blank?(v)}
          end
        end

        hash.delete_if{|k,v| ! v.nil? && v =~ /^(\d+|\d+\.\d+)$/ && v.to_f == 0} if options[:remove_zero_values]   # values are typically Strings!
        hash.delete_if{|k,v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]

        if options[:convert_values_to_numeric]
          hash.each do |k,v|
            # deal with the :only / :except options to :convert_values_to_numeric
            next if SmarterCSV.only_or_except_limit_execution( options, :convert_values_to_numeric , k )

            # convert if it's a numeric value:
            case v
            when /^[+-]?\d+\.\d+$/
              hash[k] = v.to_f
            when /^[+-]?\d+$/
              hash[k] = v.to_i
            end
          end
        end

        if options[:value_converters]
          hash.each do |k,v|
            converter = options[:value_converters][k]
            next unless converter
            hash[k] = converter.convert(v)
          end
        end

        next if options[:remove_empty_hashes] && hash.empty?

        if use_chunks
          chunk << hash  # append temp result to chunk

          # hand the chunk over once it is full or the input is exhausted;
          # until then it simply keeps filling up
          if chunk.size >= chunk_size || fh.eof?
            if block_given?
              yield chunk  # do something with the hashes in the chunk in the block
            else
              result << chunk  # not sure yet, why anybody would want to do this without a block
            end
            chunk_count += 1
            chunk = []  # initialize for next chunk of data
          end
        else # no chunk handling
          if block_given?
            yield [hash]  # do something with the hash in the block (better to use chunking here)
          else
            result << hash
          end
        end
      end

      # print new line to retain last processing line message
      print "\n" if options[:verbose]

      # last chunk: it may hold partial data when the final file lines were skipped
      # (comments, blank rows) after the last row was appended
      if ! chunk.nil? && chunk.size > 0
        if block_given?
          yield chunk  # do something with the hashes in the chunk in the block
        else
          result << chunk  # not sure yet, why anybody would want to do this without a block
        end
        chunk_count += 1
        chunk = []  # initialize for next chunk of data
      end
    ensure
      fh.close if fh.respond_to?(:close)
    end
    if block_given?
      return chunk_count  # when we do processing through a block we only care how many chunks we processed
    else
      return result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
    end
  end

  private

  # Reads the next record from +filehandle+ (delimited by options[:row_sep]) and
  # bumps both the file-line and the csv-line counter. The counters are left
  # untouched when +readline+ raises.
  def self.readline_with_counts(filehandle, options)
    filehandle.readline(options[:row_sep]).tap do
      @file_line_count += 1
      @csv_line_count += 1
    end
  end

  # Returns a fresh Hash with the default value of every supported option.
  # SmarterCSV.process merges the caller's options over this Hash.
  def self.default_options
    {
      auto_row_sep_chars: 500,
      chunk_size: nil ,
      col_sep: ',',
      comment_regexp: nil, # was: /\A#/,
      convert_values_to_numeric: true,
      downcase_header: true,
      duplicate_header_suffix: nil,
      file_encoding: 'utf-8',
      force_simple_split: false ,
      force_utf8: false,
      headers_in_file: true,
      invalid_byte_sequence: '',
      keep_original_headers: false,
      key_mapping_hash: nil , # NOTE(review): process_headers reads options[:key_mapping], not this key
      quote_char: '"',
      remove_empty_hashes: true ,
      remove_empty_values: true,
      remove_unmapped_keys: false,
      remove_values_matching: nil,
      remove_zero_values: false,
      required_headers: nil,
      # $/ is Ruby's built-in input record separator. Its alias
      # $INPUT_RECORD_SEPARATOR only exists after `require 'English'` and is nil
      # otherwise, which would make the default row separator nil.
      row_sep: $/,
      skip_lines: nil,
      strings_as_keys: false,
      strip_chars_from_headers: nil,
      strip_whitespace: true,
      user_provided_headers: nil,
      value_converters: nil,
      verbose: false,
    }
  end

  # True when +value+ carries no content: every element of an Array, every value
  # of a Hash, or the object itself must be blank according to elem_blank?.
  # Empty Arrays and Hashes count as blank.
  def self.blank?(value)
    if value.is_a?(Array)
      value.all? { |element| elem_blank?(element) }
    elsif value.is_a?(Hash)
      value.all? { |_key, element| elem_blank?(element) }
    else
      elem_blank?(value)
    end
  end

  # True for nil and for Strings without any non-whitespace character;
  # false for every other object.
  def self.elem_blank?(value)
    return true if value.nil?
    return false unless value.is_a?(String)

    value !~ /\S/
  end

  # Tells the caller whether +key+ has to be skipped for the feature configured
  # under options[option_name].
  #
  # Only a Hash value can restrict anything: with :except the listed keys are
  # skipped, otherwise with :only every key that is not listed is skipped.
  # When both are present, :except wins and :only is ignored.
  #
  # Returns true when processing of +key+ should be skipped, false otherwise.
  def self.only_or_except_limit_execution( options, option_name, key )
    limits = options[option_name]
    return false unless limits.is_a?(Hash)

    if limits.has_key?(:except)
      Array(limits[:except]).include?(key)
    elsif limits.has_key?(:only)
      !Array(limits[:only]).include?(key)
    else
      false
    end
  end

  # Guesses the column separator by counting each candidate (',', tab, ';', ':',
  # '|') in up to the first five records of +filehandle+ and picking the most
  # frequent one; on a tie the candidate listed first wins.
  #
  # The file handle is rewound afterwards.
  #
  # Raises SmarterCSV::NoColSepDetected when no candidate occurs at all, which
  # includes empty input.
  def self.guess_column_separator(filehandle, options)
    candidates = [',', "\t", ';', ':', '|']
    counts = Hash.new(0)
    5.times do
      line = filehandle.readline(options[:row_sep])
      candidates.each do |sep|
        counts[sep] += line.scan(sep).count
      end
    rescue EOFError # short files
      break
    end
    filehandle.rewind

    # counts stays empty when not a single line could be read, so max is nil then
    best = counts.values.max
    raise SmarterCSV::NoColSepDetected if best.nil? || best == 0

    counts.key(best)
  end

  # Guesses the row separator by counting "\n", "\r" and "\r\n" outside of quoted
  # sections and returning the most frequent one ("\n" when nothing was found).
  #
  # Reading stops after options[:auto_row_sep_chars] unquoted characters when that
  # option is a positive number; otherwise the whole input is scanned.
  # The file handle is rewound afterwards.
  def self.guess_line_ending( filehandle, options )
    tally = {"\n" => 0 , "\r" => 0, "\r\n" => 0}
    quote = options[:quote_char]
    limit = options[:auto_row_sep_chars]
    inside_quotes = false
    previous = nil
    scanned = 0

    filehandle.each_char do |char|
      inside_quotes = !inside_quotes if char == quote
      next if inside_quotes # line endings inside quotes are data, not separators

      if previous == "\r"
        # a "\r" can only be classified once the character after it is known
        tally[char == "\n" ? "\r\n" : "\r"] +=  1
      elsif char == "\n"
        tally["\n"] += 1
      end
      previous = char
      scanned += 1
      break if limit && limit > 0 && scanned >= limit
    end
    filehandle.rewind

    # a "\r" seen last has not been counted by the loop yet
    tally["\r"] += 1 if previous == "\r"

    # the most frequent one is it
    tally.max_by{|_, count| count}.first
  end

  # Determines the hash keys used for every data row.
  #
  # filehandle  - the input; its next record is consumed as header line when
  #               options[:headers_in_file] is set.
  # options     - the merged option Hash of SmarterCSV.process.
  # csv_options - keyword options handed to CSV.parse for quoted header lines.
  #
  # Returns [headers, header_size]. header_size is taken before key mapping and is
  # used by the caller to limit the split of data lines.
  #
  # Raises IncorrectOption, HeaderSizeMismatch, KeyMappingError, DuplicateHeaders
  # or MissingHeaders (all under SmarterCSV) when the respective check fails.
  def self.process_headers(filehandle, options, csv_options)
    if options[:headers_in_file]        # extract the header line
      # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
      header = readline_with_counts(filehandle, options)

      header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
      header = header.sub(options[:comment_regexp],'') if options[:comment_regexp]
      header = header.chomp(options[:row_sep])

      header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]

      # escaped, so that a quote character which is a regex meta character matches literally
      quote_regexp = /#{Regexp.escape(options[:quote_char].to_s)}/

      if (header =~ quote_regexp) && ! options[:force_simple_split]
        file_headerA = begin
          CSV.parse( header, **csv_options ).flatten.collect!{|x| x.nil? ? '' : x} # to deal with nil values from CSV.parse
        rescue CSV::MalformedCSVError => e
          raise e, "#{e} [SmarterCSV: csv line #{@csv_line_count}]", e.backtrace
        end
      else
        file_headerA =  header.split(options[:col_sep])
      end
      file_header_size = file_headerA.size # before mapping, which could delete keys

      file_headerA.map!{|x| x.gsub(quote_regexp,'') }
      file_headerA.map!{|x| x.strip}  if options[:strip_whitespace]
      unless options[:keep_original_headers]
        file_headerA.map!{|x| x.gsub(/\s+|-+/,'_')}
        file_headerA.map!{|x| x.downcase }   if options[:downcase_header]
      end
    else
      raise SmarterCSV::IncorrectOption , "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
    end

    if options[:user_provided_headers] && options[:user_provided_headers].class == Array && ! options[:user_provided_headers].empty?
      # use user-provided headers; work on a copy, because the keys are converted
      # in place further down and the caller's Array must stay untouched
      headerA = options[:user_provided_headers].dup
      # file_header_size is nil when no header line was read from the file
      if ! file_header_size.nil? && headerA.size != file_header_size
        raise SmarterCSV::HeaderSizeMismatch , "ERROR: :user_provided_headers defines #{headerA.size} headers !=  CSV-file has #{file_header_size} headers"
      end
    else
      headerA = file_headerA
    end

    # detect duplicate headers and disambiguate
    headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
    header_size = headerA.size # used for splitting lines

    headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]

    unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
      key_mappingH = options[:key_mapping]

      # do some key mapping on the keys in the file header
      #   if you want to completely delete a key, then map it to nil or to ''
      if ! key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
        # we can't map keys that are not there
        missing_keys = key_mappingH.keys - headerA
        raise(SmarterCSV::KeyMappingError, "missing header(s): #{missing_keys.join(",")}") unless missing_keys.empty?

        headerA.map! do |x|
          if key_mappingH.has_key?(x)
            key_mappingH[x]
          else
            options[:remove_unmapped_keys] ? nil : x
          end
        end
      end
    end

    # header validations: count every header once, then report each occurrence of
    # a header that shows up more than once (nil headers are deleted keys and ignored)
    occurrences = Hash.new(0)
    headerA.each{|k| occurrences[k] += 1 }
    duplicate_headers = headerA.compact.select{|k| occurrences[k] > 1 }
    raise SmarterCSV::DuplicateHeaders , "ERROR: duplicate headers: #{duplicate_headers.join(',')}" unless duplicate_headers.empty?

    if options[:required_headers] && options[:required_headers].is_a?(Array)
      missing_headers = options[:required_headers].reject{|k| headerA.include?(k) }
      raise SmarterCSV::MissingHeaders , "ERROR: missing headers: #{missing_headers.join(',')}" unless missing_headers.empty?
    end

    [headerA, header_size]
  end

  # Makes repeated headers unique: the first occurrence of a header is kept as is,
  # every later occurrence becomes "<header><duplicate_header_suffix><n>", where n
  # is the running number of that occurrence (2, 3, ...).
  #
  # Returns a new Array; +headers+ itself is not modified.
  def self.process_duplicate_headers(headers, options)
    seen = Hash.new(0)
    headers.map do |header|
      occurrence = (seen[header] += 1)
      if occurrence == 1
        header
      else
        [header, options[:duplicate_header_suffix], occurrence].join
      end
    end
  end
end