Sha256: 54a691b7ffeb9fe2dceb97df28dc031973f90d9b60dc1d4ac1d7e3f3d5019613

Contents?: true

Size: 1.31 KB

Versions: 1

Compression:

Stored size: 1.31 KB

Contents

#!/usr/bin/env ruby

require 'csv'
begin
  require 'rchardet'
rescue LoadError
  $stderr.puts 'gem install rchardet'
  exit 1
end

# Reports whether +str+ holds a valid UTF-8 byte sequence.
#
# Side effect: +str+ is re-tagged in place as UTF-8 via force_encoding (its
# bytes are not changed). The script body relies on this so that fields read
# in binary mode can be written back out alongside converted UTF-8 fields.
#
# A nil field (what CSV yields for an empty unquoted column) contains no
# bytes at all, so it is reported as valid instead of raising NoMethodError.
#
# @param str [String, nil] field value to check; must not be frozen
# @return [Boolean] true when +str+ is nil or valid UTF-8, false otherwise
def utf8?(str)
  return true if str.nil?

  str
    .force_encoding('utf-8')
    .valid_encoding?
end

# Transcodes +str+ from +current_encoding+ to UTF-8.
#
# Side effect: when +current_encoding+ names an encoding Ruby knows, +str+ is
# re-tagged in place with that encoding (its bytes are not changed).
#
# Every failure mode is reported as nil so the caller can treat it uniformly
# as "could not convert": no encoding given, an encoding name Ruby does not
# know, bytes that are invalid in the claimed encoding, or bytes that have no
# UTF-8 mapping.
#
# @param str [String] raw field bytes; must not be frozen
# @param current_encoding [String, Encoding, nil] encoding +str+ is believed to be in
# @return [String, nil] a new UTF-8 string, or nil when conversion is not possible
def convert_to_utf8(str, current_encoding)
  # An encoding detector may have no guess; force_encoding(nil) would raise TypeError.
  return nil if current_encoding.nil?

  str.force_encoding(current_encoding)
  return nil unless str.valid_encoding?

  str.encode('utf-8')
rescue ArgumentError, EncodingError
  # ArgumentError: unknown encoding name passed to force_encoding.
  # EncodingError: undefined conversion, invalid byte sequence, or no converter.
  nil
end

# Asks CharDet to guess the character encoding of +col+.
#
# @param col [String] raw field bytes
# @return [Object] whatever CharDet.detect stores under the 'encoding' key
def detect_encoding(col)
  detection = CharDet.detect(col)
  detection['encoding']
end

# Script body.
#
# Usage: csv-validator INPUT_CSV [OUTPUT_CSV]
#
# Reads INPUT_CSV in binary mode and checks every field for valid UTF-8.
# Problems are reported on stderr; successful conversions are reported on
# stdout. When OUTPUT_CSV is given, the header row and every row in which at
# least one field was converted are written to it.
if ARGV.empty?
  $stderr.puts "usage: #{File.basename($PROGRAM_NAME)} INPUT_CSV [OUTPUT_CSV]"
  exit 1
end

csv = nil
out = nil

begin
  csv = CSV.open(ARGV[0], 'rb')
  out = CSV.open(ARGV[1], 'wb') if ARGV[1]

  # The first row is treated as the header; it is nil when the input is empty.
  headers = csv.shift
  out << headers if out && headers
  # Counts parsed rows (header = 1), not physical lines of the file.
  csv_lineno = 1

  while (row = csv.shift)
    csv_lineno += 1

    unless row.size == headers.size
      $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
    end

    converted = false
    row.each_with_index do |col, idx|
      # CSV yields nil for empty unquoted fields; there is nothing to validate.
      next if col.nil? || utf8?(col)

      $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): none UTF-8 characters found in \"#{col}\""

      # Detect once and reuse the result for both the conversion and the report.
      encoding = detect_encoding(col)
      # No guess from the detector means there is nothing to convert from.
      col_utf8_encoded = encoding && convert_to_utf8(col, encoding)

      if col_utf8_encoded
        converted = true
        puts "row(#{csv_lineno}),col(#{idx + 1}): converted to UTF-8 from #{encoding} \"#{col_utf8_encoded}\""
        row[idx] = col_utf8_encoded
      else
        $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): unknown character encoding"
      end
    end

    # NOTE(review): only rows with at least one converted field are written, so
    # the output is not a full copy of the input - confirm this is intended.
    out << row if out && converted
  end
ensure
  # Release both handles even when parsing or writing raises part-way through.
  csv.close if csv
  out.close if out
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
csv-utils-0.3.2 bin/csv-validator