#!/usr/bin/env ruby
# frozen_string_literal: true

# Scans a CSV file for fields that are not valid UTF-8, guesses each such
# field's encoding with rchardet and re-encodes it as UTF-8.
#
# Usage: SCRIPT INPUT.csv [OUTPUT.csv]
#
# Problems are reported on stderr and successful conversions on stdout.
# When OUTPUT.csv is given it receives the header row followed by every row
# in which at least one field was converted.

require 'csv'

begin
  require 'rchardet'
rescue LoadError
  $stderr.puts 'gem install rchardet'
  exit 1
end

# Tells whether the bytes of +str+ form valid UTF-8.
#
# Side effect: +str+ itself is re-tagged as UTF-8 (its bytes are untouched).
#
# @param str [String] the string to check
# @return [Boolean] true when the bytes are a valid UTF-8 sequence
def utf8?(str)
  str.force_encoding('utf-8').valid_encoding?
end

# Re-encodes +str+ to UTF-8, interpreting its bytes as +current_encoding+.
#
# Works on a copy, so +str+ is left exactly as it was passed in.
#
# @param str [String] the raw field contents
# @param current_encoding [String, nil] encoding name the bytes are believed
#   to be in; nil means the encoding is unknown
# @return [String, nil] the UTF-8 string, or nil when the encoding is
#   missing, not known to Ruby, does not match the bytes, or cannot be
#   transcoded to UTF-8
def convert_to_utf8(str, current_encoding)
  return nil if current_encoding.nil?

  tagged = str.dup.force_encoding(current_encoding)
  return nil unless tagged.valid_encoding?

  tagged.encode('utf-8')
rescue ArgumentError, EncodingError
  # ArgumentError: Ruby does not know the encoding name.
  # EncodingError: the bytes cannot be transcoded to UTF-8.
  nil
end

# Asks rchardet for the most likely encoding of +col+.
#
# @param col [String] the raw field contents
# @return [String, nil] value stored under the 'encoding' key of the
#   detection result
def detect_encoding(col)
  CharDet.detect(col)['encoding']
end

# Converts every non-UTF-8 field of +row+ in place and logs what happened.
#
# @param row [Array<String, nil>] the fields of one CSV row; converted fields
#   are replaced by their UTF-8 version
# @param csv_lineno [Integer] 1-based row number used in log messages
# @return [Boolean] true when at least one field was converted
def convert_row(row, csv_lineno)
  converted = false

  row.each_with_index do |col, idx|
    # CSV yields nil for empty unquoted fields; there is nothing to check.
    next if col.nil? || utf8?(col)

    where = "row(#{csv_lineno}),col(#{idx + 1})"
    $stderr.puts "#{where}: none UTF-8 characters found in \"#{col}\""

    # Detect once; the result is needed for the conversion and the message.
    encoding = detect_encoding(col)
    if (col_utf8_encoded = convert_to_utf8(col, encoding))
      converted = true
      puts "#{where}: converted to UTF-8 from #{encoding} \"#{col_utf8_encoded}\""
      row[idx] = col_utf8_encoded
    else
      $stderr.puts "#{where}: unknown character encoding"
    end
  end

  converted
end

# Checks every data row read from +csv+ and writes converted rows to +out+.
#
# @param csv [CSV] reader positioned at the start of the input
# @param out [CSV, nil] optional writer for the header and converted rows
# @return [void]
def convert_rows(csv, out)
  headers = csv.shift
  return if headers.nil? # empty input: no header, hence no rows to compare

  out << headers if out

  # The header is row 1, so data rows are numbered from 2.
  csv.each.with_index(2) do |row, csv_lineno|
    unless row.size == headers.size
      $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
    end

    converted = convert_row(row, csv_lineno)

    # NOTE(review): rows that were already valid UTF-8 are not written, so
    # the output is not a full copy of the input - confirm this is intended.
    out << row if out && converted
  end
end

# Opens the input (and optional output) file, runs the conversion and makes
# sure both files are closed even when an error interrupts processing.
#
# @param input_path [String] path of the CSV file to check
# @param output_path [String, nil] path of the CSV file to write, if any
# @return [void]
def run(input_path, output_path)
  CSV.open(input_path, 'rb') do |csv|
    out = CSV.open(output_path, 'wb') if output_path
    begin
      convert_rows(csv, out)
    ensure
      out&.close
    end
  end
end

if ARGV.empty?
  $stderr.puts "usage: #{File.basename($PROGRAM_NAME)} INPUT.csv [OUTPUT.csv]"
  exit 1
end

run(ARGV[0], ARGV[1])