Sha256: c276dde74ba12475bd141a17d3e55eb2640f3a89307fa2ce94275e3f790ec437

Contents?: true

Size: 1.78 KB

Versions: 3

Compression:

Stored size: 1.78 KB

Contents

#!/usr/bin/env ruby

require 'csv'
begin
  require 'rchardet'
rescue LoadError
  $stderr.puts 'gem install rchardet'
  exit 1
end

# Tells whether the bytes of +str+ form a valid UTF-8 sequence.
#
# The string is re-tagged as UTF-8 in place (its bytes are not transcoded),
# so the caller's object reports UTF-8 as its encoding afterwards.
#
# @param str [String] the string to inspect (mutated: encoding tag only)
# @return [Boolean] true when the bytes are well-formed UTF-8
def utf8?(str)
  retagged = str.force_encoding(Encoding::UTF_8)
  retagged.valid_encoding?
end

# Transcodes +str+ to UTF-8, interpreting its bytes as +current_encoding+.
#
# The string is re-tagged with +current_encoding+ in place before being
# transcoded; the UTF-8 result is a new string.
#
# Returns nil whenever the conversion is not possible, so callers can report
# an unknown encoding instead of crashing:
# * +current_encoding+ is nil (nothing was detected),
# * +current_encoding+ is a name Ruby does not know,
# * the bytes are not valid in +current_encoding+,
# * Ruby has no converter to UTF-8 or a character cannot be mapped.
#
# @param str [String] the raw string (mutated: encoding tag only)
# @param current_encoding [String, Encoding, nil] the presumed source encoding
# @return [String, nil] the UTF-8 encoded copy, or nil when not convertible
def convert_to_utf8(str, current_encoding)
  return nil if current_encoding.nil?

  str.force_encoding(current_encoding)
  return nil unless str.valid_encoding?

  str.encode('utf-8')
rescue ArgumentError, EncodingError
  # ArgumentError: unknown encoding name passed to force_encoding.
  # EncodingError: missing converter or unmappable/invalid characters.
  nil
end

# Asks CharDet for its guess at the encoding of +col+.
#
# @param col [String] the raw cell value
# @return the value stored under 'encoding' in CharDet's detection result
def detect_encoding(col)
  detection = CharDet.detect(col)
  detection['encoding']
end

# Removes a leading UTF-8 byte order mark (bytes EF BB BF) from +col+ in place.
#
# Only a BOM at the very start of the string is removed; the same byte
# sequence occurring later in the value is left untouched. The check is done
# on raw bytes, so it works whatever encoding the string is tagged with, and
# the string keeps its encoding tag.
#
# @param col [String, nil] the cell to clean (mutated); nil is ignored
# @return [String, nil] +col+ when a BOM was removed, nil otherwise
def strip_bom!(col)
  return nil if col.nil?

  bom = "\xEF\xBB\xBF".b
  return nil unless col.b.start_with?(bom)

  # byteslice keeps col's encoding tag while dropping exactly the BOM bytes.
  col.replace(col.byteslice(bom.bytesize..-1))
end

# Script entry point: csv-validator FILE [ID_COLUMN]
#
# Reads FILE as a CSV in binary mode, treats the first row as headers and
# checks every cell of the remaining rows for bytes that are not valid UTF-8.
# Findings are reported on stderr; cells that could be converted are echoed
# on stdout together with the detected source encoding.
#
# When ID_COLUMN names one of the headers, every converted cell is also
# written to 'utf8-correctsion.csv' (file name kept as-is, existing users may
# rely on it) along with the row's value in that column.
if ARGV[0].nil?
  $stderr.puts "usage: #{File.basename($PROGRAM_NAME)} FILE [ID_COLUMN]"
  exit 1
end

csv = CSV.open(ARGV[0], 'rb')
out = nil

begin
  id_column_name = ARGV[1]

  headers = csv.shift
  if headers.nil?
    $stderr.puts 'no header row found'
    exit 1
  end
  # The first header cell may be nil when the first column is unnamed.
  strip_bom!(headers[0]) if headers[0]

  id_column_num = nil
  if id_column_name
    id_column_num = headers.index(id_column_name)
    unless id_column_num
      $stderr.puts("header #{id_column_name} not found in current set of headers")
      exit 1
    end
  end

  if id_column_num
    out = CSV.open('utf8-correctsion.csv', 'wb')
    out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
  end

  # Counts CSV rows (the header row is row 1), not physical lines.
  csv_lineno = 1

  while (row = csv.shift)
    csv_lineno += 1

    unless row.size == headers.size
      $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
    end

    row.each_with_index do |col, idx|
      next if col.nil? || utf8?(col)

      $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""

      # Detect once so the reported encoding is the one used for conversion.
      encoding = detect_encoding(col)
      if (col_utf8_encoded = convert_to_utf8(col, encoding))
        puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{encoding} \"#{col_utf8_encoded}\""
        # The corrections file only exists when an ID column was requested.
        out << [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded] if out
      else
        $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
      end
    end
  end
ensure
  # Runs on normal completion, on exit and on errors alike.
  csv.close
  out.close if out
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
csv-utils-0.3.8 bin/csv-validator
csv-utils-0.3.7 bin/csv-validator
csv-utils-0.3.6 bin/csv-validator