Sha256: c276dde74ba12475bd141a17d3e55eb2640f3a89307fa2ce94275e3f790ec437
Contents?: true
Size: 1.78 KB
Versions: 3
Compression:
Stored size: 1.78 KB
Contents
#!/usr/bin/env ruby
# frozen_string_literal: true

# csv-validator: reports CSV cells whose bytes are not valid UTF-8.
#
# Usage: csv-validator CSV_FILE [ID_COLUMN_NAME]
#
# The file is read in binary mode. Every data row is checked for the expected
# number of columns and every cell is checked for valid UTF-8; problems are
# reported on stderr. For each offending cell rchardet guesses the actual
# encoding and, when the guess yields a clean conversion, the converted value
# is printed on stdout. When ID_COLUMN_NAME is given, the converted values are
# additionally written to a corrections CSV in the current directory together
# with the value of that column for the affected row.

require 'csv'

begin
  require 'rchardet'
rescue LoadError
  $stderr.puts 'gem install rchardet'
  exit 1
end

# Bytes of the UTF-8 byte order mark, as a binary string.
UTF8_BOM = "\xEF\xBB\xBF".b.freeze

# Name of the corrections file. The spelling is kept exactly as it was,
# because the file name is part of the script's observable output.
CORRECTIONS_FILE = 'utf8-correctsion.csv'

# Tells whether the bytes of +str+ form valid UTF-8.
# Works on a copy so the caller's string keeps its encoding tag.
#
# @param str [String]
# @return [Boolean]
def utf8?(str)
  str.dup.force_encoding(Encoding::UTF_8).valid_encoding?
end

# Re-interprets the bytes of +str+ as +current_encoding+ and transcodes them
# to UTF-8. +str+ itself is left untouched.
#
# @param str [String] raw cell bytes
# @param current_encoding [String, Encoding, nil] encoding the bytes are
#   believed to be in
# @return [String, nil] the UTF-8 string, or nil when the encoding is missing,
#   unknown to Ruby, does not match the bytes, or cannot be transcoded
def convert_to_utf8(str, current_encoding)
  return nil unless current_encoding

  candidate = str.dup.force_encoding(current_encoding)
  return nil unless candidate.valid_encoding?

  candidate.encode(Encoding::UTF_8)
rescue ArgumentError, EncodingError
  # ArgumentError: encoding name Ruby does not know.
  # EncodingError: no converter available or a character has no UTF-8 mapping.
  nil
end

# Asks rchardet for the most likely encoding of +col+.
#
# @param col [String]
# @return [String, nil] value of the 'encoding' key of the detection result
def detect_encoding(col)
  CharDet.detect(col)['encoding']
end

# Removes a UTF-8 byte order mark from the very beginning of +col+, in place.
#
# @param col [String, nil]
# @return [String, nil] +col+ when a BOM was removed, nil otherwise
def strip_bom!(col)
  return nil unless col && col.b.start_with?(UTF8_BOM)

  col.replace(col.byteslice(UTF8_BOM.bytesize..-1))
end

# Finds the position of the header called +name+.
# Compares raw bytes, because headers read in binary mode and a command line
# argument carry different encoding tags.
#
# @param headers [Array<String, nil>]
# @param name [String]
# @return [Integer, nil]
def find_column(headers, name)
  wanted = name.b
  headers.index { |header| header && header.b == wanted }
end

# Checks every cell of one row, reports cells that are not valid UTF-8 and,
# when possible, their converted value.
#
# @param row [Array<String, nil>] cells of the current row
# @param csv_lineno [Integer] row number used in messages (header row is 1)
# @param headers [Array<String, nil>]
# @param id_column_num [Integer, nil] index of the id column, if requested
# @param corrections [CSV, nil] open corrections file, if requested
# @return [void]
def check_row(row, csv_lineno, headers, id_column_num, corrections)
  row.each_with_index do |col, idx|
    next if col.nil? || utf8?(col)

    # Messages are assembled from binary strings so that cells, headers and
    # converted values with different encoding tags can be combined safely;
    # the bytes written are the same.
    label = "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}".b
    $stderr.puts "#{label}: none UTF-8 characters found in \"#{col.b}\""

    encoding = detect_encoding(col)
    converted = convert_to_utf8(col, encoding)
    unless converted
      $stderr.puts "#{label}: unknown character encoding"
      next
    end

    puts "#{label}: converted to UTF-8 from #{encoding} \"#{converted.b}\""
    next unless corrections

    corrections << [row[id_column_num]&.b, csv_lineno, idx + 1, headers[idx]&.b, converted.b]
  end
end

# Validates the CSV file at +path+; see the usage notes at the top of the file.
#
# @param path [String] CSV file to check
# @param id_column_name [String, nil] header of the column identifying a row
# @return [void]
def validate_file(path, id_column_name)
  corrections = nil

  CSV.open(path, 'rb') do |csv|
    headers = csv.shift
    unless headers
      $stderr.puts "#{path}: no header row found"
      exit 1
    end
    strip_bom!(headers.first)

    id_column_num = nil
    if id_column_name
      id_column_num = find_column(headers, id_column_name)
      unless id_column_num
        $stderr.puts("header #{id_column_name} not found in current set of headers")
        exit 1
      end

      corrections = CSV.open(CORRECTIONS_FILE, 'wb')
      corrections << [id_column_name, 'Row', 'Col', 'Header', 'Value']
    end

    # The header row counts as row 1, so data rows start at 2.
    csv.each.with_index(2) do |row, csv_lineno|
      unless row.size == headers.size
        $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
      end

      check_row(row, csv_lineno, headers, id_column_num, corrections)
    end
  end
ensure
  corrections&.close
end

if ARGV.empty?
  $stderr.puts 'usage: csv-validator CSV_FILE [ID_COLUMN_NAME]'
  exit 1
end

validate_file(ARGV[0], ARGV[1])
Version data entries
3 entries across 3 versions & 1 rubygems
Version | Path |
---|---|
csv-utils-0.3.8 | bin/csv-validator |
csv-utils-0.3.7 | bin/csv-validator |
csv-utils-0.3.6 | bin/csv-validator |