bin/csv-validator in csv-utils-0.3.3 vs bin/csv-validator in csv-utils-0.3.4

- old
+ new

@@ -23,38 +23,55 @@
 
 def detect_encoding(col)
   CharDet.detect(col)['encoding']
 end
 
+def strip_bom!(col)
+  col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
+end
+
 csv = CSV.open(ARGV[0], 'rb')
-out = CSV.open(ARGV[1], 'wb') if ARGV[1]
+id_column_name = ARGV[1]
 
 headers = csv.shift
-out << headers if out
+strip_bom!(headers[0])
+
+id_column_num = nil
+if id_column_name
+  unless headers.include?(id_column_name)
+    $stderr.puts("header #{id_column_name} not found in current set of headers")
+    exit 1
+  end
+
+  id_column_num = headers.index(id_column_name)
+end
+
+out = nil
+if id_column_num
+  out = CSV.open('utf8-correctsion.csv', 'wb')
+  out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
+end
+
 csv_lineno = 1
 
 while (row = csv.shift)
   csv_lineno += 1
 
   unless row.size == headers.size
     $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
   end
 
-  converted = false
   row.each_with_index do |col, idx|
     next if utf8?(col)
 
     $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""
     if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
-      converted = true
       puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
-      row[idx] = col_utf8_encoded
+      out << [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded]
     else
       $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
     end
   end
-
-  out << row if out && converted
 end
 
 csv.close
 out.close if out