bin/csv-validator in csv-utils-0.3.3 vs bin/csv-validator in csv-utils-0.3.4
- old
+ new
@@ -23,38 +23,55 @@
# Returns the 'encoding' entry of the result CharDet.detect gives for +col+.
def detect_encoding(col)
CharDet.detect(col)['encoding']
end
+# Removes a leading UTF-8 byte-order mark (bytes EF BB BF) from +col+ in place.
+#
+# Only a BOM at the very start of the value is removed; the same byte sequence
+# appearing later in the value is left untouched. The comparison is done on
+# raw bytes, so it does not depend on the encoding the string is tagged with.
+#
+# @param col [String, nil] cell to clean; nil is ignored
+# @return [String, nil] +col+ when a BOM was removed, nil otherwise
+def strip_bom!(col)
+ return unless col && col.byteslice(0, 3).bytes == [0xEF, 0xBB, 0xBF]
+
+ # byteslice keeps col's encoding, so replace does not retag the string
+ col.replace(col.byteslice(3, col.bytesize - 3))
+end
+
+# Usage: csv-validator INPUT_CSV [ID_COLUMN_NAME]
+#
+# Reads INPUT_CSV in binary mode, reports rows whose column count differs from
+# the header row and cells that fail the utf8? check. When ID_COLUMN_NAME is
+# given, every cell that could be converted to UTF-8 is also written to
+# 'utf8-correctsion.csv' together with the row's value in that column.
csv = CSV.open(ARGV[0], 'rb')
-out = CSV.open(ARGV[1], 'wb') if ARGV[1]
+id_column_name = ARGV[1]
headers = csv.shift
-out << headers if out
+# csv.shift returns nil at end of input, so an empty file has no header row
+strip_bom!(headers[0]) if headers && headers[0]
+
+id_column_num = nil
+if id_column_name
+ unless headers&.include?(id_column_name)
+ $stderr.puts("header #{id_column_name} not found in current set of headers")
+ exit 1
+ end
+
+ id_column_num = headers.index(id_column_name)
+end
+
+out = nil
+if id_column_num
+ # NOTE(review): 'correctsion' looks like a typo for 'corrections'; kept
+ # unchanged because existing callers may rely on this file name.
+ out = CSV.open('utf8-correctsion.csv', 'wb')
+ out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
+end
+
csv_lineno = 1
while (row = csv.shift)
csv_lineno += 1
unless row.size == headers.size
$stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
end
- converted = false
row.each_with_index do |col, idx|
next if utf8?(col)
$stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""
if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
- converted = true
puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
- row[idx] = col_utf8_encoded
+ # out is only opened when an ID column was requested; without it the
+ # conversion is just reported on stdout
+ out << [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded] if out
else
$stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
end
end
-
- out << row if out && converted
end
csv.close
out.close if out