lib/csvlint/validate.rb in csvlint-0.1.0 vs lib/csvlint/validate.rb in csvlint-0.1.1

- old
+ new

@@ -1,23 +1,21 @@ -require "open_uri_redirections" - module Csvlint class Validator include Csvlint::ErrorCollector - include Csvlint::Types attr_reader :encoding, :content_type, :extension, :headers, :line_breaks, :dialect, :csv_header, :schema, :data ERROR_MATCHERS = { "Missing or stray quote" => :stray_quote, "Illegal quoting" => :whitespace, "Unclosed quoted field" => :unclosed_quote, + "Unquoted fields do not allow \\r or \\n" => :line_breaks, } - def initialize(source, dialect = nil, schema = nil) + def initialize(source, dialect = nil, schema = nil, options = {}) @source = source @formats = [] @schema = schema @supplied_dialect = dialect != nil @@ -29,11 +27,11 @@ "lineTerminator" => :auto, "quoteChar" => '"' }.merge(dialect || {}) @csv_header = @dialect["header"] - + @limit_lines = options[:limit_lines] @csv_options = dialect_to_csv_options(@dialect) @extension = parse_extension(source) reset validate end @@ -109,23 +107,26 @@ if @line_breaks != "\r\n" build_info_messages(:nonrfc_line_breaks, :structure) end row = nil loop do - current_line = current_line + 1 + current_line += 1 + if @limit_lines && current_line > @limit_lines + break + end begin wrapper.reset_line row = csv.shift @data << row if row if current_line == 1 && header? - row = row.reject {|r| r.blank? } + row = row.reject{|col| col.nil? || col.empty?} validate_header(row) @col_counts << row.size else - build_formats(row, current_line) - @col_counts << row.reject {|r| r.blank? }.size + build_formats(row) + @col_counts << row.reject{|col| col.nil? || col.empty?}.size @expected_columns = row.size unless @expected_columns != 0 build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if row.reject{ |c| c.nil? || c.empty? }.size == 0 if @schema @@ -148,11 +149,11 @@ build_errors(type, :structure, current_line, nil, wrapper.line) end end end rescue ArgumentError => ae - build_errors(:invalid_encoding, :structure, current_line, wrapper.line) unless reported_invalid_encoding + build_errors(:invalid_encoding, :structure, current_line, nil, wrapper.line) unless reported_invalid_encoding reported_invalid_encoding = true end end def validate_header(header) @@ -176,11 +177,11 @@ def header? @csv_header end def fetch_error(error) - e = error.message.match(/^([a-z ]+) (i|o)n line ([0-9]+)\.?$/i) + e = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i) message = e[1] rescue nil ERROR_MATCHERS.fetch(message, :unknown_error) end def dialect_to_csv_options(dialect) @@ -193,44 +194,56 @@ :quote_char => dialect["quoteChar"], :skip_blanks => false } end - def build_formats(row, line) + def build_formats(row) row.each_with_index do |col, i| - next if col.blank? - @formats[i] ||= [] - - SIMPLE_FORMATS.each do |type, lambda| - begin - if lambda.call(col) - @format = type - end - rescue ArgumentError, URI::InvalidURIError - end + next if col.nil? || col.empty? + @formats[i] ||= Hash.new(0) + + format = if col.strip[FORMATS[:numeric]] + :numeric + elsif uri?(col) + :uri + elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d') + :date_db + elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b') + :date_short + elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y') + :date_rfc822 + elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y') + :date_long + elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M') + :dateTime_time + elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S') + :dateTime_hms + elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S') + :dateTime_db + elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ') + :dateTime_iso8601 + elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M') + :dateTime_short + elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M') + :dateTime_long + else + :string end - @formats[i] << @format + @formats[i][format] += 1 end end def check_consistency - percentages = [] - - SIMPLE_FORMATS.keys.each do |type| - @formats.each_with_index do |format,i| - percentages[i] ||= {} - unless format.nil? - percentages[i][type] = format.count(type) / format.size.to_f + @formats.each_with_index do |format,i| + if format + total = format.values.reduce(:+).to_f + if format.none?{|_,count| count / total >= 0.9} + build_warnings(:inconsistent_values, :schema, nil, i + 1) end end end - - percentages.each_with_index do |col, i| - next if col.values.blank? - build_warnings(:inconsistent_values, :schema, nil, i+1) if col.values.max < 0.9 - end end private def parse_extension(source) @@ -246,8 +259,38 @@ else parsed = URI.parse(source) File.extname(parsed.path) end end - + + def uri?(value) + if value.strip[FORMATS[:uri]] + uri = URI.parse(value) + uri.kind_of?(URI::HTTP) || uri.kind_of?(URI::HTTPS) + end + rescue URI::InvalidURIError + false + end + + def date_format?(klass, value, format) + klass.strptime(value, format).strftime(format) == value + rescue ArgumentError # invalid date + false + end + + FORMATS = { + :string => nil, + :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/, + :uri => /\Ahttps?:/, + :date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01" + :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345" + :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345" + :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan" + :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00" + :dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00" + :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z" + :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00" + :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00" + :dateTime_time => /\A\d\d:\d\d\z/, # "00:00" + }.freeze end -end \ No newline at end of file +end