module Csvlint

  # Validates a CSV source (path/URL string, File, IO, StringIO or Tempfile)
  # and records errors, warnings and info messages through the helpers mixed
  # in from Csvlint::ErrorCollector (build_errors, build_warnings,
  # build_info_messages, reset, valid? -- presumably all defined there; they
  # are not visible in this file).
  #
  # Validation runs eagerly from the constructor.
  class Validator

    include Csvlint::ErrorCollector

    attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :line_breaks, :dialect, :csv_header, :schema, :data

    # Maps the message text of a CSV::MalformedCSVError (with the trailing
    # "line N" part removed, see #fetch_error) to an error type symbol.
    ERROR_MATCHERS = {
      "Missing or stray quote" => :stray_quote,
      "Illegal quoting" => :whitespace,
      "Unclosed quoted field" => :unclosed_quote,
      "Unquoted fields do not allow \\r or \\n" => :line_breaks,
    }

    # @param source  the CSV to validate: anything responding to #gets, or a
    #                path/URL that Kernel#open can open
    # @param dialect [Hash, nil] dialect overrides, merged over the defaults
    #                in #set_dialect
    # @param schema  optional schema object; its existing errors and warnings
    #                are copied into this validator
    # @param options [Hash] only :limit_lines is read (maximum number of
    #                lines to parse)
    def initialize(source, dialect = nil, schema = nil, options = {})
      reset
      @source = source
      @formats = []
      @schema = schema

      @supplied_dialect = !dialect.nil?
      @limit_lines = options[:limit_lines]
      @extension = parse_extension(source) unless @source.nil?
      unless @schema.nil?
        @errors += @schema.errors
        @warnings += @schema.warnings
      end

      validate(dialect)
    end

    # Runs the full validation pipeline: metadata checks, schema discovery,
    # dialect setup, CSV parsing and the post-parse consistency checks.
    # A missing source (HTTP error or ENOENT) is reported as :not_found.
    def validate(dialect = nil)
      io = nil
      begin
        # The dot is escaped and the match anchored so that only extensions
        # that really start with ".xls" (.xls, .xlsx, ...) are flagged.
        if @extension =~ /\A\.xls/
          build_warnings(:excel, :context)
          return
        end
        io = @source.respond_to?(:gets) ? @source : open(@source, :allow_redirections=>:all)
        validate_metadata(io)
        locate_schema unless @schema.instance_of?(Csvlint::Schema)
        set_dialect(dialect)
        parse_csv(io)
        sum = @col_counts.inject(:+)
        unless sum.nil?
          # A first row with fewer filled cells than the average row is
          # reported as a probable title row.
          build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f)
        end
        build_warnings(:check_options, :structure) if @expected_columns == 1
        check_consistency
        check_foreign_keys
      rescue OpenURI::HTTPError, Errno::ENOENT
        build_errors(:not_found, nil, nil, nil, @source)
      ensure
        io.close if io && io.respond_to?(:close)
      end
    end

    # Reads transport metadata from +io+ (charset, content type, response
    # headers, Link headers) and reports problems with it. Each accessor is
    # wrapped in an inline rescue because plain files/IOs do not provide them.
    def validate_metadata(io)
      @csv_header = true
      @encoding = io.charset rescue nil
      @content_type = io.content_type rescue nil
      @headers = io.meta rescue nil
      @link_headers = @headers["link"].split(",") rescue nil

      # Without a caller-supplied dialect the header row is both "assumed"
      # and "undeclared" until the Content-Type says otherwise.
      assumed_header = undeclared_header = !@supplied_dialect

      if @headers
        content_type_header = @headers["content-type"]
        if content_type_header =~ /text\/csv/
          @csv_header = true
          undeclared_header = false
          assumed_header = true
        end
        if content_type_header =~ /header=(present|absent)/
          # The capture is guaranteed to be "present" or "absent".
          @csv_header = Regexp.last_match(1) == "present"
          undeclared_header = false
          assumed_header = false
        end
        if content_type_header !~ /charset=/
          build_warnings(:no_encoding, :context)
        else
          build_warnings(:encoding, :context) if @encoding != "utf-8"
        end
        build_warnings(:no_content_type, :context) if @content_type.nil?
        build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
        if undeclared_header
          build_errors(:undeclared_header, :structure)
          assumed_header = false
        end
      end

      build_info_messages(:assumed_header, :structure) if assumed_header
    end

    # Builds the effective dialect: defaults, overridden by the dialect of the
    # schema table for this source (if any), overridden by +dialect+.
    # Also derives @csv_header and the options passed to CSV.new.
    def set_dialect(dialect)
      schema_dialect =
        begin
          @schema.tables[@source_url].dialect || {}
        rescue
          # No schema, or no table for this source.
          {}
        end

      @dialect = {
        "header" => true,
        "delimiter" => ",",
        "skipInitialSpace" => true,
        "lineTerminator" => :auto,
        "quoteChar" => '"',
        "trim" => :true
      }.merge(schema_dialect).merge(dialect || {})

      @csv_header = @csv_header && @dialect["header"]
      @csv_options = dialect_to_csv_options(@dialect)
    end

    # analyses the provided csv and builds errors, warnings and info messages
    def parse_csv(io)
      @expected_columns = 0
      current_line = 0
      all_errors = []
      @col_counts = []
      @csv_options[:encoding] = @encoding

      begin
        wrapper = WrappedIO.new(io)
        # Options are splatted as keywords: on Ruby 3 a positional options
        # hash raises ArgumentError, which the rescue below would then
        # misreport as :invalid_encoding.
        csv = CSV.new(wrapper, **@csv_options)
        @data = []
        @line_breaks = csv.row_sep
        build_info_messages(:nonrfc_line_breaks, :structure) if @line_breaks != "\r\n"

        row = nil
        loop do
          current_line += 1
          break if @limit_lines && current_line > @limit_lines

          begin
            wrapper.reset_line
            row = csv.shift
            # Appended before the nil check, so @data ends with nil when the
            # input is exhausted.
            @data << row
            break unless row

            if current_line == 1 && header?
              row = row.reject { |col| col.nil? || col.empty? }
              validate_header(row)
              @col_counts << row.size
            else
              build_formats(row)
              filled = row.reject { |col| col.nil? || col.empty? }
              @col_counts << filled.size
              # The first data row fixes the expected column count.
              @expected_columns = row.size if @expected_columns == 0
              build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if filled.size == 0
              # Builds errors and warnings related to the provided schema file
              if @schema
                @schema.validate_row(row, current_line, all_errors, @source)
                @errors += @schema.errors
                all_errors += @schema.errors
                @warnings += @schema.warnings
              else
                build_errors(:ragged_rows, :structure, current_line, nil, wrapper.line) if !row.empty? && row.size != @expected_columns
              end
            end
          rescue CSV::MalformedCSVError => e
            type = fetch_error(e)
            # A stray quote on a line that does not contain the detected row
            # separator is reported as a line-break problem instead.
            if type == :stray_quote && !wrapper.line.match(csv.row_sep)
              build_errors(:line_breaks, :structure)
            else
              build_errors(type, :structure, current_line, nil, wrapper.line)
            end
          end
        end
      rescue ArgumentError
        # Raised while reading when the data is not valid in the declared
        # encoding; parsing stops at the current line.
        build_errors(:invalid_encoding, :structure, current_line, nil, wrapper.line)
      end
    end

    # Checks the header row for empty and duplicate column names and, when a
    # schema is present, validates the header against it.
    #
    # @param header [Array<String>] header cells; stripped in place when the
    #   dialect's "trim" is :true
    # @return result of valid?
    def validate_header(header)
      names = Set.new
      header.each(&:strip!) if @dialect["trim"] == :true
      header.each_with_index do |name, i|
        build_warnings(:empty_column_name, :schema, nil, i + 1) if name == ""
        if names.include?(name)
          build_warnings(:duplicate_column_name, :schema, nil, i + 1)
        else
          names << name
        end
      end
      if @schema
        @schema.validate_header(header, @source)
        @errors += @schema.errors
        @warnings += @schema.warnings
      end
      return valid?
    end

    # @return whether the first line is treated as a header row
    def header?
      @csv_header
    end

    # Translates a CSV::MalformedCSVError into an error type symbol by
    # removing the trailing "(in|on) line N" part of its message and looking
    # the remainder up in ERROR_MATCHERS.
    #
    # @return [Symbol] the mapped type, or :unknown_error
    def fetch_error(error)
      parsed = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
      message = parsed && parsed[1]
      ERROR_MATCHERS.fetch(message, :unknown_error)
    end

    # Converts a dialect hash into the option hash handed to CSV.new.
    def dialect_to_csv_options(dialect)
      # NOTE(review): `|| true` makes this always truthy, so a dialect with
      # "skipInitialSpace" => false is ignored and the delimiter adjustment
      # below never runs. Left unchanged because enabling it would alter
      # parsing; confirm the intended semantics before fixing.
      skipinitialspace = dialect["skipInitialSpace"] || true
      delimiter = dialect["delimiter"]
      delimiter = delimiter + " " if !skipinitialspace
      return {
        :col_sep => delimiter,
        :row_sep => dialect["lineTerminator"],
        :quote_char => dialect["quoteChar"],
        :skip_blanks => false
      }
    end

    # Tallies, per column index, which format each non-empty cell of +row+
    # looks like. The tallies in @formats are read by #check_consistency.
    def build_formats(row)
      row.each_with_index do |col, i|
        next if col.nil? || col.empty?
        @formats[i] ||= Hash.new(0)

        # First matching format wins; the regexp is a cheap pre-check before
        # the stricter strptime round trip in #date_format?.
        format =
          if col.strip[FORMATS[:numeric]]
            :numeric
          elsif uri?(col)
            :uri
          elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
            :date_db
          elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
            :date_short
          elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
            :date_rfc822
          elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
            :date_long
          elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
            :dateTime_time
          elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
            :dateTime_hms
          elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
            :dateTime_db
          elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
            :dateTime_iso8601
          elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
            :dateTime_short
          elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
            :dateTime_long
          else
            :string
          end

        @formats[i][format] += 1
      end
    end

    # Warns about every column in which no single format accounts for at
    # least 90% of the tallied cells.
    def check_consistency
      @formats.each_with_index do |format, i|
        # Columns that never held a non-empty value have no tally.
        next unless format
        total = format.values.reduce(:+).to_f
        if format.none? { |_, count| count / total >= 0.9 }
          build_warnings(:inconsistent_values, :schema, nil, i + 1)
        end
      end
    end

    # Runs foreign key validation when the schema is a CSVW table group and
    # copies the resulting errors and warnings.
    def check_foreign_keys
      if @schema.instance_of? Csvlint::Csvw::TableGroup
        @schema.validate_foreign_keys
        @errors += @schema.errors
        @warnings += @schema.warnings
      end
    end

    # Tries to find a CSVW metadata document describing this source and
    # stores it in @schema. Lookup order, as implemented here:
    #   1. an already supplied schema that has a table for the source,
    #   2. "describedby" Link headers of the response,
    #   3. the default locations "{+url}-metadata.json" and
    #      "csv-metadata.json" relative to the source.
    # A match from (3) takes precedence over one from (2), which is the order
    # in which the original assignments happened. @schema ends up nil when
    # nothing matches.
    def locate_schema
      @source_url = nil
      warn_if_unsuccessful = false
      case @source
      when StringIO
        return
      when File
        @source_url = "file:#{File.expand_path(@source)}"
      else
        @source_url = @source
      end

      unless @schema.nil?
        if @schema.tables[@source_url]
          return
        else
          @schema = nil
        end
      end

      link_schema = nil
      @link_headers.each do |link_header|
        match = LINK_HEADER_REGEXP.match(link_header)
        # A Link header that does not parse is skipped instead of raising
        # NoMethodError on nil.
        next unless match
        uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
        rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
        param = match["param"]
        param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
        if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
          begin
            url = URI.join(@source_url, uri)
            candidate = Schema.load_from_json(url)
            if candidate.instance_of? Csvlint::Csvw::TableGroup
              if candidate.tables[@source_url]
                link_schema = candidate
              else
                warn_if_unsuccessful = true
                build_warnings(:schema_mismatch, :context, nil, nil, @source_url, candidate)
              end
            end
          rescue OpenURI::HTTPError
          end
        end
      end if @link_headers
      @schema = link_schema if link_schema

      paths = []
      if @source_url =~ /^http(s)?/
        begin
          well_known_uri = URI.join(@source_url, "/.well-known/csvm")
          # The document is fetched but its content is not used yet.
          open(well_known_uri).read
          # TODO
        rescue OpenURI::HTTPError
        end
      end
      paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?

      paths.each do |template_string|
        begin
          template = URITemplate.new(template_string)
          path = template.expand('url' => @source_url)
          url = URI.join(@source_url, path)
          url = File.new(url.to_s.sub(/^file:/, "")) if url.to_s =~ /^file:/
          candidate = Schema.load_from_json(url)
          if candidate.instance_of? Csvlint::Csvw::TableGroup
            if candidate.tables[@source_url]
              # Stop at the first matching metadata document. Previously the
              # method fell through to an unconditional `@schema = nil`, so
              # every discovered schema was thrown away.
              @schema = candidate
              return
            else
              warn_if_unsuccessful = true
              build_warnings(:schema_mismatch, :context, nil, nil, @source_url, candidate)
            end
          end
        rescue Errno::ENOENT
        rescue OpenURI::HTTPError
        rescue ArgumentError
        rescue => e
          # Unexpected failure: dump diagnostics, then propagate.
          STDERR.puts e.class
          STDERR.puts e.message
          STDERR.puts e.backtrace
          raise e
        end
      end

      build_warnings(:schema_mismatch, :context, nil, nil, @source_url, @schema) if warn_if_unsuccessful
      # Fall back to the schema found through Link headers (nil if none).
      @schema = link_schema
    end

    private

    # Returns the file extension of +source+ ("" when it has none or cannot
    # be determined). Non-File IO-like sources never have an extension.
    def parse_extension(source)
      case source
      when File
        # Must be tested before IO, since File is a subclass of IO.
        return File.extname(source.path)
      when IO, StringIO
        return ""
      when Tempfile
        # this is triggered when the revalidate dialect use case happens
        return ""
      else
        begin
          parsed = URI.parse(source)
          File.extname(parsed.path)
        rescue URI::InvalidURIError
          return ""
        end
      end
    end

    # @return truthy when +value+ parses as an http(s) URI; false when it is
    #   not a valid URI; nil when it does not start with "http:"/"https:"
    def uri?(value)
      if value.strip[FORMATS[:uri]]
        uri = URI.parse(value)
        uri.kind_of?(URI::HTTP) || uri.kind_of?(URI::HTTPS)
      end
    rescue URI::InvalidURIError
      false
    end

    # @param klass  Date or Time (anything with .strptime)
    # @return [Boolean] true when +value+ parses with +format+ and formats
    #   back to exactly the same string
    def date_format?(klass, value, format)
      klass.strptime(value, format).strftime(format) == value
    rescue ArgumentError # invalid date
      false
    end

    # Cheap regexp pre-checks for the formats recognised by #build_formats.
    FORMATS = {
      :string => nil,
      :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
      :uri => /\Ahttps?:/,
      :date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01"
      :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345"
      :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345"
      :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan"
      :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00"
      :dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00"
      :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z"
      :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00"
      :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00"
      :dateTime_time => /\A\d\d:\d\d\z/, # "00:00"
    }.freeze

    # Building blocks for parsing HTTP Link headers. The named groups "uri",
    # "rel-relationship", "param" and "param-value" are read by
    # #locate_schema; the remaining group names are not referenced in this
    # file.
    URI_REGEXP = /(?<uri>.*?)/
    TOKEN_REGEXP = /([^\(\)\<\>@,;:\\"\/\[\]\?=\{\} \t]+)/
    QUOTED_STRING_REGEXP = /("[^"]*")/
    SGML_NAME_REGEXP = /([A-Za-z][-A-Za-z0-9\.]*)/
    RELATIONSHIP_REGEXP = Regexp.new("(?<relationship-type>#{SGML_NAME_REGEXP}|(\"#{SGML_NAME_REGEXP}(\\s+#{SGML_NAME_REGEXP})*\"))")
    REL_REGEXP = Regexp.new("(?<rel>\\s*rel\\s*=\\s*(?<rel-relationship>#{RELATIONSHIP_REGEXP}))")
    REV_REGEXP = Regexp.new("(?<rev>\\s*rev\\s*=\\s*#{RELATIONSHIP_REGEXP})")
    TITLE_REGEXP = Regexp.new("(?<title>\\s*title\\s*=\\s*#{QUOTED_STRING_REGEXP})")
    ANCHOR_REGEXP = Regexp.new("(?<anchor>\\s*anchor\\s*=\\s*\\<#{URI_REGEXP}\\>)")
    LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)")
    LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})")
    LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*")

  end
end