lib/csvlint/validate.rb in csvlint-0.1.4 vs lib/csvlint/validate.rb in csvlint-0.2.0

- old
+ new

@@ -2,68 +2,68 @@ class Validator include Csvlint::ErrorCollector - attr_reader :encoding, :content_type, :extension, :headers, :line_breaks, :dialect, :csv_header, :schema, :data + attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :line_breaks, :dialect, :csv_header, :schema, :data ERROR_MATCHERS = { "Missing or stray quote" => :stray_quote, "Illegal quoting" => :whitespace, "Unclosed quoted field" => :unclosed_quote, "Unquoted fields do not allow \\r or \\n" => :line_breaks, } def initialize(source, dialect = nil, schema = nil, options = {}) - + reset @source = source @formats = [] @schema = schema @supplied_dialect = dialect != nil - @dialect = { - "header" => true, - "delimiter" => ",", - "skipInitialSpace" => true, - "lineTerminator" => :auto, - "quoteChar" => '"' - }.merge(dialect || {}) - - @csv_header = @dialect["header"] @limit_lines = options[:limit_lines] - @csv_options = dialect_to_csv_options(@dialect) @extension = parse_extension(source) unless @source.nil? - reset - validate + @errors += @schema.errors unless @schema.nil? + @warnings += @schema.warnings unless @schema.nil? + validate(dialect) end - def validate + def validate(dialect = nil) single_col = false io = nil begin + if @extension =~ /.xls(x)?/ + build_warnings(:excel, :context) + return + end io = @source.respond_to?(:gets) ? @source : open(@source, :allow_redirections=>:all) validate_metadata(io) + locate_schema unless @schema.instance_of?(Csvlint::Schema) + set_dialect(dialect) parse_csv(io) sum = @col_counts.inject(:+) unless sum.nil? build_warnings(:title_row, :structure) if @col_counts.first < (sum / @col_counts.size.to_f) end build_warnings(:check_options, :structure) if @expected_columns == 1 check_consistency + check_foreign_keys rescue OpenURI::HTTPError, Errno::ENOENT - build_errors(:not_found) + build_errors(:not_found, nil, nil, nil, @source) ensure io.close if io && io.respond_to?(:close) end end def validate_metadata(io) + @csv_header = true @encoding = io.charset rescue nil @content_type = io.content_type rescue nil @headers = io.meta rescue nil + @link_headers = @headers["link"].split(",") rescue nil assumed_header = undeclared_header = !@supplied_dialect if @headers if @headers["content-type"] =~ /text\/csv/ @csv_header = true undeclared_header = false @@ -79,11 +79,10 @@ build_warnings(:no_encoding, :context) else build_warnings(:encoding, :context) if @encoding != "utf-8" end build_warnings(:no_content_type, :context) if @content_type == nil - build_warnings(:excel, :context) if @content_type == nil && @extension =~ /.xls(x)?/ build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/) if undeclared_header build_errors(:undeclared_header, :structure) assumed_header = false @@ -91,10 +90,29 @@ end build_info_messages(:assumed_header, :structure) if assumed_header end + def set_dialect(dialect) + begin + schema_dialect = @schema.tables[@source_url].dialect || {} + rescue + schema_dialect = {} + end + @dialect = { + "header" => true, + "delimiter" => ",", + "skipInitialSpace" => true, + "lineTerminator" => :auto, + "quoteChar" => '"', + "trim" => :true + }.merge(schema_dialect).merge(dialect || {}) + + @csv_header = @csv_header && @dialect["header"] + @csv_options = dialect_to_csv_options(@dialect) + end + # analyses the provided csv and builds errors, warnings and info messages def parse_csv(io) @expected_columns = 0 current_line = 0 reported_invalid_encoding = false @@ -132,11 +150,11 @@ @expected_columns = row.size unless @expected_columns != 0 build_errors(:blank_rows, :structure, current_line, nil, wrapper.line) if row.reject{ |c| c.nil? || c.empty? }.size == 0 # Builds errors and warnings related to the provided schema file if @schema - @schema.validate_row(row, current_line, all_errors) + @schema.validate_row(row, current_line, all_errors, @source) @errors += @schema.errors all_errors += @schema.errors @warnings += @schema.warnings else build_errors(:ragged_rows, :structure, current_line, nil, wrapper.line) if !row.empty? && row.size != @expected_columns @@ -161,20 +179,21 @@ end end def validate_header(header) names = Set.new + header.map{|h| h.strip! } if @dialect["trim"] == :true header.each_with_index do |name,i| build_warnings(:empty_column_name, :schema, nil, i+1) if name == "" if names.include?(name) build_warnings(:duplicate_column_name, :schema, nil, i+1) else names << name end end if @schema - @schema.validate_header(header) + @schema.validate_header(header, @source) @errors += @schema.errors @warnings += @schema.warnings end return valid? end @@ -247,10 +266,100 @@ end end end end + def check_foreign_keys + if @schema.instance_of? Csvlint::Csvw::TableGroup + @schema.validate_foreign_keys + @errors += @schema.errors + @warnings += @schema.warnings + end + end + + def locate_schema + @source_url = nil + warn_if_unsuccessful = false + case @source + when StringIO + return + when File + @source_url = "file:#{File.expand_path(@source)}" + else + @source_url = @source + end + unless @schema.nil? + if @schema.tables[@source_url] + return + else + @schema = nil + end + end + link_schema = nil + @link_headers.each do |link_header| + match = LINK_HEADER_REGEXP.match(link_header) + uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil + rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil + param = match["param"] + param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil + if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value) + begin + url = URI.join(@source_url, uri) + schema = Schema.load_from_json(url) + if schema.instance_of? Csvlint::Csvw::TableGroup + if schema.tables[@source_url] + link_schema = schema + else + warn_if_unsuccessful = true + build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) + end + end + rescue OpenURI::HTTPError + end + end + end if @link_headers + @schema = link_schema if link_schema + + paths = [] + if @source_url =~ /^http(s)?/ + begin + well_known_uri = URI.join(@source_url, "/.well-known/csvm") + well_known = open(well_known_uri).read + # TODO + rescue OpenURI::HTTPError + end + end + paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty? + paths.each do |template| + begin + template = URITemplate.new(template) + path = template.expand('url' => @source_url) + url = URI.join(@source_url, path) + url = File.new(url.to_s.sub(/^file:/, "")) if url.to_s =~ /^file:/ + schema = Schema.load_from_json(url) + if schema.instance_of? Csvlint::Csvw::TableGroup + if schema.tables[@source_url] + @schema = schema + else + warn_if_unsuccessful = true + build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) + end + end + rescue Errno::ENOENT + rescue OpenURI::HTTPError + rescue ArgumentError + rescue => e + STDERR.puts e.class + STDERR.puts e.message + STDERR.puts e.backtrace + raise e + end + end + build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) if warn_if_unsuccessful + @schema = nil + end + private def parse_extension(source) case source when File @@ -300,7 +409,21 @@ :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z" :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00" :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00" :dateTime_time => /\A\d\d:\d\d\z/, # "00:00" }.freeze + + URI_REGEXP = /(?<uri>.*?)/ + TOKEN_REGEXP = /([^\(\)\<\>@,;:\\"\/\[\]\?=\{\} \t]+)/ + QUOTED_STRING_REGEXP = /("[^"]*")/ + SGML_NAME_REGEXP = /([A-Za-z][-A-Za-z0-9\.]*)/ + RELATIONSHIP_REGEXP = Regexp.new("(?<relationship>#{SGML_NAME_REGEXP}|(\"#{SGML_NAME_REGEXP}(\\s+#{SGML_NAME_REGEXP})*\"))") + REL_REGEXP = Regexp.new("(?<rel>\\s*rel\\s*=\\s*(?<rel-relationship>#{RELATIONSHIP_REGEXP}))") + REV_REGEXP = Regexp.new("(?<rev>\\s*rev\\s*=\\s*#{RELATIONSHIP_REGEXP})") + TITLE_REGEXP = Regexp.new("(?<title>\\s*title\\s*=\\s*#{QUOTED_STRING_REGEXP})") + ANCHOR_REGEXP = Regexp.new("(?<anchor>\\s*anchor\\s*=\\s*\\<#{URI_REGEXP}\\>)") + LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)") + LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})") + LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*") + end end