lib/csvlint/validate.rb in csvlint-1.0.0 vs lib/csvlint/validate.rb in csvlint-1.1.0

- old
+ new

@@ -1,23 +1,22 @@ module Csvlint - class Validator class LineCSV < CSV - ENCODE_RE = Hash.new do |h,str| + ENCODE_RE = Hash.new do |h, str| h[str] = Regexp.new(str) end - ENCODE_STR = Hash.new do |h,encoding_name| - h[encoding_name] = Hash.new do |h,chunks| - h[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('') + ENCODE_STR = Hash.new do |h, encoding_name| + h[encoding_name] = Hash.new do |h, chunks| + h[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join("") end end - ESCAPE_RE = Hash.new do |h,re_chars| - h[re_chars] = Hash.new do |h,re_esc| - h[re_esc] = Hash.new do |h,str| - h[str] = str.gsub(re_chars) {|c| re_esc + c} + ESCAPE_RE = Hash.new do |h, re_chars| + h[re_chars] = Hash.new do |h, re_esc| + h[re_esc] = Hash.new do |h, str| + h[str] = str.gsub(re_chars) { |c| re_esc + c } end end end # Optimization: Memoize `encode_re`. @@ -36,11 +35,11 @@ # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265 def escape_re(str) ESCAPE_RE[@re_chars][@re_esc][str] end - if RUBY_VERSION < '2.5' + if RUBY_VERSION < "2.5" # Optimization: Disable the CSV library's converters feature. # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100 def init_converters(options, field_name = :converters) @converters = [] @header_converters = [] @@ -53,15 +52,15 @@ include Csvlint::ErrorCollector attr_reader :encoding, :content_type, :extension, :headers, :link_headers, :dialect, :csv_header, :schema, :data, :current_line ERROR_MATCHERS = { - "Missing or stray quote" => :stray_quote, - "Illegal quoting" => :whitespace, - "Unclosed quoted field" => :unclosed_quote, - "Any value after quoted field isn't allowed" => :unclosed_quote, - "Unquoted fields do not allow \\r or \\n" => :line_breaks, + "Missing or stray quote" => :stray_quote, + "Illegal quoting" => :whitespace, + "Unclosed quoted field" => :unclosed_quote, + "Any value after quoted field isn't allowed" => :unclosed_quote, + "Unquoted fields do not allow \\r or \\n" => :line_breaks } def initialize(source, dialect = {}, schema = nil, options = {}) reset @source = source @@ -88,18 +87,18 @@ validate end def validate - if @extension =~ /.xls(x)?/ + if /.xls(x)?/.match?(@extension) build_warnings(:excel, :context) return end locate_schema unless @schema.instance_of?(Csvlint::Schema) set_dialect - if @source.class == String + if @source.instance_of?(String) validate_url else validate_metadata validate_stream end @@ -118,11 +117,15 @@ def validate_url @current_line = 1 request = Typhoeus::Request.new(@source, followlocation: true) request.on_headers do |response| @headers = response.headers || {} - @content_type = response.headers["content-type"] rescue nil + @content_type = begin + response.headers["content-type"] + rescue + nil + end @response_code = response.code return build_errors(:not_found) if response.code == 404 validate_metadata end request.on_body do |chunk| @@ -146,30 +149,30 @@ if line.count(@dialect["quoteChar"]).odd? @leading = line else validate_line(line, @current_line) @leading = "" - @current_line = @current_line+1 + @current_line += 1 end else # If it's not a full line, then prepare to add it to the beginning of the next chunk @leading = line end rescue ArgumentError => ae build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding - @current_line = @current_line+1 + @current_line += 1 @reported_invalid_encoding = true end def validate_line(input = nil, index = nil) @input = input single_col = false line = index.present? ? index : 0 @encoding = input.encoding.to_s report_line_breaks(line) parse_contents(input, line) - @lambda.call(self) unless @lambda.nil? + @lambda&.call(self) rescue ArgumentError => ae build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding @reported_invalid_encoding = true end @@ -202,12 +205,12 @@ if @schema @schema.validate_row(row, current_line, all_errors, @source, @validate) @errors += @schema.errors all_errors += @schema.errors @warnings += @schema.warnings - else - build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) if !row.empty? && row.size != @expected_columns + elsif !row.empty? && row.size != @expected_columns + build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s) end end end @data << row end @@ -226,32 +229,48 @@ end def validate_metadata assumed_header = !@supplied_dialect unless @headers.empty? - if @headers["content-type"] =~ /text\/csv/ - @csv_header = @csv_header && true + if /text\/csv/.match?(@headers["content-type"]) + @csv_header &&= true assumed_header = @assumed_header.present? end if @headers["content-type"] =~ /header=(present|absent)/ @csv_header = true if $1 == "present" @csv_header = false if $1 == "absent" assumed_header = false end - build_warnings(:no_content_type, :context) if @content_type == nil - build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/) + build_warnings(:no_content_type, :context) if @content_type.nil? + build_errors(:wrong_content_type, :context) unless @content_type && @content_type =~ /text\/csv/ end @header_processed = true build_info_messages(:assumed_header, :structure) if assumed_header - @link_headers = @headers["link"].split(",") rescue nil - @link_headers.each do |link_header| + @link_headers = begin + @headers["link"].split(",") + rescue + nil + end + @link_headers&.each do |link_header| match = LINK_HEADER_REGEXP.match(link_header) - uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil - rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil + uri = begin + match["uri"].gsub(/(^<|>$)/, "") + rescue + nil + end + rel = begin + match["rel-relationship"].gsub(/(^"|"$)/, "") + rescue + nil + end param = match["param"] - param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil + param_value = begin + match["param-value"].gsub(/(^"|"$)/, "") + rescue + nil + end if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value) begin url = URI.join(@source_url, uri) schema = Schema.load_from_uri(url) if schema.instance_of? Csvlint::Csvw::TableGroup @@ -263,18 +282,18 @@ end end rescue OpenURI::HTTPError end end - end if @link_headers + end end def header? @csv_header && @dialect["header"] end - def report_line_breaks(line_no=nil) + def report_line_breaks(line_no = nil) return unless @input[-1, 1].include?("\n") # Return straight away if there's no newline character - i.e. we're on the last line line_break = get_line_break(@input) @line_breaks << line_break unless line_breaks_reported? if line_break != "\r\n" @@ -296,28 +315,28 @@ schema_dialect = @schema.tables[@source_url].dialect || {} rescue schema_dialect = {} end @dialect = { - "header" => true, - "headerRowCount" => 1, - "delimiter" => ",", - "skipInitialSpace" => true, - "lineTerminator" => :auto, - "quoteChar" => '"', - "trim" => :true + "header" => true, + "headerRowCount" => 1, + "delimiter" => ",", + "skipInitialSpace" => true, + "lineTerminator" => :auto, + "quoteChar" => '"', + "trim" => :true }.merge(schema_dialect).merge(@dialect || {}) - @csv_header = @csv_header && @dialect["header"] + @csv_header &&= @dialect["header"] @csv_options = dialect_to_csv_options(@dialect) end def validate_encoding if @headers["content-type"] - if @headers["content-type"] !~ /charset=/ + if !/charset=/.match?(@headers["content-type"]) build_warnings(:no_encoding, :context) - elsif @headers["content-type"] !~ /charset=utf-8/i + elsif !/charset=utf-8/i.match?(@headers["content-type"]) build_warnings(:encoding, :context) end end build_warnings(:encoding, :context) if @encoding != "UTF-8" end @@ -337,14 +356,14 @@ def row_count data.count end def build_exception_messages(csvException, errChars, lineNo) - #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers - #TODO 2 - using .kind_of? is a very ugly fix here and it meant to work around instances where :auto symbol is preserved in @csv_options + # TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers + # TODO 2 - using .kind_of? is a very ugly fix here and it meant to work around instances where :auto symbol is preserved in @csv_options type = fetch_error(csvException) - if !@csv_options[:row_sep].kind_of?(Symbol) && [:unclosed_quote,:stray_quote].include?(type) && !@input.match(@csv_options[:row_sep]) + if !@csv_options[:row_sep].is_a?(Symbol) && [:unclosed_quote, :stray_quote].include?(type) && !@input.match(@csv_options[:row_sep]) build_linebreak_error else build_errors(type, :structure, lineNo, nil, errChars) end end @@ -353,70 +372,74 @@ build_errors(:line_breaks, :structure) unless @errors.any? { |e| e.type == :line_breaks } end def validate_header(header) names = Set.new - header.map{|h| h.strip! } if @dialect["trim"] == :true - header.each_with_index do |name,i| - build_warnings(:empty_column_name, :schema, nil, i+1) if name == "" + header.map { |h| h.strip! } if @dialect["trim"] == :true + header.each_with_index do |name, i| + build_warnings(:empty_column_name, :schema, nil, i + 1) if name == "" if names.include?(name) - build_warnings(:duplicate_column_name, :schema, nil, i+1) + build_warnings(:duplicate_column_name, :schema, nil, i + 1) else names << name end end if @schema @schema.validate_header(header, @source, @validate) @errors += @schema.errors @warnings += @schema.warnings end - return valid? + valid? end def fetch_error(error) e = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i) - message = e[1] rescue nil + message = begin + e[1] + rescue + nil + end ERROR_MATCHERS.fetch(message, :unknown_error) end def dialect_to_csv_options(dialect) skipinitialspace = dialect["skipInitialSpace"] || true delimiter = dialect["delimiter"] - delimiter = delimiter + " " if !skipinitialspace - return { - :col_sep => delimiter, - :row_sep => dialect["lineTerminator"], - :quote_char => dialect["quoteChar"], - :skip_blanks => false + delimiter += " " if !skipinitialspace + { + col_sep: delimiter, + row_sep: dialect["lineTerminator"], + quote_char: dialect["quoteChar"], + skip_blanks: false } end def build_formats(row) row.each_with_index do |col, i| next if col.nil? || col.empty? @formats[i] ||= Hash.new(0) format = - if col.strip[FORMATS[:numeric]] - :numeric - elsif uri?(col) - :uri - elsif possible_date?(col) - date_formats(col) - else - :string - end + if col.strip[FORMATS[:numeric]] + :numeric + elsif uri?(col) + :uri + elsif possible_date?(col) + date_formats(col) + else + :string + end @formats[i][format] += 1 end end def check_consistency - @formats.each_with_index do |format,i| + @formats.each_with_index do |format, i| if format total = format.values.reduce(:+).to_f - if format.none?{|_,count| count / total >= 0.9} + if format.none? { |_, count| count / total >= 0.9 } build_warnings(:inconsistent_values, :schema, nil, i + 1) end end end end @@ -428,121 +451,117 @@ @warnings += @schema.warnings end end def locate_schema - @source_url = nil warn_if_unsuccessful = false case @source - when StringIO - return - when File - uri_parser = URI::Parser.new - @source_url = "file:#{uri_parser.escape(File.expand_path(@source))}" - else - @source_url = @source + when StringIO + return + when File + uri_parser = URI::DEFAULT_PARSER + @source_url = "file:#{uri_parser.escape(File.expand_path(@source))}" + else + @source_url = @source end unless @schema.nil? if @schema.tables[@source_url] return else @schema = nil end end paths = [] - if @source_url =~ /^http(s)?/ + if /^http(s)?/.match?(@source_url) begin well_known_uri = URI.join(@source_url, "/.well-known/csvm") paths = URI.open(well_known_uri.to_s).read.split("\n") rescue OpenURI::HTTPError, URI::BadURIError end end paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty? paths.each do |template| - begin - template = URITemplate.new(template) - path = template.expand('url' => @source_url) - url = URI.join(@source_url, path) - url = File.new(url.to_s.sub(/^file:/, "")) if url.to_s =~ /^file:/ - schema = Schema.load_from_uri(url) - if schema.instance_of? Csvlint::Csvw::TableGroup - if schema.tables[@source_url] - @schema = schema - return - else - warn_if_unsuccessful = true - build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) - end + template = URITemplate.new(template) + path = template.expand("url" => @source_url) + url = URI.join(@source_url, path) + url = File.new(url.to_s.sub(/^file:/, "")) if /^file:/.match?(url.to_s) + schema = Schema.load_from_uri(url) + if schema.instance_of? Csvlint::Csvw::TableGroup + if schema.tables[@source_url] + @schema = schema + return + else + warn_if_unsuccessful = true + build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) end - rescue Errno::ENOENT - rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError - rescue => e - raise e end + rescue Errno::ENOENT + rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError + rescue => e + raise e end build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) if warn_if_unsuccessful @schema = nil end private def parse_extension(source) - case source - when File - return File.extname( source.path ) - when IO - return "" - when StringIO - return "" - when Tempfile - # this is triggered when the revalidate dialect use case happens - return "" - else - begin - parsed = URI.parse(source) - File.extname(parsed.path) - rescue URI::InvalidURIError - return "" - end + when File + File.extname(source.path) + when IO + "" + when StringIO + "" + when Tempfile + # this is triggered when the revalidate dialect use case happens + "" + else + begin + parsed = URI.parse(source) + File.extname(parsed.path) + rescue URI::InvalidURIError + "" + end end end def uri?(value) if value.strip[FORMATS[:uri]] uri = URI.parse(value) - uri.kind_of?(URI::HTTP) || uri.kind_of?(URI::HTTPS) + uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS) end rescue URI::InvalidURIError false end def possible_date?(col) col[POSSIBLE_DATE_REGEXP] end def date_formats(col) - if col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d') + if col[FORMATS[:date_db]] && date_format?(Date, col, "%Y-%m-%d") :date_db - elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b') + elsif col[FORMATS[:date_short]] && date_format?(Date, col, "%e %b") :date_short - elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y') + elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, "%e %b %Y") :date_rfc822 - elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y') + elsif col[FORMATS[:date_long]] && date_format?(Date, col, "%B %e, %Y") :date_long - elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M') + elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, "%H:%M") :dateTime_time - elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S') + elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, "%H:%M:%S") :dateTime_hms - elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S') + elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, "%Y-%m-%d %H:%M:%S") :dateTime_db - elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ') + elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, "%Y-%m-%dT%H:%M:%SZ") :dateTime_iso8601 - elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M') + elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, "%d %b %H:%M") :dateTime_short - elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M') + elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, "%B %d, %Y %H:%M") :dateTime_long else :string end end @@ -565,36 +584,35 @@ "\n" end end FORMATS = { - :string => nil, - :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/, - :uri => /\Ahttps?:/, - :date_db => /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01" - :date_long => /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345" - :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345" - :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan" - :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00" - :dateTime_hms => /\A\d\d:\d\d:\d\d\z/, # "00:00:00" - :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z" - :dateTime_long => /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00" - :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00" - :dateTime_time => /\A\d\d:\d\d\z/, # "00:00" + string: nil, + numeric: /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/, + uri: /\Ahttps?:/, + date_db: /\A\d{4,}-\d\d-\d\d\z/, # "12345-01-01" + date_long: /\A(?:#{Date::MONTHNAMES.join('|')}) [ \d]\d, \d{4,}\z/, # "January 1, 12345" + date_rfc822: /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d{4,}\z/, # " 1 Jan 12345" + date_short: /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.join('|')})\z/, # "1 Jan" + dateTime_db: /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/, # "12345-01-01 00:00:00" + dateTime_hms: /\A\d\d:\d\d:\d\d\z/, # "00:00:00" + dateTime_iso8601: /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/, # "12345-01-01T00:00:00Z" + dateTime_long: /\A(?:#{Date::MONTHNAMES.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/, # "January 01, 12345 00:00" + dateTime_short: /\A\d\d (?:#{Date::ABBR_MONTHNAMES.join('|')}) \d\d:\d\d\z/, # "01 Jan 00:00" + dateTime_time: /\A\d\d:\d\d\z/ # "00:00" }.freeze URI_REGEXP = /(?<uri>.*?)/ - TOKEN_REGEXP = /([^\(\)\<\>@,;:\\"\/\[\]\?=\{\} \t]+)/ + TOKEN_REGEXP = /([^()<>@,;:\\"\/\[\]?={} \t]+)/ QUOTED_STRING_REGEXP = /("[^"]*")/ - SGML_NAME_REGEXP = /([A-Za-z][-A-Za-z0-9\.]*)/ + SGML_NAME_REGEXP = /([A-Za-z][-A-Za-z0-9.]*)/ RELATIONSHIP_REGEXP = Regexp.new("(?<relationship>#{SGML_NAME_REGEXP}|(\"#{SGML_NAME_REGEXP}(\\s+#{SGML_NAME_REGEXP})*\"))") REL_REGEXP = Regexp.new("(?<rel>\\s*rel\\s*=\\s*(?<rel-relationship>#{RELATIONSHIP_REGEXP}))") REV_REGEXP = Regexp.new("(?<rev>\\s*rev\\s*=\\s*#{RELATIONSHIP_REGEXP})") TITLE_REGEXP = Regexp.new("(?<title>\\s*title\\s*=\\s*#{QUOTED_STRING_REGEXP})") ANCHOR_REGEXP = Regexp.new("(?<anchor>\\s*anchor\\s*=\\s*\\<#{URI_REGEXP}\\>)") LINK_EXTENSION_REGEXP = Regexp.new("(?<link-extension>(?<param>#{TOKEN_REGEXP})(\\s*=\\s*(?<param-value>#{TOKEN_REGEXP}|#{QUOTED_STRING_REGEXP}))?)") LINK_PARAM_REGEXP = Regexp.new("(#{REL_REGEXP}|#{REV_REGEXP}|#{TITLE_REGEXP}|#{ANCHOR_REGEXP}|#{LINK_EXTENSION_REGEXP})") - LINK_HEADER_REGEXP = Regexp.new("\<#{URI_REGEXP}\>(\\s*;\\s*#{LINK_PARAM_REGEXP})*") - POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join('|')}#{Date::MONTHNAMES.join('|')})") - + LINK_HEADER_REGEXP = Regexp.new("<#{URI_REGEXP}>(\\s*;\\s*#{LINK_PARAM_REGEXP})*") + POSSIBLE_DATE_REGEXP = Regexp.new("\\A(\\d|\\s\\d#{Date::ABBR_MONTHNAMES.join("|")}#{Date::MONTHNAMES.join("|")})") end end