require 'set' require 'sequenceserver/config' require 'sequenceserver/database' require 'sequenceserver/sequence' module SequenceServer # Doctor detects inconsistencies likely to cause problems with Sequenceserver # operation. class Doctor ERROR_PARSE_SEQIDS = 1 ERROR_NUMERIC_IDS = 2 ERROR_PROBLEMATIC_IDS = 3 AVOID_ID_REGEX = /^(?!gi|bbs)\w+\|\w*\|?/ class << self extend Forwardable def_delegators SequenceServer, :config # Returns an array of database objects in which each of the object has an # array of sequence_ids satisfying the block passed to the method. def inspect_seqids(seqids, &block) seqids.map do |sq| sq[:db] unless sq[:seqids].select(&block).empty? end.compact end # Retrieve sequence ids (specified by %i) from all databases. Using # accession number is problematic because of several reasons.[2] def all_sequence_ids(ignore) Database.map do |db| next if ignore.include? db out = `blastdbcmd -entry all -db #{db.name} -outfmt "%i" 2> /dev/null` { db: db, seqids: out.to_s.split } end.compact end # FASTA files formatted without -parse_seqids option won't support the # blastdbcmd command of fetching sequence ids using '%i' identifier. # In such cases, an array of 'N/A' values are returned which is # checked in this case. def inspect_parse_seqids(seqids) seqids.map do |sq| sq[:db] if sq[:seqids].include? 'N/A' end.compact end # Pretty print database list. def bullet_list(values) list = '' values.each do |value| list << " - #{value}\n" end list end # Print diagnostic error messages according to the type of error. # rubocop:disable Metrics/MethodLength def show_message(error, values) return if values.empty? case error when ERROR_PARSE_SEQIDS puts <<~MSG *** Doctor has found improperly formatted database: #{bullet_list(values)} Please reformat your databases with -parse_seqids switch (or use sequenceserver -m) for using SequenceServer as the current format may cause problems. These databases are ignored in further checks. MSG when ERROR_NUMERIC_IDS puts <<~MSG *** Doctor has found databases with numeric sequence ids: #{bullet_list(values)} Note that this may cause problems with sequence retrieval. MSG when ERROR_PROBLEMATIC_IDS puts <<~MSG *** Doctor has found databases with problematic sequence ids: #{bullet_list(values)} This causes some sequence to contain extraneous words like `gnl|` appended to their id string. MSG end end # rubocop:enable Metrics/MethodLength end def initialize @ignore = [] @all_seqids = Doctor.all_sequence_ids(@ignore) end attr_reader :invalids, :all_seqids def diagnose puts "\n1/3 Inspecting databases for proper -parse_seqids formatting.." check_parse_seqids remove_invalid_databases puts "\n2/3 Inspecting databases for numeric sequence ids.." check_numeric_ids puts "\n3/3 Inspecting databases for problematic sequence ids.." check_id_format end # Remove entried which are in ignore list or not formatted with # -parse_seqids option. def remove_invalid_databases @all_seqids.delete_if { |sq| @ignore.include? sq[:db] } end # Obtain files which aren't formatted with -parse_seqids and add them to # ignore list. def check_parse_seqids without_parse_seqids = Doctor.inspect_parse_seqids(@all_seqids) Doctor.show_message(ERROR_PARSE_SEQIDS, without_parse_seqids) @ignore.concat(without_parse_seqids) end # Check for the presence of numeric sequence ids within a database. def check_numeric_ids selector = proc { |id| !id.to_i.zero? } Doctor.show_message(ERROR_NUMERIC_IDS, Doctor.inspect_seqids(@all_seqids, &selector)) end # Warn users about sequence identifiers of format abc|def because then # BLAST+ appends a gnl (for general) infront of the database # identifiers. There are only two identifiers that we need to avoid # when searching for this format.[1] # bbs|number, gi|number # Note that while sequence ids could have been arbitrary, using # parse_seqids reduces our search space substantially. def check_id_format selector = proc { |id| id.match(AVOID_ID_REGEX) } Doctor.show_message(ERROR_PROBLEMATIC_IDS, Doctor.inspect_seqids(@all_seqids, &selector)) end end end # [1]: http://etutorials.org/Misc/blast/Part+IV+Industrial-Strength+BLAST/Chapter+11.+BLAST+Databases/11.1+FASTA+Files/ # [2]: For sequence deflines of the kind abc|def, accession number is returned # as abc:def. Even though you take hacky measure and ensure that latter is # queried, correct entries are not returned. Using the value returned by %i # works in all cases. This may change in later versions of BLAST+.