require 'find' require 'open3' require 'digest/md5' require 'forwardable' require 'sequenceserver/sequence' # Define Database class. module SequenceServer # Captures a directory containing FASTA files and BLAST databases. # # Formatting a FASTA for use with BLAST+ will create 3 or 6 files, # collectively referred to as a BLAST database. # # It is important that formatted BLAST database files have the same dirname # and basename as the source FASTA for SequenceServer to be able to tell # formatted FASTA from unformatted. And that FASTA files be formatted with # `parse_seqids` option of `makeblastdb` for sequence retrieval to work. # # SequenceServer will always place BLAST database files alongside input FASTA, # and use `parse_seqids` option of `makeblastdb` to format databases. Database = Struct.new(:name, :title, :type, :nsequences, :ncharacters, :updated_on) do def initialize(*args) args[2].downcase! # database type args.each(&:freeze) super @id = Digest::MD5.hexdigest args.first end attr_reader :id def include?(accession) out = `blastdbcmd -entry '#{accession}' -db #{name} 2> /dev/null` !out.empty? end def to_s "#{type}: #{title} #{name}" end end # Model Database's eigenclass as a collection of Database objects. class Database class << self include Enumerable extend Forwardable def_delegators SequenceServer, :config, :logger def collection @collection ||= {} end private :collection def <<(database) collection[database.id] = database end def [](ids) ids = Array ids collection.values_at(*ids) end def ids collection.keys end def all collection.values end def each(&block) all.each(&block) end def include?(path) collection.include? Digest::MD5.hexdigest path end def group_by(&block) all.group_by(&block) end # Intended to be used only for testing. def first all.first end # Intended to be used only for testing. def clear collection.clear end # Recurisvely scan `database_dir` for blast databases. # # rubocop:disable Metrics/AbcSize, Metrics/MethodLength def scan_databases_dir cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \ ' -list_outfmt "%f %t %p %n %l %d"' Open3.popen3(cmd) do |_, out, err| out = out.read err = err.read throw_scan_error(cmd, out, err, $CHILD_STATUS) out.each_line do |line| name = line.split(' ')[0] next if multipart_database_name?(name) begin self << Database.new(*line.split(' ')) rescue NoMethodError => e err << "BLAST Database error:\n#{e}\n#{line}" end end throw_scan_error(cmd, out, err, $CHILD_STATUS) end end # rubocop:enable Metrics/AbcSize, Metrics/MethodLength def throw_scan_error(cmd, out, err, child_status) errpat = /BLAST Database error/ if !child_status.success? || err.match(errpat) fail BLAST_DATABASE_ERROR.new(cmd, err) end fail NO_BLAST_DATABASE_FOUND, config[:database_dir] if out.empty? end # Recursively scan `database_dir` for un-formatted FASTA and format them # for use with BLAST+. def make_blast_databases unformatted_fastas.select do |file, sequence_type| make_blast_database(file, sequence_type) end end # Returns an Array of FASTA files that may require formatting, and the # type of sequence contained in each FASTA. # # > unformatted_fastas # => [['/foo/bar.fasta', :nulceotide], ...] def unformatted_fastas list = [] database_dir = config[:database_dir] Find.find database_dir do |file| next if File.directory? file next if Database.include? file next unless probably_fasta? file sequence_type = guess_sequence_type_in_fasta file if [:protein, :nucleotide].include?(sequence_type) list << [file, sequence_type] end end list end # Create BLAST database, given FASTA file and sequence type in FASTA file. def make_blast_database(file, type) return unless make_blast_database? file, type title = get_database_title(file) taxid = fetch_tax_id _make_blast_database(file, type, title, taxid) end def _make_blast_database(file, type, title, taxid, quiet = false) cmd = 'makeblastdb -parse_seqids -hash_index ' \ "-in #{file} -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \ " -taxid #{taxid}" cmd << ' &> /dev/null' if quiet system cmd end # Show file path and guessed sequence type to the user and obtain a y/n # response. # # Returns true if the user entered anything but 'n' or 'N'. def make_blast_database?(file, type) puts puts puts "FASTA file: #{file}" puts "FASTA type: #{type}" print 'Proceed? [y/n] (Default: y): ' response = STDIN.gets.to_s.strip !response.match(/n/i) end # Generate a title for the given database and show it to the user for # confirmation. # # Returns user input if any. Auto-generated title otherwise. def get_database_title(path) default = make_db_title(File.basename(path)) print "Enter a database title or will use '#{default}': " from_user = STDIN.gets.to_s.strip from_user.empty? && default || from_user end # Get taxid from the user. Returns user input or 0. # # Using 0 as taxid is equivalent to not setting taxid for the database # that will be created. def fetch_tax_id default = 0 print 'Enter taxid (optional): ' response_user = STDIN.gets.to_s.strip response_user.empty? && default || response_user end # Returns true if the database name appears to be a multi-part database # name. # # e.g. # /home/ben/pd.ben/sequenceserver/db/nr.00 => yes # /home/ben/pd.ben/sequenceserver/db/nr => no # /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes def multipart_database_name?(db_name) !(db_name.match(%r{.+/\S+\d{2}$}).nil?) end # Returns true if first character of the file is '>'. def probably_fasta?(file) File.read(file, 1) == '>' end # Suggests improved titles when generating database names from files # for improved apperance and readability in web interface. # For example: # Cobs1.4.proteins.fasta -> Cobs 1.4 proteins # S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl def make_db_title(db_name) db_name.gsub!('"', "'") # removes .fasta like extension names db_name.gsub!(File.extname(db_name), '') # replaces _ with ' ', db_name.gsub!(/(_)/, ' ') # replaces '.' with ' ' when no numbers are on either side, db_name.gsub!(/(\D)\.(?=\D)/, '\1 ') # preserves version numbers db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ') db_name end # Guess whether FASTA file contains protein or nucleotide sequences based # on first 32768 characters. # # NOTE: 2^15 == 32786. Approximately 546 lines, assuming 60 characters on # each line. def guess_sequence_type_in_fasta(file) sequences = sample_sequences(file) sequence_types = sequences.map { |seq| Sequence.guess_type(seq) } sequence_types = sequence_types.uniq.compact (sequence_types.length == 1) && sequence_types.first end # Read first 32768 characters of the file. Split on fasta def line # pattern and return. # # If the given file is FASTA, returns Array of as many different # sequences in the portion of the file read. Returns the portion # of the file read wrapped in an Array otherwise. def sample_sequences(file) File.read(file, 32_768).split(/^>.+$/).delete_if(&:empty?) end end end end