require "plugin" require "make_blast_db" ######################################################## # Author: Almudena Bocinos Rioboo # # Defines the main methods that are necessary to execute PluginContaminants # Inherit: Plugin ######################################################## class PluginContaminants < Plugin MAX_TARGETS_SEQS=4 #MAXIMUM NUMBER OF DIFFERENT ALIGNED SEQUENCES TO KEEP FROM BLAST DATABASE def near_to_extrem(c,seq,min_cont_size) max_to_extreme=(min_cont_size/2).to_i return ((c.q_beg-max_to_extreme<0) || (( c.q_end+max_to_extreme)>=seq.seq_fasta.size-1) ) #return if vector is very near to the extremes of insert) end #Begins the plugin1's execution to warn that there are contaminants in the sequence "seq" def execute(seqs) blasts= do_blasts(seqs) seqs.each_with_index do |s,i| exec_seq(s,blasts.querys[i]) end end def do_blasts(seqs) # find MIDS with less results than max_target_seqs value # blast = BatchBlast.new("-db #{@params.get_param('contaminants_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_contaminants')} -perc_identity #{@params.get_param('blast_percent_contaminants')} -culling_limit 1") #get contaminants -max_target_seqs #{MAX_TARGETS_SEQS} # TODO - Culling limit = 2 porque el blast falla con este comando cuando se le pasa cl=1 y dust=no # y una secuencia de baja complejidad como entrada blast = BatchBlast.new("-db #{@params.get_param('contaminants_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_contaminants')} -perc_identity #{@params.get_param('blast_percent_contaminants')} -culling_limit 1") #get contaminants -max_target_seqs #{MAX_TARGETS_SEQS} $LOG.info('BLAST:'+blast.get_blast_cmd(:xml)) fastas=[] seqs.each do |seq| fastas.push ">"+seq.seq_name fastas.push seq.seq_fasta end # fastas=fastas.join("\n") # $LOG.info('doing blast to:') # $LOG.info('-'*20) # $LOG.info(fastas) # $LOG.info('-'*20) blast_table_results = blast.do_blast(fastas,:xml) # $LOG.info(blast_table_results.inspect) return blast_table_results end # TODO - Contaminants databases grouped by folders # TODO - User can select a set of contaminants folders def exec_seq(seq,blast_query) if blast_query.query_id != seq.seq_name # raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}" end $LOG.info "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for contaminants into the sequence" #blast = BatchBlast.new('-db DB/formatted/contaminants.fasta','blastn',' -task blastn -evalue 1e-10 -perc_identity 95') #get contaminants # blast = BatchBlast.new("-db #{@params.get_param('contaminants_db')}",'blastn'," -task blastn-short -evalue #{@params.get_param('blast_evalue_contaminants')} -perc_identity #{@params.get_param('blast_percent_contaminants')} -culling_limit 1") #get contaminants -max_target_seqs #{MAX_TARGETS_SEQS} # blast_table_results = blast.do_blast(seq.seq_fasta,:xml) #rise seq to contaminants executing over blast #blast_table_results = BlastTableResult.new(res) type = "ActionIsContaminated" contaminants=[] contaminants_ids=[] # blast_table_results.querys.each do |query| #first round to save contaminants without overlap # contaminants_ids.push query.hits.definition if (not contaminants_ids.include?(query.hits.definition)) merge_hits(blast_query.hits,contaminants,contaminants_ids) # end begin contaminants2=contaminants contaminants = [] #second round to save contaminants without overlap merge_hits(contaminants2,contaminants) # DONE describir cada ID contaminante encontradomerge_hits(contaminants2,contaminants,ids_contaminants) end until (contaminants2.count == contaminants.count) actions=[] contaminants_size=0 # @stats[:contaminants_size]={} @stats['contaminants_size']={} @stats['rejected_seqs']={} min_cont_size=@params.get_param('min_contam_seq_presence').to_i contaminants.each do |c| contaminants_size=c.q_end - c.q_beg + 1 #if ( (@params.get_param('genus')!=c.subject_id.split('_')[1]) && valid_genus=@params.get_param('genus').empty? || !c.definition.upcase.index(@params.get_param('genus').upcase) if (valid_genus) && (contaminants_size>=min_cont_size) #( (min_cont_size<=contaminants_size) || (near_to_extrem(c,seq,min_cont_size)) ) ) if !seq.range_inside_action_type?(c.q_beg,c.q_end,ActionVectors) # puts "DIFFERENT SPECIE #{specie} ,#{hit.subject_id.split('_')[1].to_s}" a = seq.new_action(c.q_beg,c.q_end,type) # adds the correspondent action to the sequence a.message = c.definition a.found_definition = contaminants_ids # save the contaminants definitions, each separately actions.push a contaminants_size=c.q_end-c.q_beg+1 # if @stats[:contaminants_size][contaminants_size].nil? # @stats[:contaminants_size][contaminants_size] = 0 # end # # @stats[:contaminants_size][contaminants_size] += 1 add_stats('contaminants_size',contaminants_size) contaminants_ids.each do |c| add_stats('contaminants_ids',c) end end else $LOG.info('Contaminant ignored due to genus match: '+c.definition) end end reject=@params.get_param('contaminants_reject') # cond_if=false # cond_if=true if (not actions.empty? ) && (reject=='true') # # puts "Before check SEQ_REJECTED= TRUE (reject= .#{reject}. #{reject.class}&& not actions empty= #{not actions.empty?} ) == #{cond_if} >>> " if ((not actions.empty? ) && (reject=='true')) #reject sequence # puts "SEQ_REJECTED= TRUE >>> " seq.seq_rejected=true seq.seq_rejected_by_message='contaminated' # @stats[:rejected_seqs]={'rejected_seqs_by_contaminants' => 1} add_stats('rejected','contaminated') end seq.add_actions(actions) end #Returns an array with the errors due to parameters are missing def self.check_params(params) errors=[] comment='Blast E-value used as cut-off when searching for contaminations' default_value = 1e-10 params.check_param(errors,'blast_evalue_contaminants','Float',default_value,comment) comment='Minimum required identity (%) for a reliable contamination' default_value = 85 params.check_param(errors,'blast_percent_contaminants','Integer',default_value,comment) comment='Minimum hit size (nt) for considering a true contamination' default_value = 40 params.check_param(errors,'min_contam_seq_presence','Integer',default_value,comment) comment='Genus of input data: contaminations belonging to this genus will be ignored' default_value = '' params.check_param(errors,'genus','String',default_value,comment) comment='Is a contamination considered a source of sequence rejection? (setting to false will only trim contaminated sequences instead of rejecting the complete read)' default_value = 'true' params.check_param(errors,'contaminants_reject','String',default_value,comment) comment='Path for contaminants database' default_value = File.join($FORMATTED_DB_PATH,'contaminants.fasta') params.check_param(errors,'contaminants_db','DB',default_value,comment) return errors end end