require "plugin" require "make_blast_db" ######################################################## # Author: Almudena Bocinos Rioboo # # Defines the main methods that are necessary to execute Pluginclassify # Inherit: Plugin ######################################################## class PluginUserContaminants < Plugin MAX_TARGETS_SEQS=4 #MAXIMUM NUMBER OF DIFFERENT ALIGNED SEQUENCES TO KEEP FROM BLAST DATABASE def near_to_extrem(c,seq,min_cont_size) max_to_extreme=(min_cont_size/2).to_i return ((c.q_beg-max_to_extreme<0) || (( c.q_end+max_to_extreme)>=seq.seq_fasta.size-1) ) #return if vector is very near to the extremes of insert) end def sum_hits_by_id(hits) res={} hits.each do |c| hit_size=c.q_end - c.q_beg + 1 res[c.definition] = (res[c.definition]||0)+hit_size end puts res.to_json return res end #Begins the plugin1's execution to warn that there are classify in the sequence "seq" def execute(seqs) blasts= do_blasts(seqs) seqs.each_with_index do |s,i| exec_seq(s,blasts.querys[i]) end end def do_blasts(seqs) # TODO - Culling limit = 2 porque el blast falla con este comando cuando se le pasa cl=1 y dust=no # y una secuencia de baja complejidad como entrada blast = BatchBlast.new("-db #{@params.get_param('user_contaminant_db')}",'blastn'," -task blastn -evalue #{@params.get_param('blast_evalue_classify')} -perc_identity #{@params.get_param('blast_percent_classify')} -culling_limit 1") #get classify -max_target_seqs #{MAX_TARGETS_SEQS} $LOG.debug('BLAST:'+blast.get_blast_cmd(:xml)) fastas=[] seqs.each do |seq| fastas.push ">"+seq.seq_name fastas.push seq.seq_fasta end blast_table_results = blast.do_blast(fastas,:xml) return blast_table_results end def exec_seq(seq,blast_query) if blast_query.query_id != seq.seq_name # raise "Blast and seq names does not match, blast:#{blast_query.query_id} sn:#{seq.seq_name}" end $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: looking for classify into the sequence" type = "ActionClassify" classify={} # classify_ids=[] classify=sum_hits_by_id(blast_query.hits) actions=[] classify_size=0 min_cont_size=@params.get_param('min_classify_hit_size').to_i biggest_classify = classify.sort {|c1,c2| c1[1]<=>c2[1]} if !biggest_classify.empty? definition,classify_size = biggest_classify.last a = seq.new_action(-1,-1,type) # adds the correspondent action to the sequence a.message = definition a.tag_id = definition.gsub(' ','_') # a.found_definition = c.definition # save the classify definitions, each separately #save to this file seq.add_file_tag(1, a.tag_id, :file) actions.push a add_stats('classify_size',classify_size) add_stats('classify_ids',definition) seq.add_actions(actions) end end #Returns an array with the errors due to parameters are missing def self.check_params(params) errors=[] comment='Blast E-value used as cut-off when searching for contaminations' default_value = 1e-10 params.check_param(errors,'blast_evalue_classify','Float',default_value,comment) comment='Minimum required identity (%) for a reliable classify' default_value = 85 params.check_param(errors,'blast_percent_classify','Integer',default_value,comment) comment='Minimum hit size (nt) for considering to classify' default_value = 30 # era 40 params.check_param(errors,'min_classify_hit_size','Integer',default_value,comment) comment='Path for classify database' default_value = File.join($FORMATTED_DB_PATH,'classify.fasta') params.check_param(errors,'user_contaminant_db','DB',default_value,comment) return errors end end