lib/seqtrimnext/plugins/plugin_low_quality.rb in seqtrimnext-2.0.51 vs lib/seqtrimnext/plugins/plugin_low_quality.rb in seqtrimnext-2.0.52

- old
+ new

@@ -10,249 +10,98 @@ ######################################################## class PluginLowQuality < Plugin - - def create_sum_window(qual,ini,index_window_end) - - # puts "--------index w #{index_window_end}" - sum=[] - i=ini - # puts "#{i} #{index_window_end}" - while (i<=index_window_end) # initialize sum - sum[i]=0 - i += 1 + + def next_low_qual_region(quals,from_pos,min_value,max_good_quals=2) + + rstart=nil + rend=nil + + i=from_pos + + good_q=0 + + # skip good values + while (i< quals.length) && (quals[i]>=min_value) + i +=1 + end + + # now we have found a bad quality, or end of sequence + if i < quals.length + rstart=i + len=0 + + # puts " - [#{rstart},#{len}]" + + # continue growing while region of lowqual until more than 2 bases of good qual are found + begin + q=quals[i] + + if q<min_value + len += 1 + # puts "BAD #{q}<#{min_value}" + len += good_q + good_q=0 + else + good_q+=1 + end + # puts "#{q} - q[#{rstart},#{rend}], #{good_q}" + + i+=1 + end while (i < quals.length) && (good_q <= max_good_quals) + + rend = rstart + len -1 + # puts "#{q} - q[#{rstart},#{rend}], #{good_q}" + end + + return [rstart,rend] + end + + # A region is valid if it starts in 0, ends in seq.length or is big enought + def valid_low_qual_region?(quals,rstart,rend,min_region_size) + # puts [rstart,rend,0,quals.length,(rend-rstart+1)].join(';') + # res =((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size)) + # if res + # puts "VALID" + # end + return ((rstart==0) || (rend==quals.length-1) || ((rend-rstart+1)>=min_region_size)) + end + + + def get_low_qual_regions(quals,min_value, min_region_size,max_good_quals=2) + + # the initial region is the whole array + left=0 + right=quals.length-1 + # puts quals.map{|e| ("%2d" % e.to_s)}.join(' ') + + # puts "[#{left},#{right}]" + + i = 0 + + from_pos=0 + regions =[] + + # get all new regions + begin + rstart, rend = next_low_qual_region(quals,from_pos,min_value,max_good_quals) + if !rstart.nil? + from_pos= rend+1 + + if valid_low_qual_region?(quals,rstart,rend,min_region_size) + regions << [rstart,rend] end - # puts " contenido de sum" + sum.join.to_s + " i index_window_end window #{i} #{index_window_end} #{@window}" - - i=ini - while (i<ini+@window) - - sum[ini] += qual[i] - i+=1 - end - - - i=ini+1 - - while (i<=index_window_end) - - sum[i]=sum[i-1]-qual[i-1]+qual[i+@window-1] - i+=1 - - end - - # puts '2____' + sum.join(',') + 'pos sum' + ini.to_s - - return sum - - end - - def find_bounds_high_quality(sum,ini,index_window_end) - - new_start = -1 - new_end = -1 - - # puts " ini #{ini} iwe #{index_window_end}" - # puts "ini #{ini} index_window_end #{index_window_end} sum[ini] #{sum[ini]} cut_off #{@cut_off} suma #{sum.size} " - if (ini>index_window_end) - temp_start= ini - # new_start, new_end = temp_start, index_window_end - new_end = index_window_end # para que no crea que no hay alta calidad, sino que hemos sobrepasado el indice final de la ventana - # new_start, new_end = index_window_end, index_window_end - end - # puts " temp_start #{temp_start}" if (ini>index_window_end) - temp_start=((ini<=index_window_end) && (sum[ini]>=@cut_off))? ini : -1 - - i=ini+1 - while (i<=index_window_end) - if (sum[i]>=@cut_off) - if (temp_start<0) - temp_start=i #just in! - # puts "just in ---- #{sum[i]}>= cut off #{@cut_off} pos #{temp_start}" - end - - else - # puts "sum #{sum[i]} < cut off " - if(temp_start>=0) #just out! - # puts "update #{sum[i]}< cut off #{@cut_off} pos #{i}.if #{i-1} - #{temp_start} > #{new_end} - #{new_start}" - if (((i-1-temp_start)>=(new_end-new_start))) - new_start,new_end=temp_start,i-1 - # puts "just out ---- new start,new_end = #{temp_start}, #{i-1} index_window_end = #{index_window_end}" - end - temp_start= -1 - end - end - i+=1 - - - end - # puts "4 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}" - - if (temp_start != -1) # finished while ok - # puts "4 #{index_window_end} - #{temp_start} > #{new_end} - #{new_start}" - if ((index_window_end- temp_start) >= (new_end-new_start)) #put the end of the window at the end of sequence - new_start, new_end = temp_start, index_window_end #-1 - end - end - - # puts "5 temp_start #{temp_start} new_start #{new_start} new-end #{new_end}" - - # puts " newstart #{new_start} newend #{new_end}" - - return new_start,new_end - - - end - - def cut_fine_bounds_short(qual,new_start,new_end) - - i=0 - # puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}" - while (i<@window) - if (qual[new_start+i]>=@low) - break - end - i+=1 - end - new_start +=i - # puts "#{new_start} ***********" - - i=@window -1 - while (i>=0) - if (qual[new_end+i]>=@low) - break - end - i-=1 - end - new_end += i - # puts "6a new_start #{new_start} new-end #{new_end}" - - # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2 short" - return new_start, new_end - - end - - - # cuts fine the high quality bounds - def cut_fine_bounds(qual,new_start,new_end) - # puts " ççççççççççççççç #{new_start+@window} >= #{new_end} " - # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1" - # cut it fine - - one_ok = 0 - - i=@window-1 - # puts " qual[new_start+i] new_start #{new_start} i #{i} = #{new_start+i} qual.size #{qual.size}" - while (i>=0) - if (qual[new_start+i] < @low) - break if one_ok - else - one_ok = 1 - end - i-=1 - end - new_start += i+1 - oneOk = 0 - i=0 - while (i<@window) - if (qual[new_end+i] < @low) - break if oneOk - else - oneOk = 1 - end - i+=1 - end - new_end += i-1 - # puts "6b new_start #{new_start} new-end #{new_end}" - - # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2" - return new_start, new_end - end + end while !rstart.nil? + + return regions + + end - def find_high_quality(qual,ini=0) - - # puts qual.class.to_s + qual.size.to_s + 'size,' + @window.to_s + ' window, '+ qual.join(',') + 'size' + qual.size.to_s - - update=false - # if @window>qual.length-ini #search in the last window although has a low size - # @window=qual.length-ini - # # puts ' UPDATE WINDOW Y CUT OFF ' + @window.to_s - # @cut_off=@window*@low - # update=true - # end - - if (ini==0 or update) - #index_window_start = ini - @index_window_end = qual.size- @window #don't sub 1, or will lost the last nucleotide of the sequence -1; - #TODO En seqtrim de Juan iwe, que en nuestro seqtrim se llama index_window_end, está perdiendo 2 nucleótidos de la última ventana calculada - - - @sum = create_sum_window(qual,ini,@index_window_end) - # puts "SUMA #{@sum.join(' ')}" - end - - new_start, new_end = find_bounds_high_quality(@sum,ini,@index_window_end) - # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o1" - - if (new_start>=0) - if (new_start+@window >= new_end) - # puts "cfs" - new_start, new_end = cut_fine_bounds_short(qual,new_start,new_end) - # puts "cfs" - - else - # puts "cf" - new_start, new_end = cut_fine_bounds(qual,new_start,new_end) - # puts "cf" - end - end - - # puts " #{new_start} #{new_end} .o.o.o.o.o.o.o.o2" - - return new_start,new_end #+1 - - - end - - - def add_action_before_high_qual(p_begin,p_end,actions,seq,start) - - action_size = p_begin-1 - if action_size>=(@window/2) - - - # puts "action_SIZE1 #{action_size} > #{@window/2}" - - if ( (p_begin>0) && (action_size>0) ) #if there is action before the high qual part - # it's created an action before of the high quality part - a = seq.new_action(start ,p_begin-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid - # puts " new low qual start: #{start} = #{a.start_pos} end: #{p_begin} -1 = #{a.end_pos}" - actions.push a - end - end - end - - def add_action_after_high_qual(p_begin,p_end,actions,seq) - - action_size = seq.insert_end-p_end - if action_size>=(@window/2) - - - # puts "action_SIZE2 #{action_size} > #{@window/2}" - - if ((p_end<seq.seq_fasta.size-1) && (action_size>0) ) #if there is action before the high qual part - # it's created an action before of the high quality part - a = seq.new_action(p_end-seq.insert_start+1,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid - - actions.push a - end - end - end - + ###################################################################### @@ -264,104 +113,46 @@ # Creates the qualities windows from the sequence, looks for the subsequence with high quality # and mark, with an action, the before part to the High Quality Subsequence like a low quality part # Finally mark, with an action, the after part to the High Quality Subsequence like a low quality part #----------------------------------------------------------------- - def execute(seqs) - seqs.each do |s| - exec_seq(s) - end - end + def exec_seq(seq,blast_query) - - def exec_seq(seq) - if ((self.class.to_s=='PluginLowQuality') && seq.seq_qual.nil? ) - $LOG.error " Quality File haven't been provided. It's impossible to execute " + self.class.to_s + $LOG.debug " Quality File haven't been provided. It's impossible to execute " + self.class.to_s elsif (seq.seq_qual.size>0) - $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence" - - @low=@params.get_param('min_quality').to_i - - if @params.get_param('window_width').to_i>seq.seq_fasta.length - @window=seq.seq_fasta.length - - else - @window=@params.get_param('window_width').to_i - - end - @cut_off=@window*@low - - type='ActionLowQuality' - low_qual=0 - actions=[] - - p_begin,p_end =0,-1 # positions from high quality bounds - - # @stats[:low_qual]={} - # @stats['low_qual']={} - - - while ((p_begin>=0) && (p_end + 1 < seq.seq_qual.size) ) - - - p_begin_old,p_end_old= p_begin, p_end - p_begin,p_end = find_high_quality(seq.seq_qual,p_end+1) - # entra=(p_begin>0) or (p_end_old<0) - # - # puts "high ini fin #{p_begin} #{p_end} ini-old fin-old #{p_begin_old} #{p_end_old} __ ___ ___ ___1" - - if ((p_begin>0) && (p_begin-p_end_old-1>=@window/2)) #if we have found the high quality part, and the low quality part has enough size - # it's created an action before of the high quality part - add_action_before_high_qual(p_begin,p_end,actions,seq,p_end_old+1) - # puts "low1 ini fin #{p_end_old+1} #{p_begin-1} = #{p_begin-1-p_end_old-1+1}" - low_qual = p_begin-1-p_end_old-1 + 1 - - add_stats('low_qual',low_qual) - # @stats[:low_qual]={low_qual => 1} - - end - - # puts "-----ññññ----- high quality #{p_begin} #{p_end}+#{seq.insert_start} seq size #{seq.seq_fasta.size}" - - end - - # puts "high [#{p_begin}, #{p_end}] old [#{p_begin_old}, #{p_end_old}] size #{seq.seq_qual.size}" - if ((p_begin>=0) && (p_end+1<seq.seq_qual.size)) #if we have found the high quality part - # it's created an action after of the high quality part - add_action_after_high_qual(p_begin,p_end,actions,seq) - # puts "low2 ini fin #{p_end+1} #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1-p_end-1+1}" - low_qual = seq.seq_fasta.size-1 - p_end-seq.insert_start-1 + 1 - # if @stats[:low_qual][low_qual].nil? - # @stats[:low_qual][low_qual] = 0 - # end - # @stats[:low_qual][low_qual] += 1 - add_stats('low_qual',low_qual) - # @stats[:low_qual]={low_qual => 1} - end - - # puts "-----ññññ----- high quality #{p_begin} #{p_end}" - - - if p_end<0 and p_end_old #add action low qual to all the part - a = seq.new_action(p_end_old+1 ,seq.seq_fasta.size-1,"ActionLowQuality") # adds the ActionInsert to the sequence before adding the actionMid - # puts "new low qual start: #{p_end_old+1} end: #{seq.seq_fasta.size-1} = #{seq.seq_fasta.size-1 - p_end_old-1 + 1}" - low_qual = seq.seq_fasta.size-1 - p_end_old-1 + 1 - - # if @stats[:low_qual][low_qual].nil? - # @stats[:low_qual][low_qual] = 0 - # end - # @stats[:low_qual][low_qual] += 1 - add_stats('low_qual',low_qual) - # @stats[:low_qual]={'low_qual' => 1} + $LOG.debug "[#{self.class.to_s}, seq: #{seq.seq_name}]: checking low quality of the sequence" - actions.push a - end - - # puts "------- ADDING ACTIONs LOW QUAL #{actions.size}" - seq.add_actions(actions) + min_quality=@params.get_param('min_quality').to_i + min_length_inside_seq=@params.get_param('min_length_inside_seq').to_i + max_consecutive_good_bases=@params.get_param('max_consecutive_good_bases').to_i + + type='ActionLowQuality' + actions=[] + + regions=get_low_qual_regions(seq.seq_qual,min_quality,min_length_inside_seq,max_consecutive_good_bases) + + regions.each do |r| + low_qual_size=r.last-r.first+1 + + # puts "(#{low_qual_size}) = [#{r.first},#{r.last}]: #{a[r.first..r.last].map{|e| ("%2d" % e.to_s)}.join(' ')}" + + + add_stats('low_qual',low_qual_size) + + + # create action + a = seq.new_action(r.first,r.last,type) # adds the correspondent action to the sequence + actions.push a + + + + end + + # add quals + seq.add_actions(actions) end end #----------------------------------------------------------------- @@ -377,18 +168,23 @@ comment='Minimum quality value for every nucleotide' default_value = 20 params.check_param(errors,'min_quality','Integer',default_value,comment) - comment='Quality window for scanning low quality segments' + comment='Quality window for scanning low quality segments' default_value = 15 params.check_param(errors,'window_width','Integer',default_value,comment) - + comment='Minimum length of a bad quality segment inside the sequence' + default_value = 8 + params.check_param(errors,'min_length_inside_seq','Integer',default_value,comment) + + + comment='Maximum consecutive good-quality bases between two bad quality regions' + default_value = 2 + params.check_param(errors,'max_consecutive_good_bases','Integer',default_value,comment) + return errors end - - - private :find_high_quality end