require 'scbi_blast' def filter_hits(query, select_hits=10) # Select best hits hits = query.hits if !hits.first.nil? hits = cluster_hsps(hits) hits = hits[0..select_hits] hits = select_hits_by_identity_query(hits) hits = select_hits_by_coverage_subject(hits) end if hits.empty? if select_hits >= query.hits.length || select_hits >= 100 # Condition to stop a infinite recursive function hits = [cluster_hsps(query.hits).first] else hits = filter_hits(query, select_hits+10) end end return hits end def get_coverage_subject(hit) perc_identity = hit.align_len*100.0/hit.s_len if perc_identity > 100 && hit.class.to_s == 'ExoBlastHit' && !hit.q_frameshift.empty? hit.q_frameshift.length.times do |n| #Align len correction by frameshift. FS can create a gap in alignment adding extra aa. FS can be deletions or insertions so we check until get a perc_identity of 100 align_len = hit.align_len- (n + 1) perc_identity = align_len*100.0/hit.s_len break if perc_identity <= 100 end end return perc_identity end def select_hits_by_coverage_subject(hits) selected_hits = [] coverage_thresold = get_coverage_subject(hits.first.first) coverage_thresold = 100 if coverage_thresold > 100 hits.map{|hit| hit.each do |hsp| coverage = get_coverage_subject(hsp) if coverage > 100 next end if coverage >= coverage_thresold selected_hits << hit break end end } return selected_hits end def select_hits_by_identity_query(hits) selected_hits = [] identity = hits.first.first.ident hits.map{|hit| hit.each do |hsp| if hsp.ident >= identity selected_hits << hit break end end } return selected_hits end def select_hits_by_evalue(hits, evalue) selected_hits = [] hits.map{|hit| hit.each do |hsp| if hsp.e_val <= evalue selected_hits << hit end end } return selected_hits end def select_hsps_by_id(hits, selected_ids) selected_hits = [] hits.map{|hsp| if selected_ids.include?(hsp.acc) selected_hits << hsp end } return selected_hits end def set_thresold_evalue(hits) evalue = 100 hits.map{|hit| if hit.e_val != 0 && hit.e_val < evalue evalue = hit.e_val end } if evalue == 100 evalue = 0 else exp = Math.log10(evalue).abs.to_i min_exp = (exp/10.0).ceil evalue = 10.0**-(exp-min_exp) end return evalue end def same_subject_hsp(hit, second_hit) same = FALSE if hit.acc == second_hit.acc if hit.s_beg <= second_hit.s_beg && hit.s_end >= hit.s_end && (second_hit.s_beg - hit.s_end).abs > 1 same = TRUE end end return same end def same_query_hsp(hit, second_hit) same = FALSE if hit.acc == second_hit.acc if hit.q_beg <= second_hit.q_beg && hit.q_end >= hit.q_end && (second_hit.q_beg - hit.q_end).abs > 1 same = TRUE end end return same end def same_sense?(hit, second_hit) same= FALSE hit_sense = hit.q_frame <=> 0 second_hit_sense = second_hit.q_frame <=> 0 if hit_sense == second_hit_sense same = TRUE end return same end def clean_by_identity(blast_result, ident) blast_result.querys.each do |query| if !query.hits.first.nil? new_hits = query.hits.select{|hit| hit.ident > ident} new_hits = [nil] if new_hits.empty? #When no hit, set new_hits to [nil] query.hits = new_hits end query.full_query_length = query.full_query_length.to_i #to_i is used to correct a scbi_blast's bug. Returns this attribute like string instead integer end end def clean_by_query_length_match(blast_result, min_len_nt) blast_result.querys.each do |query| if !query.hits.first.nil? new_hits = query.hits.select{|hit| hit.align_len * 3 > min_len_nt} new_hits = [nil] if new_hits.empty? #When no hit, set new_hits to [nil] query.hits = new_hits end query.full_query_length = query.full_query_length.to_i #to_i is used to correct a scbi_blast's bug. Returns this attribute like string instead integer end end def clean_overlapping_hsps(blast_result, keep_if_diff_sense = FALSE) blast_result.querys.each do |query| if query.hits.length > 1 query.hits.each_with_index do |hit, j| if hit.nil? next end query.hits.each_with_index do |second_hit, i| if second_hit.nil? || i == j #Same hit next end if same_query_hsp(hit, second_hit) #|| same_subject_hsp(hit, second_hit) if keep_if_diff_sense if same_sense?(hit, second_hit) #Delete second_hit if is into the hit and has same sense query.hits[i] = nil end else query.hits[i] = nil end end end end query.hits.compact! end end end ##################################################################### ## DETECTION FUNCTIONS ##################################################################### def misassembled_detection(query) miss=FALSE hits = cluster_hsps(query.hits) misassembled_hits = [] hits.each do |hit| if hit.length > 1 negative_frame = hit.select{|hsp| hsp.q_frame < 0} if negative_frame.length > 0 && negative_frame.length != hit.length misassembled_hits << hit.first.acc end end end if misassembled_hits.length*1.0/ hits.length > 0.5 miss = TRUE else #Remove missassembled hits to avoid broken analysis query.hits.reverse_each do |hsp| if misassembled_hits.include?(hsp.acc) query.hits.delete(hsp) end end end return miss end def multiple_hsps(query, num) multiple = FALSE hsps = query.hits.select{|h| h.acc == query.hits.first.acc} if hsps.length >= num multiple = TRUE end return multiple end def overlapping_hsps_on_subject(query) overlapping = FALSE current_hit = query.hits.first.acc complete_hit = [] cleaned_hits = [] query.hits.each do |hit| if hit.acc != current_hit complete_hit, overlapping = clean_subject_overlapping_hsps(complete_hit, cleaned_hits) complete_hit = [] end complete_hit << hit current_hit = hit.acc end complete_hit, overlapping = clean_subject_overlapping_hsps(complete_hit, cleaned_hits) query.hits = cleaned_hits return query, overlapping end ##################################################################### ## COMPLEMENTARY FUNCTIONS ##################################################################### def clean_subject_overlapping_hsps(complete_hit, cleaned_hits) if complete_hit.length > 1 complete_hit, overlapping = subject_overlapping_hsps(complete_hit) end cleaned_hits.concat(complete_hit) return complete_hit, overlapping end def subject_overlapping_hsps(hit) overlapping = FALSE hsp_table = hsps_relationship_subject(hit) if !hsp_table.empty? hit = clean_hsp_by_identity(hit, 55) if hit.empty? overlapping = TRUE else hsp_table = hsps_relationship_subject(hit) if !hsp_table.empty? overlapping = TRUE end end end return hit, overlapping end def hsps_relationship_subject(hit) hsps = [] hit.each_with_index do |hsp, j| hit.each_with_index do |second_hsp, i| if i == j #Same hit next end if same_subject_hsp(hsp, second_hsp) if !hsps.include?([hsp, second_hsp]) && !hsps.include?([second_hsp, hsp]) # Save if no exists direct relationship or his inverse hsps << [hsp, second_hsp] end end end end return hsps end def same_subject_hsp(hit, second_hit) same = FALSE if hit.acc == second_hit.acc if hit.s_beg <= second_hit.s_beg && hit.s_end >= hit.s_end && (second_hit.s_beg - hit.s_end).abs > 1 same = TRUE end end return same end def clean_hsp_by_identity(hit, identity) hit.select!{|hsp| hsp.ident >= identity} return hit end def cluster_hsps(hsps) hits = [] last_acc = '' hsps.each do |hsp| if hsp.acc != last_acc hits << [hsp] else hits.last << hsp end last_acc = hsp.acc end return hits end def find_hit(hit_acc, ar_hits) selected_hit = nil ar_hits.each do |hit| if hit.first.acc == hit_acc selected_hit = hit break end end return selected_hit end