require 'scbi_blast' require 'fl_analysis' #Allow call 'analiza_orf_y_fl' require 'common_functions' module ChimericSeqs BEG = 0 STOP = 1 HIT = 2 def search_chimeras(seq, blast_query, options, db_name, db_path) # DETECTION #---------------------- homology_zones = [] cut_positions = [] if blast_query.hits.length > 1 homology_zones = define_homology_zones(blast_query, options, seq.seq_fasta) cut_positions = set_cut_positions(homology_zones) if homology_zones.length > 1 end # CONFIRMATION #---------------------- num_homology_zones = homology_zones.length if num_homology_zones > 1 && options[:chimera].include?('r') confirm_chimeras(homology_zones, db_path, options[:ident_thresold]) # Check if prots are differents or not num_homology_zones = homology_zones.length end # SPLICING #-------------------- new_seqs=[] if num_homology_zones > 1 #In this case the sequence is a chimera seq.format_chimera! homology_zones.each_with_index do |hom_zone, i| seq.hit << hom_zone[HIT].first.dup #Save hit before modified it for write output purposes hit_limits = get_limits(hom_zone[HIT])# Take beginning and end of hit on query, hit can be composed by unsorted or antisense hsps if options[:chimera].include?('c') && hit_limits[STOP]-hit_limits[BEG]> options[:min_nucleotides] new_seqs << fragment_chimera(blast_query, seq, hom_zone[HIT], i, hit_limits, num_homology_zones, options, db_name, cut_positions[i]) seq.warnings('SOLVED') end end else new_seqs = nil #Sequence isn't chimera end return new_seqs end def set_cut_positions(homology_zones) cut_positions = [] last_cut = -1 homology_zones.each_with_index do |hom_zone, i| if i > 0 positions = [] positions << last_cut + 1 # Start of fragment cut_position = homology_zones[i-1][STOP] + (hom_zone[BEG] - homology_zones[i-1][STOP])/2 positions << cut_position # End of fragment last_cut = cut_position cut_positions << positions end end cut_positions << [last_cut, homology_zones.last[HIT].first.q_len-1] return cut_positions end def confirm_chimeras(homology_zones, db_path, ident_thresold) acc_hit = homology_zones.map{|zone| zone[HIT].first.acc} seq_fasta = %x[blastdbcmd -db #{db_path} -entry #{acc_hit.join(',')}] seq_fasta << ">remove\nALGO\n" #Needed for clustal-omega display the dist-matrix, requires unless 3 sequences to do it clustal_matrix = do_clustal(seq_fasta) clustal_matrix.shift #Remove header clustal_matrix.pop #Remove false sequence clustal_hits = [] distances = [] clustal_matrix.each do |line| fields = line.split fields.pop #Remove data belong to false sequence fields.shift #Remove prot name distances << fields.map! {|field| field.to_f} end delete_positions = search_ident_prots(homology_zones, distances, ident_thresold) delete_zones(delete_positions, homology_zones) end def search_ident_prots(homology_zones, distances, ident_thresold) delete_positions = [] n_homology_zones = homology_zones.length n_homology_zones.times do |j| n_homology_zones.times do |i| next if i == j if distances[j][i] >= ident_thresold delete_positions << j delete_positions << i end end end delete_positions.uniq! return delete_positions end def fragment_chimera(query, seq, hit, hit_position, hit_limits, num_homology_zones, options, db_name, cut_positions) # Prepare new seq and query #---------------------------- query_bak = query.dup query_bak.hits = hit # Here, hit is an array of hsps query_bak.query_def += "_split_#{hit_position}" seq_bak = seq.dup seq_bak.reset_classification seq_bak.clean_warnings seq_bak.seq_name += "_split_#{hit_position}" seq_bak.clean_orfs seq_bak.save_fasta = true seq_bak.ignore = false # Cut sequence and move hit/hsps limits #---------------------------------------- if hit_position == 0 #First zone limit = 0 if hit.first.q_frame < 0 #Hit reversed hit.first.q_frame = -1 end else #Middle & last zone limit = cut_positions[BEG]#hit_limits[BEG] hit_move_limits(hit, -limit, 0) #Redefine hit limits on new sequence after cut if hit.first.q_frame >= 0 hit.first.q_frame=1 elsif hit_position < num_homology_zones-1 #Last zone keeps his original frame because it's composed by the hit and the terminal sequence (Here hit is reversed). hit.first.q_frame=-1 end end if hit_position == num_homology_zones-1 #Last zone seq_bak.seq_fasta = seq.seq_fasta[cut_positions[BEG]..seq.fasta_length-1]#[hit_limits[BEG]..seq.fasta_length-1] else # Beginning & Middle zone seq_bak.seq_fasta = seq.seq_fasta[limit..cut_positions[STOP]]#[limit..hit_limits[STOP]] end seq_length = seq_bak.seq_fasta.length query_bak.full_query_length = seq_length seq_bak.fasta_length = seq_length hit_set_q_len(hit, seq_length) # Full length analisys of fragment #---------------------------------------- analiza_orf_y_fl(seq_bak, query_bak.hits, options, db_name) return seq_bak end def define_homology_zones(query, options, query_fasta) # Define hit limits #--------------------- hits = cluster_query_hits(query) #Hsp packages hits_limits = define_hit_limits(hits) # Define homology zones #------------------------ #First homology zone zones = [[hits_limits.first[BEG], hits_limits.first[STOP], hits.first]] ref_hit_beg = hits_limits.first[BEG] ref_hit_end = hits_limits.first[STOP] #Other homology zone hits_limits.each_with_index do |hit, i| coincidences = 0 zones.each do |zone| if hit_is_in?(zone[BEG], zone[STOP], hit) # Extender zona de homologia si coinciden en zona zone[BEG] = [zone[BEG],hit[BEG]].min zone[STOP] = [zone[STOP],hit[STOP]].max coincidences+=1 end end if coincidences == 0 zones << [hit[BEG], hit[STOP], hits[i]] end end zones.sort!{|e1,e2| e1[BEG] <=> e2[BEG]} # Delete overlapping homology zones #------------------------------------ overlapping_zones = overlapping_zones(zones) delete_zones(overlapping_zones, zones) return zones end def define_hit_limits(hits) limits=[] hits.each do |hit| limits << get_limits(hit) end return limits end def get_limits(hit) coordenates=[] hit.map{|h| coordenates << h.q_beg; coordenates << h.q_end} # BEG END limits=[coordenates.min, coordenates.max] return limits end def get_limits_s(hit) coordenates=[] hit.map{|h| coordenates << h.s_beg; coordenates << h.s_end} # BEG END limits=[coordenates.min, coordenates.max] return limits end def cluster_query_hits(query) hits = [] acc_hit = [] query.hits.each do |hit| ind = acc_hit.index(hit.acc) if ind.nil? acc_hit << hit.acc hits << [hit] else hits[ind] << hit end end return hits end def delete_zones(overlapping_zones, zones) overlapping_zones.each do |zone| zones.delete_at(zone) end end def overlapping_zones(zones) delete_zones=[] zones.each_with_index do |zone, i| if i>0 if zone[BEG]< zones[i-1][STOP] delete_zones << i delete_zones << i-1 end end end delete_zones.uniq! return delete_zones end def hit_is_in?(h_beg, h_end, hit) is=false # CONTIENE #OVERLAP if h_beg <= hit[BEG] && h_end > hit[BEG] || hit[BEG] <= h_beg && hit[STOP] > h_beg is=true end return is end def get_hits(query, ref_hit) all_hits=[] query.hits.each do |hit| if hit.acc == ref_hit.acc all_hits << hit end end return all_hits end def min_distance_between_homology_zones(homology_zones) distance=nil homology_zones.each_with_index do |zone,i| if i > 0 local_distance=homology_zones[i][BEG] - homology_zones[i-1][STOP] if distance.nil? || distance > local_distance distance=local_distance end end end return distance end def duplicate_hits(query) dup_hits=[] query.hits.each do |hit| dup_hits << hit.dup end return dup_hits end def set_limits(hit, q_beg, q_end, s_beg, s_end) hit.q_beg = q_beg hit.q_end = q_end hit.s_beg = s_beg hit.s_end = s_end end def move_limits(hit, q_add, s_add) hit.q_beg+=q_add hit.q_end+=q_add hit.s_beg+=s_add hit.s_end+=s_add if hit.class.to_s == 'ExoBlastHit' && !hit.q_frameshift.empty? #There is frameshift hit.q_frameshift.map!{|fs| [fs.first + q_add, fs.last] } end end def hit_move_limits(hit, q_add, s_add) if hit.class.to_s == 'Array' hit.each do |hsp| move_limits(hsp, q_add, s_add) end elsif hit.class.to_s == 'Hit' #puts "\e[35m#{hit.acc}\t#{hit.q_beg}\t#{hit.q_end}\t#{hit.s_beg}\t#{hit.s_end}\t#{hit.reversed}\e[0m" move_limits(hit, q_add, s_add) end end def hit_set_q_len(hit, q_len) hit.each do |hsp| hsp.q_len=q_len end end def do_clustal(seq_fasta) cmd='clustalo -i - -o /dev/null --percent-id --full --distmat-out=/dev/stdout --force' clustal_matrix = nil IO.popen(cmd,'w+') {|clustal| clustal.sync = true clustal.write(seq_fasta) clustal.close_write clustal_matrix = clustal.readlines clustal.close_read } return clustal_matrix end end