require 'une_los_hit' module FlAnalysis def analiza_orf_y_fl(seq, blast_query, options, db_name) aas_n_end = options[:distance] pident_threshold = options[:ident] evalue_threshold = options[:evalue] # @verbose = options[:verbose] # test_blast_hits(blast_query) # used to detect if the sequence and the blast are from different query if seq.seq_name != blast_query.query_def raise "BLAST query name and sequence are different" end q=blast_query msgs = '' atg_status = '' end_status = '' final_status = '' # the fasta sequence is saved query_fasta = seq.seq_fasta if q.hits[0].nil? # There is no match in blast, the seq go to the next DB # puts "#{db_name} -- #{q.query_def} --> NO BLASTX match" # If the DB is trembl and the seq has annotations from other DB the annotations must be printed if (db_name =~ /^tr_/) if (seq.get_annotations(:tmp_annotation).empty?) if (seq.sec_desc.empty?) seq.annotate(:tcode,'') else seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true) end else save_last_db_annotations(seq) end end return end #---------------------------------------------------------------------------------------------------------- warnings = '' errors = '' wrong_seq = false # if the sequence has more than one hit, the frames are checked and fixed to get an single hit if (q.hits.count > 1) seq_unida = UneLosHit.new(q, query_fasta, pident_threshold) wrong_seq = seq_unida.wrong_seq is_ok = seq_unida.is_ok q_index_start = seq_unida.q_index_start full_prot = seq_unida.full_prot query_fasta = seq_unida.output_seq # repaired fasta final_hit = seq_unida.final_hit # single hit msgs = seq_unida.msgs # warning messages x_number = seq_unida.number_x # number of nucleotides used to fix frame errors else # if there is only one hit if (q.hits[0].q_frame.to_i < 0) # si la secuencia esta al reves le damos la vuelta (query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end) = reverse_seq(query_fasta, q.hits[0].q_frame, q.hits[0].q_beg, q.hits[0].q_end) q.hits[0].reversed = true end final_hit = q.hits[0] # single hit x_number = 0 # number of nucleotides used to fix frame errors full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate (is_ok, q_index_start) = contenidos_en_prot(final_hit, full_prot, q) end # test_final_hit(final_hit, query_fasta) #---------------------------------------------------------------------------------------------------------- if wrong_seq warnings = "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence, " + warnings # puts "ERROR#1, contains sense and antisense hits!!!, putative chimeric sequence" errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#1\tcontains sense and antisense hits!!!, putative chimeric sequence, " error_log(q, seq, warnings, db_name) return end #---------------------------------------------------------------------------------------------------------- warnings += msgs msgs = '' #---------------------------------------------------------------------------------------------------------- if (x_number < 0) warnings = "ERROR#2, unexpected negative index in x_number, " + warnings # puts "ERROR#2, unexpected negative index in x_number" errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#2\tunexpected negative index in x_number, " error_log(q, seq, warnings, db_name) return end #---------------------------------------------------------------------------------------------------------- if (!is_ok) warnings = "ERROR#3, very serious frame error, " + warnings # puts "#{q.query_def} ERROR#3, hit was NOT found in the protein" errors = "#{db_name}\t#{q.hits[0].acc}\tERROR#3\thit was NOT found in the protein, " # error_log(q, seq, warnings, db_name) # return end #---------------------------------------------------------------------------------------------------------- fiable = false if ((final_hit.ident >= pident_threshold) && (final_hit.e_val <= evalue_threshold)) fiable = true end # if the query protein is large enough at the start of the sequence should have the start codon if (final_hit.q_beg/3 + aas_n_end >= final_hit.s_beg.to_i) substring = full_prot[0, q_index_start + 10] resto_substring = full_prot[q_index_start + 10, full_prot.length - q_index_start - 10] # to look for the beginning of the protein (m_substring, atg_status, msgs) = find_start(final_hit.s_beg, substring, fiable, aas_n_end) # pasting the substring sequence with the rest of the sequence tmp_prot = "#{m_substring}#{resto_substring}" # to get the value of the start_ORF index final_hit.q_beg = final_hit.q_beg.to_i - ((m_substring.length - 10) * 3) else # if (@verbose) # puts "beginning too short!" # end atg_status = 'incomplete' substring = full_prot[0, q_index_start] distance_s_atg = (final_hit.s_beg.to_i - final_hit.q_beg/3) + 1 if (substring.rindex('*')) warnings += "Unexpected stop codon in the beginning of your sequence, " # if (@verbose) # puts "#{db_name} -- #{q.query_def} --> Unexpected stop codon in the beginning of your sequence" # end end final_hit.q_beg = final_hit.q_beg.to_i - (substring.length * 3) tmp_prot = full_prot end #---------------------------------------------------------------------------------------------------------- # look for the end of the protein (resto_substring, end_substring, end_status, warnings, putative_end) = find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end) #---------------------------------------------------------------------------------------------------------- final_prot = "#{resto_substring}#{end_substring}" warnings += msgs # to get the value of the end_ORF index if (atg_status == 'complete') final_hit.q_end = final_hit.q_beg - 3 + (final_prot.length * 3) else if (putative_end) final_hit.q_end = final_hit.q_end - 45 + (putative_end*3) end end #-------------------------------------------------------------------------------------------------------------- # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus) final_status = determine_status(atg_status,end_status) #---------------------------------------------------------------------------------------------------------- if (final_prot.length - 2*aas_n_end > final_hit.full_subject_length) warnings += " your sequence is longer than subject: #{final_prot.length} - #{final_hit.full_subject_length}" elsif (final_prot.length + aas_n_end < final_hit.full_subject_length) warnings += " your sequence is shorter than subject: #{final_prot.length} - #{final_hit.full_subject_length}" if (final_prot.length + 100 < final_hit.full_subject_length) || (final_prot.length*2 < final_hit.full_subject_length) if (final_status == 'Complete') final_status = 'Putative Complete' warnings += ". Was predicted as Complete, but is very much shorter than de subject" # if (@verbose) # puts "#{db_name} -- #{q.query_def} --> your sequence is 100 aas shorter than the subject or shorter than the half length of the subject" # end end end end # test_final_hit(final_hit, query_fasta) print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name) end def test_blast_hits(q) puts "query_def: #{q.query_def} full_query_length: #{q.full_query_length} ------------------------------------------------" q.hits.each do |h| puts "\t subject_id: #{h.acc}" puts "\t acc: #{h.acc}" puts "\t full_subject_length: #{h.full_subject_length}" puts "\t q_beg: #{h.q_beg + 1}" puts "\t q_end: #{h.q_end + 1}" puts "\t q_frame: #{h.q_frame}" puts "\t s_beg: #{h.s_beg + 1}" puts "\t s_end: #{h.s_end + 1}" puts "\t s_frame: #{h.s_frame}" puts "\t align_len: #{h.align_len}" puts "\t gaps: #{h.gaps}" puts "\t mismatches: #{h.mismatches}" puts "\t reversed: #{h.reversed}" puts "\t score: #{h.score}" puts "\t bit_score: #{h.bit_score}" puts "\t ident: #{h.ident}" puts "\t e_val: #{h.e_val}" puts "\t definition: #{h.definition}" puts "\t q_seq: #{h.q_seq}" puts "\t s_seq: #{h.s_seq}" end end def test_final_hit(final_hit, query_fasta) puts "\t acc: #{final_hit.acc}" puts "\t full_subject_length: #{final_hit.full_subject_length}" puts "\n\t q_frame: #{final_hit.q_frame}" puts "\t reversed: #{final_hit.reversed}" puts "\n\t q_beg-q_end: #{final_hit.q_beg + 1} - #{final_hit.q_end + 1}" puts "\t s_beg - s_end: #{final_hit.s_beg + 1} - #{final_hit.s_end + 1}" puts "\n\t score: #{final_hit.score}, bit_score: #{final_hit.bit_score}, ident: #{final_hit.ident}, e_val: #{final_hit.e_val}" puts "\n\t definition: #{final_hit.definition}" puts "\t q_seq: #{final_hit.q_seq}" puts "\t s_seq: #{final_hit.s_seq}" puts "\nnt q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end]}" puts "\n\nprot q_beg-q_end\n#{query_fasta[final_hit.q_beg..final_hit.q_end].translate}" end def error_log(q, seq, warnings, db_name) # seq.annotate(:error,"#{q.query_def}\t#{warnings}\t#{q.hits[0].definition}") if (db_name =~ /^tr_/) if (seq.get_annotations(:tmp_annotation).empty?) if (seq.sec_desc.empty?) if (!q.hits[0].definition.nil?) warnings = "Coding sequence with some errors, #{warnings}" seq.sec_desc = "#{q.query_def}\t#{seq.seq_fasta.length}\t#{q.hits[0].acc}\t#{db_name}\tCoding Seq\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t" seq.annotate(:tmp_annotation,[seq.sec_desc, '','',''],true) else seq.annotate(:tcode,'') end else warnings = "Coding sequence with some errors, #{warnings}" tmp_annot = seq.sec_desc.sub('my_warning',"#{warnings}") seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true) end else save_last_db_annotations(seq) end else if (seq.sec_desc.empty?) if (!q.hits[0].definition.nil?) warnings = "Coding sequence with some errors, #{warnings}" seq.sec_desc = "#{q.query_def}\t#{seq.seq_fasta.length}\t#{q.hits[0].acc}\t#{db_name}\tCoding Seq\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t#{q.hits[0].full_subject_length}\t#{warnings}\t\t\t\t\t\t#{q.hits[0].definition}\t" end end end end def save_last_db_annotations(seq) # puts "sequence not complete! recovering annotations from previous database! sldba!!" (q, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3] print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___) (name,fasta_length,acc,db_name,final_status,testcode,e_val,ident,my_length,subject_length,warnings,q_frame,q_beg,q_end,s_beg,s_end,description,final_prot) = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t") if (final_hit.reversed) (kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i) end seq.annotate(:protein,seq.get_annotations(:tmp_annotation).first[:message][1]) seq.annotate(:alignment,seq.get_annotations(:tmp_annotation).first[:message][2]) tmp_annot = "#{name}\t#{fasta_length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}" seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true) end def find_start(subject_start, substring, fiable, aas_n_end) tmp_prot = '' msgs = '' atg_status = 'incomplete' # complete, incomplete or putative # puts "\nsubstring (#{substring.length} aas):\n#{substring}" stop_codon = substring.rindex('*') # marcamos la distancia al s_beg desde el principio del substring # s_beg_distance = (substring.length) - subject_start s_beg_distance = (substring.length - 10) - subject_start # marcamos la distancia al s_beg desde el final del substring atg_distance = (subject_start + 1) - (substring.length - 10) if (atg_distance <= 0) atg_distance = 0 else # puts "expected atg_distance = 0, your sequence atg_distance = #{atg_distance}; limit (1-15)" msgs = "atg_distance in limit (1-15): atg_distance = #{atg_distance}, " end # puts "s_beg_distance:#{s_beg_distance}, stop_codon: #{stop_codon}, subject_start: #{subject_start + 1}, atg_distance: #{atg_distance}" #---------------------------------------------------------------------------------------------------------- # tenemos un codon de parada en el substring 5 prima if (stop_codon) stop_codon += 1 # ahora vamos a ver si el stop esta antes o despues del s_beg if (stop_codon <= s_beg_distance) # esta antes substring = substring[stop_codon, substring.length - stop_codon] # puts "\nhay un codon de parada en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon +1}\n#{substring}\n\n" first_m = substring.index('M') if (first_m) # tenemos M y stop --------------------------------------------------------------------------- substring = substring[first_m, substring.length - first_m] atg_status = 'complete' else # con STOP pero sin M -------------------------------------------------------------------------------- atg_status = 'putative' # puts "there is not a start codon near the expected beginning of your sequence, distance to subject ATG= #{atg_distance} aas --> good simil: #{fiable}" msgs += "W1: There is no M at the beginning, " end #---------------------------------------------------------------------------------------------------------- else # esta despues, un cambio de fase impide analizar el principio substring = substring[stop_codon, substring.length - stop_codon] # comentar? first_m = substring.index('M') # comentar? if (first_m) # tenemos M y unexpected stop # comentar? substring = substring[first_m, substring.length - first_m] # comentar? end # comentar? # TODO esto se puede cambiar! atg_status = 'putative' msgs += " Unexpected STOP codon in 5 prime region, " # puts "\nhay un codon de parada inesperado en el substring (#{substring.length} aas)\tstop_codon:#{stop_codon}, s_beg_distance: #{s_beg_distance +1}, atg_distance: #{atg_distance}" end #--------------------------------------------------------------------------------------------------------------- else # no hay stop codon first_m = substring.index('M') if (first_m) # tenemos M, sin stop m_distance = subject_start - (substring.length - 10 - first_m) substring = substring[first_m, substring.length - first_m] # m_distance = [first_m+1,s_beg_distance].max - [first_m+1,s_beg_distance].min if (m_distance > aas_n_end*2) # sin STOP, con atg pero muy lejos del inicio que marca el subject --------------- # puts "No stop codon before M and M found is too far from subject M, distance to subject ATG= #{m_distance} aas --> good simil: #{fiable}" msgs += "No stop codon before M and M found is too far from subject M, " atg_status = 'incomplete' else if (fiable) # Tenemos M y aunque no hay STOP condon el ortologo es fiable ---------------------------------- # msgs += "No stop codon before M but high homology subject, " atg_status = 'complete' else # Tenemos M pero no tenemos stop y el ortologo no es fiable ------------------------------------------- # puts "No stop codon before M and low homology subject, distance to subject ATG= #{m_distance} aas --> good simil: #{fiable}" msgs += "No stop codon before M and low homology subject, " atg_status = 'putative' end end else # sin M ni STOP ------------------------------------------------------------------------------------------- atg_status = 'putative' # puts "your sequence has the subject beginning but there is not start codon at the beginning, distance to subject ATG= #{atg_distance} aas --> good simil: #{fiable}" msgs += "W2: There is no M at the beginning, " end end return [substring, atg_status, msgs] end def find_end(final_hit, q, full_prot, tmp_prot, end_status, warnings, aas_n_end) # aqui vemos lo que queda sin similitud hasta el final s_end_resto = (final_hit.full_subject_length - (final_hit.s_end.to_i + 1)) # en el subject, numero de aas que necesito cubrir q_end_resto = (q.full_query_length.to_i - final_hit.q_end.to_i)/3 # en el query, numero de aas que tengo sq_end_distance = q_end_resto - s_end_resto cut_in_5p = full_prot.length - tmp_prot.length resto_substring = tmp_prot[0..final_hit.q_end/3 - cut_in_5p - 16] end_substring = tmp_prot[final_hit.q_end/3 - cut_in_5p - 15..tmp_prot.length] putative_end = end_substring.index('*') # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas) if (sq_end_distance + aas_n_end < 0) end_status = 'incomplete' if (putative_end) warnings += " Unexpected STOP codon at 3' end. Distance to subject end: #{sq_end_distance.abs} aas, " end_substring = end_substring[0, putative_end+1] # comentar? # if (@verbose) # puts "#{db_name} -- #{q.query_def} --> Unexpected STOP codon at 3' end. Distance to subject end: #{sq_end_distance.abs} aas" # end else warnings += "Distance to subject end: #{sq_end_distance.abs} aas, " # if (@verbose) # puts "#{db_name} -- #{q.query_def} --> Distance to subject end: #{sq_end_distance.abs} aas" # end end else # tenemos suficiente secuencia if (putative_end) # tenemos un stop q_stop_resto = (putative_end - 15) # distancia entre el stop y el q_end, si es negativo el stop esta antes del q_end qs_stop_distance = q_stop_resto - s_end_resto # distancia entre los stops del q y el s # puts "putative_end: #{putative_end}, q_stop_resto: #{q_stop_resto}, qs_stop_distance: #{qs_stop_distance}" if (qs_stop_distance + aas_n_end >= 0) # si q_end esta a menos de 15 aas antes o esta despues del s_end; complete end_status = 'complete' elsif (qs_stop_distance + 2*aas_n_end < 0) # si q_end es mas de 30 aas menor que el s_end; putative/Putative chimeric seq end_status = 'putative' warnings += " query STOP codon too far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, putative chimeric sequence, " # if (@verbose) # puts "#{db_name} -- #{q.query_def} --> query STOP too far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, putative chimeric sequence" # end elsif (qs_stop_distance + aas_n_end < 0) # si q_end es mas de 15 aas menor pero menos de 30 que el s_end; putative end_status = 'putative' warnings += " query STOP codon is far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas, " # if (@verbose) # puts "#{db_name} -- #{q.query_def} --> query STOP far from subject stop. Distance to subject end: #{qs_stop_distance.abs} aas" # end end end_substring = end_substring[0, putative_end+1] else # no tenemos codon de parada pero tenemos suficiente secuencia end_status = 'putative' warnings += " STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas, " # if (@verbose) # puts "#{db_name} -- #{q.query_def} --> STOP codon was not found. Distance to subject end: #{sq_end_distance.abs} aas" # end end end return [resto_substring, end_substring, end_status, warnings, putative_end] end def determine_status(atg_status,end_status) if (atg_status == 'complete') && (end_status == 'complete') # proteina completa final_status = 'Complete' elsif (atg_status == 'putative' && end_status == 'complete') || (atg_status == 'complete' && end_status == 'putative') || (atg_status == 'putative' && end_status == 'putative') # comienzo y/o final putative final_status = 'Putative Complete' elsif (atg_status == 'incomplete') && (end_status == 'incomplete') # region intermedia final_status = 'Internal' elsif (atg_status == 'complete') && (end_status == 'incomplete') # tenemos el principio de la proteina final_status = 'N-terminus' elsif (atg_status == 'putative') && (end_status == 'incomplete') # puede que tengamos el principio de la proteina final_status = 'Putative N-terminus' elsif (atg_status == 'incomplete') && (end_status == 'complete') # tenemos el final de la proteina final_status = 'C-terminus' elsif (atg_status == 'incomplete') && (end_status == 'putative') # puede que tengamos el final de la proteina final_status = 'Putative C-terminus' end return final_status end def print_annotations(seq, q, final_hit, final_status, final_prot, warnings, query_fasta, db_name) name_diff = q.query_def.length - final_hit.acc.length if (name_diff > 0) spnum = ' '*name_diff.to_i else spnum = '' end #------------------------------------------------------------------------------------------------------------------------------------- # if the sequence is Complete will be printed -------------------------------------------------------------------- if (final_status == 'Complete') seq.annotate(:protein,">#{q.query_def}\n#{final_prot}") print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___) if (final_hit.reversed) (kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i) end seq.annotate(:complete,"#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}") seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n") #------------------------------------------------------------------------------------------------------------------------------------- else # la proteina no esta completa ------------------------------------------------------------------------- if (!seq.get_annotations(:tmp_annotation).empty?) && (!seq.get_annotations(:tmp_annotation).nil?) # ---> trae informacion de una bd anterior if (db_name =~/^tr_/) # ---> estamos usando el trembl, se dejan las anotaciones que trae # puts "#{db_name} -- #{q.query_def} --> print_annotations: sequence not complete! recovering annotations from previous database!" (kk1, final_hit, final_prot, query_fasta, final_status) = seq.get_annotations(:tmp_annotation).first[:message][3] print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___) (name,fasta_length,acc,db_name,final_status,testcode,e_val,ident,my_length,subject_length,warnings,q_frame,q_beg,q_end,s_beg,s_end,description,final_prot) = seq.get_annotations(:tmp_annotation).first[:message][0].split("\t") if (final_hit.reversed) (kk, q_frame, q_end, q_beg) = reverse_seq(query_fasta, q_frame.to_i, q_beg.to_i, q_end.to_i) end my_prot = seq.get_annotations(:tmp_annotation).first[:message][1] seq.annotate(:protein,my_prot) my_align = seq.get_annotations(:tmp_annotation).first[:message][2] seq.annotate(:alignment,my_align) tmp_annot = "#{name}\t#{query_fasta.length}\t#{acc}\t#{db_name}\t#{final_status}\t\t#{e_val}\t#{ident}\t#{my_length}\t#{subject_length}\t#{warnings}\t#{q_frame}\t#{q_beg}\t#{q_end}\t#{s_beg}\t#{s_end}\t#{description}\t#{final_prot}" seq.annotate(:tmp_annotation,[tmp_annot, '','',''],true) #----------------------------------------------------------------------------------------------------------------------------- # elsif (db_name =~ /^sp_/) # ---> estamos usando el sp, se dejan las anotaciones que trae # puts "#{db_name} -- #{q.query_def} --> print_annotations: Mantenemos las anotaciones de la BD de usuario y pasamos la secuencia al trembl" end #------------------------------------------------------------------------------------------------------------------------------------- elsif (seq.get_annotations(:tmp_annotation).empty?) # ---> NO trae informacion de una bd anterior if (db_name =~ /^tr_/) # ---> estamos usando el trembl # puts "#{db_name} -- #{q.query_def} --> print_annotations: #{q.query_def} is not complete!! se anota con trembl" print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) # funcion para marcar ATG (_-_) y STOP (___) if (final_hit.reversed) (kk, final_hit.q_frame, final_hit.q_end, final_hit.q_beg) = reverse_seq(seq.seq_fasta, final_hit.q_frame.to_i, final_hit.q_beg.to_i, final_hit.q_end.to_i) end seq.annotate(:alignment,"#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n") seq.annotate(:protein,">#{q.query_def}\n#{final_prot}") tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}" seq.annotate(:tmp_annotation,[tmp_annot, '','','']) #------------------------------------------------------------------------------------------------------------------------------------- else # cargamos anotaciones para la siguiente BD tmp_prot = ">#{q.query_def}\n#{final_prot}" tmp_align = "#{q.query_def}\t#{final_hit.q_seq}\n#{final_hit.acc}#{spnum}\t#{final_hit.s_seq}\n\n" tmp_annot = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\t#{final_status}\t\t#{final_hit.e_val}\t#{final_hit.ident}\t#{final_prot.length}\t#{final_hit.full_subject_length}\t#{warnings}\t#{final_hit.q_frame}\t#{final_hit.q_beg.to_i + 1}\t#{final_hit.q_end.to_i + 1}\t#{final_hit.s_beg.to_i + 1}\t#{final_hit.s_end.to_i + 1}\t#{final_hit.definition}\t#{final_prot}" seq.sec_desc = "#{q.query_def}\t#{query_fasta.length}\t#{final_hit.acc}\t#{db_name}\tCoding Seq\t\t#{final_hit.e_val}\t#{final_hit.ident}\t\t#{final_hit.full_subject_length}\t#{warnings}\t\t\t\t\t\t#{final_hit.definition}\t" seq.annotate(:tmp_annotation,[tmp_annot, tmp_prot,tmp_align,[q, final_hit, final_prot, query_fasta, final_status]]) # puts "\n\n\n-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.---#{q.query_def}\t#{final_status}\n#{tmp_prot}" # puts "#{db_name} -- #{q.query_def} --> print_annotations: cargamos anotaciones para utilizarlas en la siguiente BD" end end end end def print_nt_seqs(seq, q, final_hit, final_prot, query_fasta, final_status) bad_atg = false #------------------------------------------------------------------------------------------------------------- ATG if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'Putative N-terminus') || (final_status == 'N-terminus') # puts "entra aqui, final_status: #{final_status}" my_seq_n = query_fasta[final_hit.q_beg - 5..final_hit.q_beg + 5] beg5 = false # ------------------------------------- si my_seq_n = nil puede ser porque q_beg sea < 5 if (final_hit.q_beg < 6) my_seq_n = query_fasta[0..10] beg5 = true # puts "empieza en el borde de la seq" end atg_found = my_seq_n.index(/ATG/i) atg_found_rv = my_seq_n.rindex(/ATG/i) my_atg_index = nil end if (!atg_found.nil?) if (beg5) my_seq_n.sub!(/ATG/i,'_-_ATG') my_atg_index = atg_found my_seq = my_seq_n + query_fasta[11..query_fasta.length + 1] elsif (atg_found == atg_found_rv) my_seq_n.sub!(/ATG/i,'_-_ATG') my_atg_index = final_hit.q_beg - 5 + atg_found my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1] # puts "my_seq despues de encontrar el atg: #{my_seq}" elsif (atg_found == 5) || (atg_found_rv == 5) my_seq_n = my_seq_n[0..4]+'_-_'+my_seq_n[5..10] my_atg_index = final_hit.q_beg - 5 + atg_found my_seq = query_fasta[0..final_hit.q_beg - 6] + my_seq_n + query_fasta[final_hit.q_beg + 6..query_fasta.length + 1] else # puts "#{q.query_def} tiene mas de un ATG my_seq_n: #{my_seq_n}" bad_atg = true my_seq = query_fasta end else bad_atg = true # puts "#{q.query_def} NO TIENE ATG my_seq_n: #{my_seq_n}" my_seq = query_fasta end #------------------------------------------------------------------------------------------------------------- STOP stop_c = nil if (final_status == 'Complete') || (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus') if (bad_atg == true) stop_c = my_seq[final_hit.q_end - 2..final_hit.q_end] stop_c_longer = my_seq[final_hit.q_end - 7..final_hit.q_end + 5] else stop_c = my_seq[final_hit.q_end + 3..final_hit.q_end + 5] stop_c_longer = my_seq[final_hit.q_end - 2..final_hit.q_end + 10] end end if (!stop_c.nil?) # puts stop_c # puts stop_c_longer if (stop_c.translate == '*') if (bad_atg == true) my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1] seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}") else my_seq = my_seq[0..final_hit.q_end + 5] +'___'+ my_seq[final_hit.q_end + 6..my_seq.length + 1] my_prot = my_seq.sub(/\w+_\-_/,'') my_prot = my_prot.sub(/___\w+/,'') my_prot = my_prot.translate my_prot = my_prot.sub(/x$/,'') simliar_fragment = final_prot.lcs(my_prot) if (simliar_fragment.length == final_prot.length) && (simliar_fragment.length == my_prot.length) seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\t\t\t\t\t\t#{my_seq}") else seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tthe nucleotide sequence contain a lot of errors\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}") # puts "nt seq: was no possible to find stop codon, the nucleotide sequence contain a lot of errors" end end else if (final_status == 'Putative Complete') || (final_status == 'C-terminus') || (final_status == 'Putative C-terminus') if (bad_atg == true) stop_c = my_seq[final_hit.q_end+1..final_hit.q_end+3] stop_c_longer = my_seq[final_hit.q_end - 4..final_hit.q_end + 8] else stop_c = my_seq[final_hit.q_end + 7..final_hit.q_end + 9] stop_c_longer = my_seq[final_hit.q_end..final_hit.q_end + 13] end if (!stop_c.nil?) if (stop_c.translate == '*') final_hit.q_end = final_hit.q_end + 3 if (bad_atg == true) my_seq = my_seq[0..final_hit.q_end] +'___'+ my_seq[final_hit.q_end + 1..my_seq.length + 1] seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG\t\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}") else seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}") end else if (bad_atg == true) seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}") # puts "find nt end: NO ATG, NO exact STOP" else seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP exacto\tstop: #{stop_c_longer}\t#{stop_c}\t#{final_hit.q_beg + 1}\t#{final_hit.q_end + 1}\t#{my_seq}") # puts "find nt end: GOOD ATG, NO exact STOP" end end end end end else if (bad_atg == true) seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO ATG NO STOP\t\t\t\t\t#{my_seq}") else seq.annotate(:nucleotide,"#{q.query_def}\t#{final_status}\tNO STOP\t\t\t\t\t#{my_seq}") end end end end