require 'types' require 'une_los_hit' module FlAnalysis $global_warnings = [] def analiza_orf_y_fl(seq, hit, options, db_name) query_fasta = seq.seq_fasta.upcase.dup # Upcase for prevents complications with masked sequences, dup for discard changes if hit.count > 1 # if the sequence has more than one hit, the frames are checked and fixed to get a single hit seq_unida = UneLosHit.new(hit, query_fasta) full_prot = seq_unida.full_prot query_fasta = seq_unida.output_seq # repaired fasta final_hit = seq_unida.final_hit # single hit $global_warnings += seq_unida.msgs # warning messages else query_fasta = reverse_seq(query_fasta, hit.first) if hit.first.q_frame < 0 # si la secuencia esta al reves le damos la vuelta final_hit = hit.first # single hit end query_fasta = exonerate_fix_frame_shift(query_fasta, hit) if options[:exonerate] full_prot = query_fasta[final_hit.q_frame-1, query_fasta.length+1].translate original_query_coordinates = [final_hit.q_beg, final_hit.q_end] ## VERBOSE seq.show_alignment(final_hit, query_fasta, show_nts) if $verbose > 2 ## VERBOSE atg_status, tmp_prot = set_start_codon(final_hit, options[:distance], full_prot, query_fasta) end_status, final_prot = find_end(final_hit, options[:distance], tmp_prot, query_fasta) puts "\n------------------- POST EXTENSION---------------------" if $verbose > 1 ## VERBOSE seq.show_alignment(final_hit, query_fasta, show_nts, original_query_coordinates) if $verbose > 1 ## VERBOSE puts "ATG: #{atg_status} STOP: #{end_status}" if $verbose > 2 ## VERBOSE # decide the sequence status (Complete, Putative Complete, Internal, N-terminus, Putative N-terminus, C-terminus) type, status = determine_status(atg_status, end_status) status = compare_seq_length_with_subject(final_prot, options[:distance], final_hit, type, status) if final_prot.length >= 25 && final_prot.length.to_f/final_hit.full_subject_length >= options[:subject_coverage] # Prot length min of 25 aa and subject coverage by generated prot of 25% save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name) end end def set_start_codon(final_hit, distance, full_prot, query_fasta) q_index_start = contenidos_en_prot(final_hit.q_seq, full_prot) atg_status = nil _5prima = q_index_start + distance if final_hit.s_beg == 0 && final_hit.q_seq[0] == 'M' && final_hit.s_seq[0] == 'M' #there is M in query and subject at first position of alignment and subject's M is in first position atg_status = 'complete' tmp_prot = full_prot[q_index_start..full_prot.length] elsif _5prima >= final_hit.s_beg amine_seq = full_prot[0, _5prima] #Contiene parte amino de la proteina carboxile_seq = full_prot[_5prima, full_prot.length - _5prima] #Contiene parte carboxilo de la proteina hasta el fin de la secuencia length_before_cut = amine_seq.length amine_seq, atg_status = find_start(final_hit.s_beg, amine_seq, distance) # to look for the beginning of the protein tmp_prot = "#{amine_seq}#{carboxile_seq}" # merge seqs in prot new_q_beg = final_hit.q_frame-1 + (length_before_cut - amine_seq.length) * 3 modify_5p_align(new_q_beg, final_hit, query_fasta) if $verbose > 1 ## VERBOSE, Modify query align final_hit.q_beg = new_q_beg # to get the value of the start_ORF index else $global_warnings << 'UnexpStopBegSeq' if full_prot[0, q_index_start].rindex('*') atg_status = 'incomplete' tmp_prot = full_prot end return atg_status, tmp_prot end def find_start(subject_start, amine_seq, distance) atg_status = 'putative' # complete, incomplete or putative stop_codon = amine_seq.rindex('*') if !stop_codon.nil? # tenemos un codon de parada en el amine_seq 5 prima _5prime_UTR = amine_seq.length - 10 - subject_start # marcamos la distancia al s_beg desde el principio del amine_seq amine_seq = amine_seq[stop_codon + 1 .. amine_seq.length - 1] first_m = amine_seq.index('M') if stop_codon <= _5prime_UTR # Ver si stop está en zona 5 prima UTR if first_m # tenemos M amine_seq = amine_seq[first_m .. amine_seq.length - 1] atg_status = 'complete' else # con STOP pero sin M $global_warnings << 'noM1' end else # esta despues, un cambio de fase impide analizar el principio $global_warnings << 'UnexpSTOP5p' amine_seq = amine_seq[first_m .. amine_seq.length - 1] if first_m # tenemos M end else # no hay stop codon first_m = amine_seq.index('M') if first_m # tenemos M amine_seq = amine_seq[first_m .. amine_seq.length - 1] m_distance = (subject_start - amine_seq.length).abs - 10 if m_distance.abs > distance*2 # con atg pero muy lejos del inicio que marca el subject $global_warnings << 'NoStopMfar' atg_status = 'incomplete' else # Tenemos M atg_status = 'complete' end else # sin M $global_warnings << 'noM2' end end return amine_seq, atg_status end def find_end(final_hit, max_distance, tmp_prot, query_fasta) frame_shift = check_frame_shift(final_hit) beg_end_string =(final_hit.q_end-final_hit.q_beg)/3 - max_distance # Begin of terminal region (Coordinate) in tmp_prot atg_substring = tmp_prot[0..beg_end_string] # prot without terminal region end_substring = tmp_prot[beg_end_string + 1 ..tmp_prot.length-1] #Take 3' of unigen #puts "\e[32m\nfinal_hit.q_end-final_hit.q_beg: #{final_hit.q_end-final_hit.q_beg} /3 - max_distance: #{max_distance}\e[0m" #puts "\e[33mbeg_end_string: #{beg_end_string}\e[0m" #puts "\e[35mtmp_prot.length: #{tmp_prot.length}\e[0m" if beg_end_string < 0 || end_substring.nil? #Sequences whose homology is at end of it and dont't exits the 3' part of unigene atg_substring = tmp_prot end_substring = '' end_status = 'incomplete' else end_status = 'putative' putative_end = end_substring.index('*') end_substring = end_substring[0 .. putative_end] if putative_end s_end_resto = final_hit.s_len - (final_hit.s_end + 1) # en el subject, numero de aas que necesito cubrir q_end_resto = (query_fasta.length - final_hit.q_end)/3 # en el query, numero de aas que tengo sq_end_distance = q_end_resto - s_end_resto # La diferencia se hace a partir del final del hit para que el calculo no quede sesgado en caso de que la secuecia este truncada por 5' if (final_hit.align_len == final_hit.s_len && putative_end)||(sq_end_distance.abs <= max_distance && putative_end && putative_end <= max_distance*2) #Stop in a Full-length. max_distance *2 is set by de margin of +-15aa at the end of aligment end_status = 'complete' elsif sq_end_distance < max_distance # si no tenemos suficiente secuencia para tener el stop (nos faltan 15 aas o mas) end_status = 'incomplete' if putative_end $global_warnings << ['UnexpSTOP3pDist', sq_end_distance.abs] else $global_warnings << ['DistSubj', sq_end_distance.abs] end else # tenemos suficiente secuencia if putative_end # tenemos un stop #beg_end_string indica en que punto del unigen se encuentra el area de busqueda del codon stop stop_q_s = beg_end_string + putative_end - final_hit.s_len # Space between query's stop and subject's stop if stop_q_s.abs <= max_distance #Stop codon is in search region end_status = 'complete' elsif stop_q_s < 0 $global_warnings << 'UnexpSTOP3p' elsif stop_q_s > 0 end_status = 'complete' $global_warnings << 'QueryTooLong' end else # no tenemos codon de parada pero tenemos suficiente secuencia end_status = 'incomplete' $global_warnings << 'ProtFusion' end end end final_prot = atg_substring + end_substring end_status = 'complete' if final_prot.length == final_hit.s_len+1 && final_prot[final_prot.length-1] == '*' new_q_end = final_hit.q_beg-1 + final_prot.length * 3 + frame_shift modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) if $verbose > 1 final_hit.q_end = new_q_end return end_status, final_prot end def determine_status(atg_status, end_status) if atg_status != 'incomplete' && end_status != 'incomplete' # proteina completa type = COMPLETE elsif atg_status == 'incomplete' && end_status == 'incomplete' # region intermedia type = INTERNAL elsif atg_status != 'incomplete' && end_status == 'incomplete' # tenemos el principio de la proteina type = N_TERMINAL elsif atg_status == 'incomplete' && end_status != 'incomplete' # tenemos el final de la proteina type = C_TERMINAL end if atg_status == 'putative' || end_status == 'putative' status = false # Putative else status = true # Sure end return type, status end def compare_seq_length_with_subject(final_prot, distance, final_hit, type, status) if final_prot.length - 2 * distance > final_hit.s_len $global_warnings << ['SeqLonger', final_prot.length, final_hit.s_len] elsif final_prot.length + 2 * distance < final_hit.s_len $global_warnings << ['SeqShorter', final_prot.length, final_hit.s_len] if final_prot.length + 100 < final_hit.s_len || final_prot.length*2 < final_hit.s_len if type == COMPLETE status = false $global_warnings << 'VeryShorter' end end end return status end def save_annotations(seq, final_hit, type, status, final_prot, query_fasta, db_name) # if the sequence is Complete or it hasn't previous info will be saved if seq.type == UNKNOWN || (type == COMPLETE && seq.type != COMPLETE) seq.type = type seq.status = status seq.db_name = db_name seq.seq_fasta = query_fasta seq.seq_aa = final_prot seq.hit = final_hit seq.warnings($global_warnings) $global_warnings = [] # Clean all warnings for current sequence seq.seq_nt = mark_nt_seqs(final_hit, query_fasta) if type == COMPLETE seq.ignore = true end end if $verbose > 2 puts "\e[1mStruct annot: #{seq.prot_annot_calification}\e[0m" end end def mark_nt_seqs(final_hit, query_fasta) atg = query_fasta[final_hit.q_beg..final_hit.q_beg + 2] mark_atg = nil if atg == 'ATG' mark_atg = '_-_' end stop = query_fasta[final_hit.q_end - 2..final_hit.q_end] mark_stop = nil if stop == 'TAG' || stop == 'TGA' || stop == 'TAA' mark_stop = '___' end seq5p = query_fasta[0..final_hit.q_beg-1] orf = query_fasta[final_hit.q_beg..final_hit.q_end] seq3p = query_fasta[final_hit.q_end..query_fasta.length] nt_seq = "#{seq5p}#{mark_atg}#{orf}#{mark_stop}#{seq3p}" return nt_seq end def exonerate_fix_frame_shift(query_fasta, hit) frame_shifts = [] added_nts = 0 hit.each_with_index do |hsp, num| if hsp.class.to_s == 'ExoBlastHit' #Only this type of class of BlastHit has frameshift attributes if !hsp.q_frameshift.empty? #There is frameshift hsp.q_frameshift.each do |position, num_nts| local_add = 3 - num_nts fs_final_position = position + num_nts $global_warnings << ['ExFrameS', fs_final_position] frame_shifts << [fs_final_position, local_add] added_nts += local_add end end end hsp.q_beg += added_nts if num > 0 hsp.q_end += added_nts end add = 0 frame_shifts.each do |position, num_nts| query_fasta = query_fasta.insert(position+add, 'n'*num_nts) add += num_nts end return query_fasta end ## VERBOSE METHODS def show_nts show = false show = true if $verbose && $verbose > 3 return show end def modify_3p_align(new_q_end, final_hit, query_fasta, final_prot) ## For visual report if new_q_end > final_hit.q_end #There is an align extension extend_align = query_fasta[final_hit.q_end+1 .. new_q_end].translate final_hit.q_seq = final_hit.q_seq + extend_align elsif new_q_end < final_hit.q_end #The align is cutted upper_limit = final_prot.length - 1 + final_hit.q_seq.count('-') final_hit.q_seq = final_hit.q_seq[0 .. upper_limit] end end def modify_5p_align(new_q_beg, final_hit, query_fasta) ## For visual report if new_q_beg < final_hit.q_beg #There is an align extension extend_align = query_fasta[new_q_beg .. final_hit.q_beg-1].translate final_hit.q_seq = extend_align + final_hit.q_seq elsif new_q_beg > final_hit.q_beg #The align is cut seq_cut = (new_q_beg - final_hit.q_beg)/3 gaps = final_hit.q_seq[0..seq_cut].count('-') seq_cut += gaps final_hit.q_seq = final_hit.q_seq[seq_cut .. final_hit.q_seq.length-1] end end end