require 'orf' require 'types' require 'warnings' require 'common_functions' class Sequence attr_accessor :seq_name, :seq_fasta, :fasta_length, :db_name, :seq_nt, :seq_aa, :db, :type, :status, :id, :orfs, :area_without_annotation, :save_fasta, :ignore, :hit, :t_code, :functional_annotations, :fpkm, :coverage_analysis def initialize(seq_name, seq_fasta, seq_qual='') @seq_name = seq_name @seq_fasta = seq_fasta @fasta_length = seq_fasta.length @db_name = nil @seq_nt = nil # Unigen sequence with tagged ATG & stop @seq_aa = nil # Protein sequence generated over unigen @db =nil @type = UNKNOWN # See types.rb @status = FALSE # TRUE => Sure, FALSE => Putative @id = nil #Prot or EST id, can be several => array @warnings = [] @annotations=[] @functional_annotations = {} @orfs=[] #Mapping_info @fpkm = [] @coverage_analysis = [] @area_without_annotation=FALSE @save_fasta=TRUE @ignore=FALSE @hit=nil @t_code=0 end def add_orf(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type) orf = Orf.new(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type) @orfs << orf end def change_degenerated_nt! translate_hash = {} translate_hash['R']= [['a','g'],0] translate_hash['W']= [['a','t'],0] translate_hash['M']= [['a','c'],0] translate_hash['K']= [['g','t'],0] translate_hash['S']= [['g','c'],0] translate_hash['Y']= [['c','t'],0] translate_hash['H']= [['a','t','c'],0] translate_hash['B']= [['g','t','c'],0] translate_hash['D']= [['g','a','t'],0] translate_hash['V']= [['g','a','c'],0] translate_hash['N']= [['g','a','c','t'],0] fix_degenerated_fasta!(translate_hash) end def fix_degenerated_fasta!(translate_hash) s = @seq_fasta res = [] nts_of_a_line = s.split('') nts_of_a_line.map{|e| if (e =~ /[RWMKSYHBDVN]/) translate_hash[e][1] += 1 e = translate_hash[e][0][translate_hash[e][1]%translate_hash[e][0].length] end res << e } @seq_fasta=res.compact.join end def clean_orfs @orfs=[] end def reset_classification @type = UNKNOWN @status = FALSE end def clean_warnings @warnings = [] end def clean_annotations @annotations = [] end def get_acc acc=hit.acc return acc end def get_pident pident=hit.ident return pident end def format_chimera! @hit = [] end def warnings(warn) if warn.class.to_s == 'Array' warn.each do |w| @warnings << check_warn(w) end else @warnings << check_warn(warn) end end def clone_warnings(array_warnings) array_warnings.map{|warn| @warnings << warn.dup} end def check_warn(warn) check = warn replace = nil if warn.class.to_s == 'Array' check = warn.shift # Take warning tag message replace = warn # Take values to replace in message end message = $warnings_hash[check] if message.nil? message = check # If not exists the message end if !replace.nil? message = message.dup # Duplicate memory to avoid overwrite original warning hash messages replace.each do |rep| message.sub!('(*replace*)',"#{rep}") #message variable end end return message end def test_code(test_code) @t_code = test_code if @t_code >= 0.95 @status = TRUE end end def get_fasta(seq) fasta = ">#{@seq_name}\n#{seq}" return fasta end def write_info(output_files) # Output_files is a hash if @save_fasta output_files['seqs'].puts get_fasta(@seq_fasta) end if !@fpkm.empty? output_files['fpkm'].puts "#{@seq_name}\t#{@fpkm.join("\t")}" end if !@coverage_analysis.empty? all_coverages = @coverage_analysis.last output_files['coverage'].puts "#{@seq_name}\t#{@coverage_analysis[0..@coverage_analysis.length-1].join("\t")}" end case @type when OTHER write_other(output_files[@type]) when CHIMERA write_chimera(output_files[@type]) when UNMAPPED write_unmapped(output_files[@type]) when MISASSEMBLED write_misassembled(output_files[@type]) when UNKNOWN write_unknown(output_files[@type]) when COMPLETE .. INTERNAL write_prot_annot(output_files['db']) write_prot_seq(output_files['prot']) write_nt_seq(output_files['nts']) write_align(output_files['align']) when NCRNA write_ncrna(output_files[@type]) when CODING write_coding(output_files[@type]) else if @type != FAILED raise "#{@type} is an incorrect type" end end end def all_warns all = @warnings.join(' ') return all end def write_other(file) file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}\t#{all_warns}" end def write_unmapped(file) file.puts "#{@seq_name}\t#{@fasta_length}" end def write_chimera(file) #TODO : write 'SOLVED' tag @hit.each do |h| file.puts "#{@seq_name}\t#{@fasta_length}\t#{h.acc}\t#{@db_name}\t#{h.q_frame}\t#{h.e_val}\t#{h.ident}\t#{h.q_beg + 1}\t#{h.q_end + 1}\t#{h.s_beg + 1}\t#{h.s_end + 1}\t#{h.definition}" end file.puts end def write_misassembled(file) file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}" end def write_unknown(file) # ħit is an array. 2 => q_frame, 1 ORF end, 0 ORF beg if hit.class.to_s == 'Array' orf_beg = @hit[0] orf_end = @hit[1] q_frame = @hit[2] else orf_beg = '-' orf_end = '-' q_frame = '-' end file.puts "#{@seq_name}\t#{@fasta_length}\t#{@t_code}\t#{all_warns}\t#{q_frame}\t#{orf_beg}\t#{orf_end}" end def write_prot_annot(file) final_func_annot = Array.new(9, '-') if !@functional_annotations.empty? final_func_annot = @functional_annotations.values end file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}\t#{prot_annot_calification}\t#{@hit.e_val}\t#{@hit.ident}\t#{@hit.full_subject_length}\t#{@seq_aa.length}\t#{all_warns}\t#{@hit.q_frame}\t#{@hit.q_beg + 1}\t#{@hit.q_end + 1}\t#{@hit.s_beg + 1}\t#{@hit.s_end + 1}\t#{@hit.definition}\t#{final_func_annot.join("\t")}" end def write_ncrna(file) file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@hit.e_val}\t#{@hit.ident}\t#{@hit.q_beg + 1}\t#{@hit.q_end + 1}\t#{@hit.s_beg + 1}\t#{@hit.s_end + 1}\t#{@hit.definition}" end def write_coding(file) # ħit is an array. 2 => q_frame, 1 ORF end, 0 ORF beg calification = 'Putative' if @status calification = 'Sure' end file.puts "#{@seq_name}\t#{@fasta_length}\t#{calification}\t#{@t_code}\t#{@hit.last}\t#{@hit.first}\t#{@hit[1]}" end #Write complementary files def write_prot_seq(file) file.puts get_fasta(@seq_aa) end def write_align(file) tabs = (seq_name.length/8).ceil if tabs == 0 tabs = 1 end second_tab = 0 if seq_name.length > 7 second_tab = 1 end file.puts "#{@seq_name}#{"\t"*tabs}#{@hit.q_seq}\n#{@hit.acc}#{"\t"*(tabs+second_tab)}#{@hit.s_seq}" file.puts end def write_nt_seq(file) file.puts "#{@seq_name}\t#{@fasta_length}\t#{@seq_nt}" end def calification type_description = nil case @type when FAILED type_description = 'Failed' when OTHER type_description = 'Other' when UNMAPPED type_description = 'Unmapped' when CHIMERA type_description = 'Chimera' when MISASSEMBLED type_description = 'Misassembled' when UNKNOWN type_description = 'Unknown' when COMPLETE type_description = 'Complete' when N_TERMINAL type_description = 'N_terminal' when C_TERMINAL type_description = 'C_terminal' when INTERNAL type_description = 'Internal' when CODING type_description = 'Coding' when NCRNA type_description = 'NcRNA' end end def prot_annot_calification info = "#{calification} " if @status info << 'Sure' else info << 'Putative' end return info end def show_alignment(h, nts, show_nts, original_query_coordinates = nil) puts "Prot id:\t#{h.acc}", "Alignment length:\t#{h.align_len} aa", "Subject length:\t#{h.s_len} aa", "Query length:\t#{nts.length/3} aa" puts prot_annot_calification puts aa_unigen = nts[h.q_frame - 1 .. nts.length-1].translate index = contenidos_en_prot(h.q_seq, aa_unigen) # View desplacements 5-prime/align/3-prime subzone_align = nil if !original_query_coordinates.nil? subzone_align = {} if h.q_beg > original_query_coordinates.first #alignment has transferred characters to 5 prime subzone_align['beg'] = [original_query_coordinates.first, h.q_beg-3, 42] # -3 to exclude the last aa elsif h.q_beg < original_query_coordinates.first subzone_align['beg'] = [h.q_beg, original_query_coordinates.first-3, 46] #alignment has received characters from 5 prime end if h.q_end < original_query_coordinates.last #alignment has transferred characters to 3 prime subzone_align['end'] = [h.q_end, original_query_coordinates.last, 42] elsif h.q_end > original_query_coordinates.last subzone_align['end'] = [original_query_coordinates.last, h.q_end, 43] #alignment has received characters from 3 prime end end # Print 5 prime if index > 0 # 5 prime exists aa_align = aa_unigen[0 .. index-1].split('') nt_align = nts[h.q_frame-1..h.q_beg-1] print_alignment(aa_align, nt_align, 36, show_nts, subzone_align) reduce_coordinates(subzone_align, aa_align, h) end # Print core alignment or protein aa_align = h.q_seq.split('') nt_align = nts[h.q_beg..h.q_end] print_alignment(aa_align, nt_align, 32, show_nts, subzone_align) reduce_coordinates(subzone_align, aa_align, h) # Print 3 prime gaps = h.q_seq.count('-') three_prime_beg = index+h.q_seq.length-gaps if aa_unigen.length > three_prime_beg # 3 prime exists aa_align = aa_unigen[three_prime_beg .. aa_unigen.length-1].split('') fs = check_frame_shift(h) nt_align = nts[h.q_end+1-fs..nts.length-1] print_alignment(aa_align, nt_align, 33, show_nts, subzone_align) end end def print_alignment(aa_align, nt_align, color, show_nts, mark_subzone = nil) original_color = color c={ 'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A', 'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R', 'AAT'=>'N','AAC'=>'N', 'GAT'=>'D','GAC'=>'D', 'TGT'=>'C','TGC'=>'C', 'CAA'=>'Q','CAG'=>'Q', 'GAA'=>'E','GAG'=>'E', 'GGT'=>'G','GGC'=>'G','GGA'=>'G','GGG'=>'G', 'CAT'=>'H','CAC'=>'H', 'ATT'=>'I','ATC'=>'I','ATA'=>'I', 'TTA'=>'L','TTG'=>'L','CTT'=>'L','CTC'=>'L','CTA'=>'L','CTG'=>'L', 'ATG'=>'M', 'AAA'=>'K','AAG'=>'K', 'TTT'=>'F','TTC'=>'F', 'CCT'=>'P','CCC'=>'P','CCA'=>'P','CCG'=>'P', 'TCT'=>'S','TCC'=>'S','TCA'=>'S','TCG'=>'S','AGT'=>'S','AGC'=>'S', 'ACT'=>'T','ACC'=>'T','ACA'=>'T','ACG'=>'T', 'TGG'=>'W', 'TAT'=>'Y','TAC'=>'Y', 'GTT'=>'V','GTC'=>'V','GTA'=>'V','GTG'=>'V', 'TAG'=>'*','TGA'=>'*','TAA'=>'*'} nt_line = '' aa_line = '' gaps = 0 count = 0 aa_align.each_with_index do |aa, n| if aa == '-' nt_line << '---' gaps += 1 else # Check aa with codon codon_window = (n-gaps)*3 codon = nt_align[codon_window..codon_window+2] nt_line << "#{codon}" if aa.upcase != 'X' if codon.upcase.include?('N') traslated_aa = '-' else traslated_aa = c[codon] end if traslated_aa != '-' && traslated_aa != aa puts "#{traslated_aa} #{aa}" aa = '?' end end end if !mark_subzone.nil? nts_coordenate = (n-gaps)*3 mark_subzone.values.each do |subzone| if nts_coordenate >= subzone[0] && nts_coordenate <= subzone[1] #0 => first coordenate, 1 => second coordenate color = subzone.last end end end space = nil if show_nts space = ' ' end aa_line << "\e[#{color}m#{space}#{aa}#{space}\e[0m" color = original_color line_length = 60 if (n+1) % line_length == 0 || n+1 == aa_align.length count = n + 1 print "#{count}\t" puts aa_line if show_nts print "#{count*3}\t" puts nt_line end aa_line = '' nt_line = '' end end end def reduce_coordinates(subzone_align, aa_align, h) if !subzone_align.nil? aligned = 3 * aa_align.length + h.q_frame-1 subzone_align.values.each do |subzone| subzone[0]-= aligned subzone[1]-= aligned end end end def area_without_annotation? if @hit.class == Array hit = @hit.first else hit = @hit end upstream_annotation_space = hit.q_beg downstream_annotation_space = @fasta_length - hit.q_end if upstream_annotation_space >= 150 || downstream_annotation_space >= 150 @area_without_annotation = TRUE end return @area_without_annotation end def clone new_seq = self.dup new_seq.clean_annotations new_seq.clean_warnings new_seq.clean_orfs new_seq.clone_warnings(@warnings) new_seq.clone_annotations(@annotations) return new_seq end def clone_annotations(array_annotations) array_annotations.map{|annotation| @annotations << annotation.dup} end def unmapped? res = FALSE res = TRUE if !@coverage_analysis.empty? && @coverage_analysis[3] == 0 #3 => percentage of sequence covered by reads return res end end