# Mixin that summarises a results directory into a summary report.
#
# Three tab-separated inputs are read:
#   annotations.txt   - columns: name, fasta_length, acc, db_name, status,
#                       five unused columns, msgs
#   tcode_result.txt  - columns: name, fasta_length, acc, db_name, status
#   nc_rna.txt        - columns: name, fasta_length, acc, db_name, status
# and one output is written: summary_stats.html.
#
# Every length tally in this module is a four-slot array laid out as
# [>= 200, < 200, >= 500, < 500]; each sequence therefore increments exactly
# one of the first two slots and exactly one of the last two.
module FlnStats
  # Writes the summary report.
  #
  # results_dir - directory holding the input files and receiving
  #               summary_stats.html (defaults to 'fln_results', the location
  #               that used to be hard-coded).
  #
  # All inputs are read BEFORE the report is opened, so a missing or
  # unreadable input raises without leaving a truncated report behind, and the
  # report handle is closed by the File.open block even if writing fails.
  #
  # Returns nil.
  def summary_stats(results_dir = 'fln_results')
    status_array, seqs_number1, error_1_num, seq_uniq, complete_uniq, seq_length_stats, _complete_lengths =
      annotation_stats(File.join(results_dir, 'annotations.txt'))
    tcode_array, seqs_number2, tcode_length_stats, _coding_lengths, _unknown_lengths =
      testcode_stats(File.join(results_dir, 'tcode_result.txt'))
    ncrna_array = ncrna_stats(File.join(results_dir, 'nc_rna.txt'))
    html_head, html_1, html_2, html_3, html_4 = html_code

    ncrna_total = ncrna_array[4]
    total_seqs = seqs_number1 + seqs_number2 + ncrna_total.to_i

    File.open(File.join(results_dir, 'summary_stats.html'), 'w') do |out|
      out.puts html_head
      out.puts "\t\t\t\t#{total_seqs} sequences in your input fasta\n\t\t\t\n\t\t"

      # The percentage columns divide by total_seqs, so the tables are only
      # emitted when at least one sequence was counted.
      if total_seqs > 0
        out.puts html_1
        out.puts " YES #{seqs_number1} #{fln_percent(seqs_number1, total_seqs)} % #{seq_uniq} " \
                 "#{seq_length_stats[0, 4].join(' ')} "
        out.puts " NO #{seqs_number2} #{fln_percent(seqs_number2, total_seqs)} % - " \
                 "#{tcode_length_stats[0, 4].join(' ')} "
        out.puts " ncRNA #{ncrna_total} #{fln_percent(ncrna_total, total_seqs)} % - " \
                 "#{ncrna_array[0, 4].join(' ')} "
        out.puts "\n\n#{error_1_num} Sequences with sense and antisense hits error\n\n"
        out.puts "\n\n#{complete_uniq} Complete sequences with different ortologue ID\n\n"

        out.puts html_2
        # Rows are [total, userdb, swissprot, trembl, label].
        status_array.each do |row|
          out.puts " #{row[4]} #{row[0]} #{fln_percent(row[0], total_seqs)} % " \
                   "#{row[1, 3].join(' ')} "
        end

        out.puts html_3
        # Rows are [>=200, <200, >=500, <500, total, label].
        tcode_array.each do |row|
          out.puts " #{row[5]} #{row[4]} #{fln_percent(row[4], total_seqs)} % " \
                   "#{row[0, 4].join(' ')} "
        end
        # Non-coding RNA row, appended to the same table.
        out.puts " Putative ncRNA #{ncrna_total} #{fln_percent(ncrna_total, total_seqs)} % " \
                 "#{ncrna_array[0, 4].join(' ')} "
      end

      out.puts html_4
    end
    nil
  end

  # Returns the five static text fragments used by summary_stats, in the order
  # [html_head, html_1, html_2, html_3, html_4].
  #
  # NOTE(review): the fragments are reproduced exactly as they appear in this
  # file and contain no markup tags; confirm them against the intended
  # template before relying on the rendered report.
  def html_code
    html_head = " FLN Annotation Summary\n\nFull-LengtherNEXT\nAnnotation summary\n\n"
    html_1 = "\n"
    html_2 = "\nOrtologue found Sequences found % Different IDs >200 bp <200 bp >500 bp <500 bp\n"
    html_3 = "\nStatus Total % UserDB SwissProt TrEMBL\n\n"
    html_4 = ' '
    [html_head, html_1, html_2, html_3, html_4]
  end

  # Increments the per-database slot of a status counter, in place.
  #
  # db_name - database name; a 'sp_' prefix selects slot 2, a 'tr_' prefix
  #           selects slot 3, anything else (including nil) selects slot 1.
  # array   - counter laid out as [total, userdb, swissprot, trembl, label].
  #
  # Returns the same (mutated) array.
  def stats_my_db(db_name, array)
    slot = case db_name
           when /^sp_/ then 2
           when /^tr_/ then 3
           else 1
           end
    array[slot] += 1
    array
  end

  # Tallies the annotations file.
  #
  # path - annotations file (defaults to the previously hard-coded location).
  #
  # Empty lines and the header line (starting with "Query_id\t") are skipped.
  #
  # Returns [status_array, sequence count, count of rows whose msgs column
  # contains "ERROR#1", number of distinct accessions, number of distinct
  # accessions among 'Complete' rows, length tally of all rows, length tally
  # of 'Complete' rows]. status_array holds one
  # [total, userdb, swissprot, trembl, label] counter per known status.
  def annotation_stats(path = 'fln_results/annotations.txt')
    seqs_number = 0
    error_1_num = 0
    all_accs = []
    complete_accs = []
    seq_length_stats = [0, 0, 0, 0]
    complete_seq_length_stats = [0, 0, 0, 0]

    # Keyed by the status text found in the file; insertion order is the row
    # order of the report. 'Coding Seq' is reported under 'Misassembled'.
    counters = {
      'Complete'            => [0, 0, 0, 0, 'Complete'],
      'Putative Complete'   => [0, 0, 0, 0, 'Putative Complete'],
      'C-terminus'          => [0, 0, 0, 0, 'C-terminus'],
      'Putative C-terminus' => [0, 0, 0, 0, 'Putative C-terminus'],
      'N-terminus'          => [0, 0, 0, 0, 'N-terminus'],
      'Putative N-terminus' => [0, 0, 0, 0, 'Putative N-terminus'],
      'Internal'            => [0, 0, 0, 0, 'Internal'],
      'Coding Seq'          => [0, 0, 0, 0, 'Misassembled']
    }

    # File.foreach closes the file when iteration ends or raises.
    File.foreach(path) do |raw|
      line = raw.chomp
      next if line.empty? || line =~ /^Query_id\t/

      _name, fasta_length, acc, db_name, status, _k1, _k2, _k3, _k4, _k5, msgs = line.split("\t")
      length = fasta_length.to_i

      seqs_number += 1
      all_accs.push(acc)
      fln_tally_length(seq_length_stats, length)
      error_1_num += 1 if msgs =~ /ERROR#1/

      counter = counters[status]
      next unless counter # rows with an unrecognised status only count towards the totals

      counter[0] += 1
      stats_my_db(db_name, counter)
      next unless status == 'Complete'

      complete_accs.push(acc)
      fln_tally_length(complete_seq_length_stats, length)
    end

    [counters.values, seqs_number, error_1_num, all_accs.uniq.count,
     complete_accs.uniq.count, seq_length_stats, complete_seq_length_stats]
  end

  # Tallies the coding-potential results file.
  #
  # path - results file (defaults to the previously hard-coded location).
  #
  # Empty lines and the header line (starting with "Query_id\t") are skipped.
  #
  # Returns [status_array, sequence count, length tally of all rows, the
  # 'Coding' counter, the 'Unknown' counter]. Each counter is laid out as
  # [>=200, <200, >=500, <500, total, label]; the last two returned elements
  # are the same objects that appear inside status_array.
  def testcode_stats(path = 'fln_results/tcode_result.txt')
    seqs_number = 0
    all_tcode_stats = [0, 0, 0, 0]
    counters = {
      'coding'          => [0, 0, 0, 0, 0, 'Coding'],
      'putative_coding' => [0, 0, 0, 0, 0, 'Putative Coding'],
      'unknown'         => [0, 0, 0, 0, 0, 'Unknown']
    }

    File.foreach(path) do |raw|
      line = raw.chomp
      next if line.empty? || line =~ /^Query_id\t/

      _name, fasta_length, _acc, _db_name, status = line.split("\t")
      length = fasta_length.to_i

      seqs_number += 1
      fln_tally_length(all_tcode_stats, length)

      counter = counters[status]
      next unless counter # unrecognised statuses only count towards the totals

      counter[4] += 1
      fln_tally_length(counter, length)
    end

    [counters.values, seqs_number, all_tcode_stats, counters['coding'], counters['unknown']]
  end

  # Tallies rows of the ncRNA file whose status column is 'Putative ncRNA'.
  #
  # path - ncRNA file (defaults to the previously hard-coded location).
  #
  # Returns [>=200, <200, >=500, <500, total].
  def ncrna_stats(path = 'fln_results/nc_rna.txt')
    ncrna_array = [0, 0, 0, 0, 0]

    File.foreach(path) do |raw|
      _name, fasta_length, _acc, _db_name, status = raw.chomp.split("\t")
      next unless status == 'Putative ncRNA'

      ncrna_array[4] += 1
      fln_tally_length(ncrna_array, fasta_length.to_i)
    end

    ncrna_array
  end

  private

  # Adds one sequence of the given length to the first four slots of counts
  # ([>=200, <200, >=500, <500]), in place.
  def fln_tally_length(counts, length)
    counts[length >= 200 ? 0 : 1] += 1
    counts[length >= 500 ? 2 : 3] += 1
    counts
  end

  # Formats count as a percentage of total with two decimals (no '%' sign).
  def fln_percent(count, total)
    format('%.2f', 100 * count.to_f / total.to_f)
  end
end
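# NOTE(review): the text below was found after the module's closing `end`,
# where it is not valid Ruby. It looks like a table header that may have been
# displaced from one of the report fragments; confirm where it belongs.
# Status Total % >200 bp <200 bp >500 bp <500 bp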