# Summary statistics for the tables found under 'fl2_results/'.
#
# The module is meant to be mixed into a host object; every path is relative
# to the current working directory.
module Fl2Stats
  # -------------------------------------------------------------------------------- Main

  # Writes 'fl2_results/summary_stats.txt': the annotation summary, the
  # TestCode summary and, last, the sum of the record counts of both tables.
  #
  # The report is opened with the block form of File.open so that it is
  # flushed and closed when the method returns, even if a reader raises.
  #
  # @return [nil]
  # @raise [SystemCallError] when the report cannot be created or an input
  #   table cannot be read
  def summary_stats
    File.open('fl2_results/summary_stats.txt', 'w') do |stats_file|
      num1 = annotation_stats(stats_file)
      num2 = testcode_stats(stats_file)
      stats_file.puts "\nInput sequences in your fasta: #{num1 + num2}\n\n"
    end
  end

  # ---------------------------------------------------------------------------------- Functions

  # Increments the per-database slot of a counter array, in place.
  #
  # Slot 2 is used for names starting with 'sp_', slot 3 for names starting
  # with 'tr_' and slot 1 for everything else (including nil). Slot 0 is not
  # touched here.
  #
  # @param db_name [String, nil] database name of the hit
  # @param array [Array<Integer>] counter array with at least four slots
  # @return [Array<Integer>] the same (mutated) array
  def stats_my_db(db_name, array)
    if db_name =~ /^sp_/
      array[2] += 1
    elsif db_name =~ /^tr_/
      array[3] += 1
    else
      array[1] += 1
    end
    array
  end

  # Summarises 'fl2_results/annotations.txt' into +stats_file+.
  #
  # Each data row is split on tabs into: name, fasta_length, acc, db_name,
  # status, five columns that are not used here, and msgs. Rows starting with
  # "Query_id\t" are treated as headers and skipped.
  #
  # @param stats_file [#puts] destination of the report lines
  # @return [Integer] number of data rows read
  def annotation_stats(stats_file)
    seqs_number = 0
    error_1_num = 0
    all_accs = []
    complete_accs = []
    all_lengths = fl2_new_length_tally
    complete_lengths = fl2_new_length_tally

    # One [total, user_db, sp, tr] counter per recognised status; rows with
    # any other status only contribute to the global figures.
    by_status = {
      'Complete' => [0, 0, 0, 0],
      'Putative Complete' => [0, 0, 0, 0],
      'C-terminus' => [0, 0, 0, 0],
      'Putative C-terminus' => [0, 0, 0, 0],
      'N-terminus' => [0, 0, 0, 0],
      'Putative N-terminus' => [0, 0, 0, 0],
      'Internal' => [0, 0, 0, 0],
      'Coding Seq' => [0, 0, 0, 0]
    }

    fl2_each_record('fl2_results/annotations.txt') do |fields|
      _name, fasta_length, acc, db_name, status = fields
      msgs = fields[10] # nil when the row has fewer than 11 columns
      length = fasta_length.to_i

      seqs_number += 1
      all_accs.push(acc)
      fl2_tally_length(all_lengths, length)
      error_1_num += 1 if msgs =~ /ERROR#1/

      counters = by_status[status]
      next unless counters

      counters[0] += 1
      by_status[status] = stats_my_db(db_name, counters)
      next unless status == 'Complete'

      complete_accs.push(acc)
      fl2_tally_length(complete_lengths, length)
    end

    complete = by_status['Complete']
    complete_ids = complete_accs.uniq.count

    # NOTE: the ">200"/">500" labels are historical; the counters behind them
    # use >= (see fl2_tally_length).
    stats_file.puts "--- Annotation Summary ---"
    stats_file.puts "\n------------------------------ Summary of sequences found by similarity -----"
    stats_file.puts "\n\tSequences found: #{seqs_number}\t\t(>200: #{all_lengths[:longer_200]}, <200: #{all_lengths[:shorter_200]})\t(>500: #{all_lengths[:longer_500]}, <500: #{all_lengths[:shorter_500]})"
    stats_file.puts "\tDifferent IDs: #{all_accs.uniq.count}"
    stats_file.puts "\n\tsequences with sense and antisense hits error: #{error_1_num}"
    stats_file.puts "\n------------------------------------------------- Full-Length Sequences -----"
    stats_file.puts "\tComplete Seqs: #{complete[0]} (#{fl2_percent(complete[0], seqs_number)} %)\t\t(>200: #{complete_lengths[:longer_200]}, <200: #{complete_lengths[:shorter_200]})\t(>500: #{complete_lengths[:longer_500]}, <500: #{complete_lengths[:shorter_500]})"
    stats_file.puts "\tDifferent IDs: #{complete_ids} (#{fl2_percent(complete_ids, seqs_number)} %)"
    stats_file.puts "\n\t\tuser_db: #{complete[1]}\n\t\tsp: #{complete[2]}\n\t\ttr: #{complete[3]}"
    stats_file.puts "-----------------------------------------------------------------------------"

    # Remaining statuses, in report order: [label printed, status counted].
    [
      ['putative completes', 'Putative Complete'],
      ['n-terminus', 'N-terminus'],
      ['putative_n_terminus', 'Putative N-terminus'],
      ['c-terminus', 'C-terminus'],
      ['putative_c_terminus', 'Putative C-terminus'],
      ['internal', 'Internal'],
      ['coding sequences with unknown status', 'Coding Seq']
    ].each do |label, status|
      counters = by_status[status]
      stats_file.puts "\n\t#{label}: #{counters[0]}\n\t\tuser_db: #{counters[1]}\n\t\tsp: #{counters[2]}\n\t\ttr: #{counters[3]}"
    end

    seqs_number
  end

  # Summarises 'fl2_results/tcode_result.txt' into +stats_file+.
  #
  # Each data row is split on tabs; the second column is the sequence length
  # and the fifth the status ('coding', 'putative_coding' or 'unknown'). Rows
  # starting with "Query_id\t" are treated as headers and skipped.
  #
  # The 200 bp and 500 bp buckets are computed independently, so a 300 bp
  # sequence is "longer than 200 bp" and "shorter than 500 bp".
  #
  # @param stats_file [#puts] destination of the report lines
  # @return [Integer] number of data rows read
  def testcode_stats(stats_file)
    seqs_number = 0
    coding = 0
    putative_coding = 0
    unknown = 0
    coding_lengths = fl2_new_length_tally
    unknown_lengths = fl2_new_length_tally

    fl2_each_record('fl2_results/tcode_result.txt') do |fields|
      length = fields[1].to_i
      seqs_number += 1

      case fields[4]
      when 'coding'
        coding += 1
        fl2_tally_length(coding_lengths, length)
      when 'putative_coding'
        putative_coding += 1
      when 'unknown'
        unknown += 1
        fl2_tally_length(unknown_lengths, length)
      end
    end

    stats_file.puts "\n--------------------------- Test Code Summary\n\n\ttotal seqs: #{seqs_number}"
    stats_file.puts "\n\tcoding sequences: #{coding}"
    fl2_put_length_lines(stats_file, coding_lengths)
    stats_file.puts "\n\tputative coding sequences: #{putative_coding}\n"
    stats_file.puts "\n\tunknown: #{unknown} (#{fl2_percent(unknown, seqs_number)} %)"
    fl2_put_length_lines(stats_file, unknown_lengths)
    stats_file.puts "\n\tUnknown sequences have a bad test code score or haven't got an ORF longer than 200 nt"
    stats_file.puts "---------------------------------------------"

    seqs_number
  end

  private

  # Yields the tab-split fields of every non-header line of +path+.
  # File.foreach closes the file once iteration ends or raises.
  def fl2_each_record(path)
    File.foreach(path) do |raw|
      line = raw.chomp
      next if line =~ /^Query_id\t/

      yield line.split("\t")
    end
  end

  # Returns a zeroed set of length-bucket counters.
  def fl2_new_length_tally
    { longer_200: 0, shorter_200: 0, longer_500: 0, shorter_500: 0 }
  end

  # Adds one sequence of +length+ to both the 200 and the 500 buckets.
  # "longer" means >= the threshold, "shorter" means strictly below it.
  def fl2_tally_length(tally, length)
    tally[length >= 200 ? :longer_200 : :shorter_200] += 1
    tally[length >= 500 ? :longer_500 : :shorter_500] += 1
    tally
  end

  # Writes the four "longer/shorter than N bp" lines of the TestCode report.
  def fl2_put_length_lines(stats_file, tally)
    stats_file.puts "\t\tlonger than 200 bp: #{tally[:longer_200]}"
    stats_file.puts "\t\tshorter than 200 bp: #{tally[:shorter_200]}"
    stats_file.puts "\t\tlonger than 500 bp: #{tally[:longer_500]}"
    stats_file.puts "\t\tshorter than 500 bp: #{tally[:shorter_500]}"
  end

  # Formats part/total as a percentage with three decimals; an empty total
  # yields '0.000' instead of the 'NaN' that 0.0/0.0 would print.
  def fl2_percent(part, total)
    return '0.000' if total.zero?

    format('%.3f', part.to_f / total.to_f * 100)
  end
end