# Collects counts from the tab-separated result files under fln_results/
# (annotations.txt, tcode_result.txt, nc_rna.txt) and writes a summary report
# to fln_results/summary_stats.html. All paths are relative to the current
# working directory.
module FlnStats
  # Reads the three result files and writes the summary report.
  #
  # The inputs are read BEFORE the report is opened for writing, so a missing
  # or unreadable input raises (Errno::ENOENT etc.) without truncating an
  # existing report, and the report handle is closed even if writing fails.
  #
  # Returns nil.
  def summary_stats
    html_head, html_1, html_2, html_3, html_4 = html_code
    # Only the leading elements of each tuple are needed for the report.
    status_array, seqs_number1, error_1_num, seq_uniq, complete_uniq, seq_length_stats = annotation_stats
    tcode_array, seqs_number2, tcode_length_stats = testcode_stats
    ncrna_array = ncrna_stats
    total_seqs = seqs_number1 + seqs_number2 + ncrna_array[4].to_i

    File.open('fln_results/summary_stats.html', 'w') do |stats_file|
      stats_file.puts html_head
      stats_file.puts "\t\t\t\t#{total_seqs} sequences in your input fasta\n\t\t\t\n\t\t"

      # The tables are skipped entirely when nothing was counted; this also
      # keeps the percentage computation away from a zero denominator.
      if total_seqs > 0
        # Table 1: one line per input file.
        stats_file.puts html_1
        stats_file.puts table_row(
          ['YES', seqs_number1, percent_cell(seqs_number1, total_seqs), seq_uniq] + seq_length_stats[0, 4]
        )
        stats_file.puts table_row(
          ['NO', seqs_number2, percent_cell(seqs_number2, total_seqs), '-'] + tcode_length_stats[0, 4]
        )
        stats_file.puts table_row(
          ['ncRNA', ncrna_array[4], percent_cell(ncrna_array[4], total_seqs), '-'] + ncrna_array[0, 4]
        )
        stats_file.puts " #{error_1_num} Sequences with sense and antisense hits error\n"
        stats_file.puts " #{complete_uniq} Complete sequences with different ortologue ID\n"

        # Table 2: annotation status broken down by database.
        stats_file.puts html_2
        status_array.each do |status|
          stats_file.puts table_row(
            [status[4], status[0], percent_cell(status[0], total_seqs)] + status[1, 3]
          )
        end

        # Table 3: TestCode status (plus ncRNA) broken down by length.
        stats_file.puts html_3
        tcode_array.each do |status|
          stats_file.puts table_row(
            [status[5], status[4], percent_cell(status[4], total_seqs)] + status[0, 4]
          )
        end
        stats_file.puts table_row(
          ['Putative ncRNA', ncrna_array[4], percent_cell(ncrna_array[4], total_seqs)] + ncrna_array[0, 4]
        )
      end

      stats_file.puts html_4
    end
    nil
  end

  # Returns the fixed text fragments of the report as
  # [html_head, html_1, html_2, html_3, html_4]: the title block, the header
  # rows of the three tables, and the closing fragment.
  def html_code
    html_head = "\nFLN Annotation Summary\nFull-LengtherNEXT\nAnnotation summary\n"
    html_1 = table_row(
      ['Ortologue found', 'Sequences found', '%', 'Different IDs', '>200 bp', '<200 bp', '>500 bp', '<500 bp']
    )
    html_2 = table_row(['Status', 'Total', '%', 'UserDB', 'SwissProt', 'TrEMBL'])
    html_3 = table_row(['Status', 'Total', '%', '>200 bp', '<200 bp', '>500 bp', '<500 bp'])
    html_4 = "\n"
    [html_head, html_1, html_2, html_3, html_4]
  end

  # Increments the per-database slot of +array+ (mutated in place) according
  # to the prefix of +db_name+: "sp_" -> index 2, "tr_" -> index 3, anything
  # else (including nil) -> index 1.
  #
  # Returns +array+.
  def stats_my_db(db_name, array)
    case db_name
    when /^sp_/ then array[2] += 1
    when /^tr_/ then array[3] += 1
    else array[1] += 1
    end
    array
  end

  # Tallies fln_results/annotations.txt. Empty lines and the header line
  # (starting with "Query_id<TAB>") are ignored.
  #
  # Columns used (0-based): 1 = sequence length, 2 = accession, 3 = database
  # name, 4 = status, 10 = messages.
  #
  # Returns [status_array, seqs_number, error_1_num, unique_accessions,
  # unique_complete_accessions, seq_length_stats, complete_seq_length_stats]
  # where each status_array entry is [total, userdb, swissprot, trembl, label]
  # and each *_length_stats is [>=200, <200, >=500, <500].
  def annotation_stats
    seqs_number = 0
    error_1_num = 0
    all_accs = []
    complete_accs = []
    seq_length_stats = [0, 0, 0, 0]
    complete_seq_length_stats = [0, 0, 0, 0]

    # Status value found in the file => [total, userdb, swissprot, trembl, label].
    # Insertion order is the row order of the report.
    # NOTE(review): status 'Coding Seq' is reported under the label
    # 'Misassembled'; kept as in the original -- confirm this is intended.
    by_status = {
      'Complete' => [0, 0, 0, 0, 'Complete'],
      'Putative Complete' => [0, 0, 0, 0, 'Putative Complete'],
      'C-terminus' => [0, 0, 0, 0, 'C-terminus'],
      'Putative C-terminus' => [0, 0, 0, 0, 'Putative C-terminus'],
      'N-terminus' => [0, 0, 0, 0, 'N-terminus'],
      'Putative N-terminus' => [0, 0, 0, 0, 'Putative N-terminus'],
      'Internal' => [0, 0, 0, 0, 'Internal'],
      'Coding Seq' => [0, 0, 0, 0, 'Misassembled']
    }

    each_tsv_line('fln_results/annotations.txt') do |line, fields|
      next unless data_line?(line)

      fasta_length = fields[1].to_i
      acc = fields[2]
      db_name = fields[3]
      status = fields[4]
      msgs = fields[10]

      seqs_number += 1
      all_accs << acc
      tally_length(fasta_length, seq_length_stats)
      error_1_num += 1 if msgs =~ /ERROR#1/

      # Rows with an unrecognised status still count towards the totals above.
      counter = by_status[status]
      next unless counter

      counter[0] += 1
      stats_my_db(db_name, counter)
      next unless status == 'Complete'

      complete_accs << acc
      tally_length(fasta_length, complete_seq_length_stats)
    end

    [
      by_status.values, seqs_number, error_1_num,
      all_accs.uniq.count, complete_accs.uniq.count,
      seq_length_stats, complete_seq_length_stats
    ]
  end

  # Tallies fln_results/tcode_result.txt. Empty lines and the header line
  # (starting with "Query_id<TAB>") are ignored.
  #
  # Columns used (0-based): 1 = sequence length, 4 = status
  # ('coding', 'putative_coding' or 'unknown').
  #
  # Returns [status_array, seqs_number, all_length_stats, coding_stats,
  # unknown_stats]. Each status entry is
  # [>=200, <200, >=500, <500, total, label]; all_length_stats is
  # [>=200, <200, >=500, <500] over every data line.
  def testcode_stats
    seqs_number = 0
    all_tcode_stats = [0, 0, 0, 0]
    by_status = {
      'coding' => [0, 0, 0, 0, 0, 'Coding'],
      'putative_coding' => [0, 0, 0, 0, 0, 'Putative Coding'],
      'unknown' => [0, 0, 0, 0, 0, 'Unknown']
    }

    each_tsv_line('fln_results/tcode_result.txt') do |line, fields|
      next unless data_line?(line)

      seqs_number += 1
      fasta_length = fields[1].to_i
      tally_length(fasta_length, all_tcode_stats)

      counter = by_status[fields[4]]
      next unless counter

      counter[4] += 1
      tally_length(fasta_length, counter)
    end

    [by_status.values, seqs_number, all_tcode_stats, by_status['coding'], by_status['unknown']]
  end

  # Tallies the lines of fln_results/nc_rna.txt whose status column (index 4)
  # is exactly 'Putative ncRNA'; every other line is ignored.
  #
  # Returns [>=200, <200, >=500, <500, total].
  def ncrna_stats
    ncrna_array = [0, 0, 0, 0, 0]
    each_tsv_line('fln_results/nc_rna.txt') do |_line, fields|
      next unless fields[4] == 'Putative ncRNA'

      ncrna_array[4] += 1
      tally_length(fields[1].to_i, ncrna_array)
    end
    ncrna_array
  end

  private

  # Formats one report row: a leading newline, then every cell followed by
  # " |" and a newline.
  def table_row(cells)
    "\n" + cells.map { |cell| "#{cell} |\n" }.join
  end

  # Formats +count+ as a percentage of +total+ with two decimals, e.g. "16.67 %".
  def percent_cell(count, total)
    format('%.2f %%', 100 * count.to_f / total.to_f)
  end

  # Updates the first four slots of +counters+ in place for one sequence:
  # index 0/1 for length >= / < 200, index 2/3 for length >= / < 500.
  def tally_length(length, counters)
    counters[length >= 200 ? 0 : 1] += 1
    counters[length >= 500 ? 2 : 3] += 1
  end

  # Yields [line, fields] for every line of +path+, where line has its line
  # terminator removed and fields is the line split on tabs. File.foreach
  # closes the file when iteration ends, including on an exception.
  def each_tsv_line(path)
    File.foreach(path) do |raw|
      line = raw.chomp
      yield line, line.split("\t")
    end
  end

  # True for lines that carry data: not empty and not the "Query_id" header.
  def data_line?(line)
    !line.empty? && !line.start_with?("Query_id\t")
  end
end