module FlnStats
# Writes the run summary to fln_results/summary_stats.html.
#
# Combines the counters returned by annotation_stats, testcode_stats and
# ncrna_stats and prints four tables (status report, unigene report,
# database usage and assembly-quality report) between the fixed fragments
# supplied by html_code. The tables are skipped when no sequence was
# counted; the head and closing fragments are always written.
#
# Returns nil.
def summary_stats
  # Percentage cell: 100 * part / whole with two decimals. A zero
  # denominator yields "0.00 %" because NaN/Infinity cannot be formatted
  # and would abort the whole report.
  percent = lambda do |part, whole|
    denominator = whole.to_f
    value = denominator.zero? ? 0.0 : 100 * part.to_f / denominator
    format('%.2f', value) + ' %'
  end

  # One table row: a leading newline, then every cell followed by " |".
  row = lambda do |*cells|
    "\n" + cells.map { |cell| "#{cell} |\n" }.join
  end

  # The block form guarantees the report file is closed even on error.
  File.open('fln_results/summary_stats.html', 'w') do |stats_file|
    # Fixed text fragments.
    html_head, html_st, html_uni, html_db, html_as, html_end = html_code

    # Counters collected from the result files.
    status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one = annotation_stats
    tcode_array, seqs_number2, unk_200, tc_uni_500, tc_uni_200, tc_longest_one = testcode_stats
    ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one = ncrna_stats

    total_seqs = seqs_number1 + seqs_number2 + ncrna_total.to_i
    uni_500 = db_uni_500 + tc_uni_500 + nc_uni_500
    uni_200 = db_uni_200 + tc_uni_200 + nc_uni_200
    longest_one = [db_longest_one, tc_longest_one, nc_longest_one].max

    stats_file.puts html_head

    if total_seqs > 0
      # ---- Status report table -------------------------------------------
      stats_file.puts html_st
      status_suma = 0

      status_array.each do |count, label|
        leading_cells =
          if label == 'Internal' || label == 'Misassembled'
            [label]
          elsif label =~ /^Putative/
            ['Putative']
          else
            [label, 'Sure']
          end
        stats_file.puts row.call(*leading_cells, count, percent.call(count, total_seqs))
        status_suma += count
      end

      # Coding and putative coding rows. Every testcode class (Unknown
      # included) is added to the running total here; Unknown is printed
      # further down.
      tcode_array.each do |count, label|
        if label == 'Coding'
          stats_file.puts row.call(label, 'Sure', count, percent.call(count, total_seqs))
        elsif label == 'Putative Coding'
          stats_file.puts row.call('Putative', count, percent.call(count, total_seqs))
        end
        status_suma += count
      end

      # ncRNA row.
      stats_file.puts row.call('Putative ncRNA', ncrna_total, percent.call(ncrna_total, total_seqs))
      status_suma += ncrna_total

      # Unknown rows.
      tcode_array.each do |count, label|
        next unless label =~ /Unknown/i
        stats_file.puts row.call(label, count, percent.call(count, total_seqs))
      end

      # Grand total.
      stats_file.puts row.call('Total', status_suma, percent.call(status_suma, total_seqs))

      # ---- Unigene report table ------------------------------------------
      coding_total = tcode_array[0][0]
      putative_coding_total = tcode_array[1][0]
      unknown_total = tcode_array[2][0]
      new_genes = coding_total + putative_coding_total
      total_uni = seqs_number1 + new_genes + ncrna_total + unknown_total

      stats_file.puts html_uni
      stats_file.puts row.call('With orthologue in DBs', seqs_number1, percent.call(seqs_number1, total_seqs))
      stats_file.puts row.call('Putative New Genes', new_genes, percent.call(new_genes, total_seqs))
      stats_file.puts row.call('ncRNAs', ncrna_total, percent.call(ncrna_total, total_seqs))
      stats_file.puts row.call('Unknown', unknown_total, percent.call(unknown_total, total_seqs))
      stats_file.puts row.call('Total', total_uni, percent.call(total_uni, total_seqs))

      # ---- Database usage table ------------------------------------------
      stats_file.puts html_db
      db_names = ['UserDB', 'SwissProt', 'TrEMBL']
      total_db = 0
      db_usage.each_with_index do |used, index|
        total_db += used
        stats_file.puts row.call(db_names[index], used, percent.call(used, total_seqs))
      end

      # Sequences that were not annotated against any database.
      no_db = seqs_number2 + ncrna_total.to_i
      stats_file.puts row.call('None', no_db, percent.call(no_db, total_seqs))
      total_db += no_db
      stats_file.puts row.call('Total', total_db, percent.call(total_db, total_seqs))

      # ---- Report guiding assembly quality -------------------------------
      stats_file.puts html_as
      stats_file.puts row.call('Unigenes', total_seqs, percent.call(total_seqs, total_seqs))
      stats_file.puts row.call('Unigenes >500pb', uni_500, percent.call(uni_500, total_seqs))
      stats_file.puts row.call('Unigenes >200pb', uni_200, percent.call(uni_200, total_seqs))
      stats_file.puts row.call('Longest unigene', longest_one, '-')

      # Rows below "With orthologue" are relative to seqs_number1.
      stats_file.puts row.call('With orthologue 1', seqs_number1, percent.call(seqs_number1, total_seqs))
      stats_file.puts row.call('Different orthologue IDs', seq_uniq, percent.call(seq_uniq, seqs_number1))
      stats_file.puts row.call('Complete transcripts', status_array[0][0], percent.call(status_array[0][0], seqs_number1))
      stats_file.puts row.call('Different complete transcripts', complete_uniq, percent.call(complete_uniq, seqs_number1))
      stats_file.puts row.call('Misassembled', error_1_num, percent.call(error_1_num, seqs_number1))

      # The percentage uses the same no_db count that the row displays and
      # that the sub-rows below take as their 100% reference.
      stats_file.puts row.call('Without orthologue 1', no_db, percent.call(no_db, total_seqs))
      stats_file.puts row.call('Coding', coding_total, percent.call(coding_total, no_db))
      stats_file.puts row.call('Putative Coding', putative_coding_total, percent.call(putative_coding_total, no_db))
      stats_file.puts row.call('Putative ncRNA', ncrna_total, percent.call(ncrna_total, no_db))
      stats_file.puts row.call('Unknown (all)', unknown_total, percent.call(unknown_total, no_db))
      stats_file.puts row.call('Unknown < 200bp', unk_200, percent.call(unk_200, no_db)) +
                      '1 Percents for subclassifications of this category were calculated using this line as 100% reference.'
    end

    stats_file.puts html_end
  end
end
# Supplies the fixed text fragments of the summary report.
#
# Returns a six-element array in this order: page head, status-report
# header, unigene-report header, database-usage header, assembly-quality
# header and the closing fragment.
def html_code
  # Column captions shared by every table header.
  table_columns = "|\nUnigenes |\n% |\n"

  page_head      = "\nFLN Summary\nFull-LengtherNEXT Summary\n"
  status_table   = "\nStatus report\nStatus " + table_columns
  unigene_table  = "\nUnigene report\n" + table_columns
  database_table = "\nDatabase usage\n" + table_columns
  assembly_table = "\nReport guiding assembly quality\n" + table_columns
  page_end       = "\n"

  [page_head, status_table, unigene_table, database_table, assembly_table, page_end]
end
# Collects counters from fln_results/dbannotated.txt (tab-separated).
#
# The header line (starting with "Query_id<TAB>") and empty lines are
# skipped. Columns read per line: 2 = sequence length, 3 = accession,
# 4 = database name, 5 = status, 11 = messages.
#
# Returns an array with, in order:
#   status_array  - [count, label] pairs for Complete, Putative Complete,
#                   C-terminus, Putative C-terminus, N-terminus,
#                   Putative N-terminus, Internal and Misassembled
#   db_usage      - [other databases, names starting "sp_", names starting "tr_"]
#   seqs_number   - number of data lines
#   error_1_num   - lines whose messages contain "ERROR#1"
#   distinct accessions over all lines
#   distinct accessions over lines with status "Complete"
#   uni_500       - sequences with length >= 500
#   uni_200       - sequences with length >= 200
#   longest_one   - largest sequence length seen
#
# Raises Errno::ENOENT when the input file does not exist.
def annotation_stats
  status_labels = ['Complete', 'Putative Complete', 'C-terminus', 'Putative C-terminus',
                   'N-terminus', 'Putative N-terminus', 'Internal', 'Misassembled']
  status_counts = Hash[status_labels.map { |label| [label, 0] }]

  seqs_number = 0
  error_1_num = 0
  uni_500 = 0
  uni_200 = 0
  longest_one = 0
  all_accs = []
  complete_accs = []
  # userdb, SwissProt, TrEMBL
  db_usage = [0, 0, 0]

  # File.foreach closes the file when the block finishes or raises.
  File.foreach('fln_results/dbannotated.txt') do |raw_line|
    line = raw_line.chomp
    next if line.empty? || line =~ /^Query_id\t/

    _name, fasta_length, acc, db_name, status, _k1, _k2, _k3, _k4, _k5, msgs = line.split("\t")
    length = fasta_length.to_i

    seqs_number += 1
    longest_one = length if length > longest_one
    all_accs << acc

    # Anything that is neither sp_* nor tr_* counts as the user database.
    if db_name =~ /^sp_/
      db_usage[1] += 1
    elsif db_name =~ /^tr_/
      db_usage[2] += 1
    else
      db_usage[0] += 1
    end

    uni_200 += 1 if length >= 200
    uni_500 += 1 if length >= 500
    error_1_num += 1 if msgs =~ /ERROR#1/

    # Statuses outside the known list are counted as sequences but not
    # tallied in any status bucket.
    next unless status_counts.key?(status)
    status_counts[status] += 1
    complete_accs << acc if status == 'Complete'
  end

  status_array = status_labels.map { |label| [status_counts[label], label] }
  [status_array, db_usage, seqs_number, error_1_num, all_accs.uniq.count,
   complete_accs.uniq.count, uni_500, uni_200, longest_one]
end
# Collects counters from fln_results/new_coding.txt (tab-separated).
#
# The header line (starting with "Query_id<TAB>") and empty lines are
# skipped. Column 2 is read as the sequence length and column 5 as the
# status ("coding", "putative_coding" or "unknown"; other values are
# counted as sequences but not tallied in any status bucket).
#
# Returns an array with, in order:
#   status_array - [[n, 'Coding'], [n, 'Putative Coding'], [n, 'Unknown']]
#   seqs_number  - number of data lines
#   unk_200      - "unknown" sequences shorter than 200
#   uni_500      - sequences with length >= 500
#   uni_200      - sequences with length >= 200
#   longest_one  - largest sequence length seen
#
# Raises Errno::ENOENT when the input file does not exist.
def testcode_stats
  seqs_number = 0
  unk_200 = 0
  uni_500 = 0
  uni_200 = 0
  longest_one = 0

  # total, status
  coding_stats = [0, 'Coding']
  p_coding_stats = [0, 'Putative Coding']
  unknown_stats = [0, 'Unknown']
  counters_by_status = {
    'coding' => coding_stats,
    'putative_coding' => p_coding_stats,
    'unknown' => unknown_stats
  }

  # File.foreach closes the file when the block finishes or raises.
  File.foreach('fln_results/new_coding.txt') do |raw_line|
    line = raw_line.chomp
    next if line.empty? || line =~ /^Query_id\t/

    _name, fasta_length, _acc, _db_name, status = line.split("\t")
    length = fasta_length.to_i

    seqs_number += 1
    longest_one = length if length > longest_one
    uni_200 += 1 if length >= 200
    uni_500 += 1 if length >= 500
    unk_200 += 1 if length < 200 && status == 'unknown'

    counter = counters_by_status[status]
    counter[0] += 1 if counter
  end

  [[coding_stats, p_coding_stats, unknown_stats], seqs_number, unk_200, uni_500, uni_200, longest_one]
end
# Collects counters from fln_results/nc_rnas.txt (tab-separated).
#
# Only lines whose fifth column equals "Putative ncRNA" are counted;
# column 2 is read as the sequence length.
#
# Returns [nc_total, uni_500, uni_200, longest_one]: the number of counted
# lines, how many have length >= 500, how many have length >= 200, and the
# largest length seen.
#
# Raises Errno::ENOENT when the input file does not exist.
def ncrna_stats
  uni_500 = 0
  uni_200 = 0
  nc_total = 0
  longest_one = 0

  # File.foreach closes the file when the block finishes or raises.
  File.foreach('fln_results/nc_rnas.txt') do |raw_line|
    _name, fasta_length, _acc, _db_name, status = raw_line.chomp.split("\t")
    next unless status == 'Putative ncRNA'

    length = fasta_length.to_i
    longest_one = length if length > longest_one
    uni_200 += 1 if length >= 200
    uni_500 += 1 if length >= 500
    nc_total += 1
  end

  [nc_total, uni_500, uni_200, longest_one]
end
end