module FlnStats
# Builds the run summary and writes it to fln_results/summary_stats.html.
#
# Counters come from annotation_stats, testcode_stats, ncrna_stats and
# chimera_stats; the fixed page fragments come from html_code. When no
# sequence was counted at all, only the page heading and footer are written.
#
# Percentages are relative to the total number of unigenes, except for the
# sub-rows listed under "With orthologue" and "Without orthologue", which use
# their parent row as the 100% reference (this is what the footnote written at
# the end of the report says).
#
# The counters are gathered before the output file is opened, and the file is
# opened in block form, so the handle is always closed, even on error.
#
# @return [nil]
def summary_stats
  size_filter1 = 200
  size_filter2 = 500

  # Fixed fragments of the page.
  html_head, html_st, html_uni, html_db, html_as, html_end = html_code

  # Gather the data we need from the result files.
  status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq,
    db_uni_500, db_uni_200, db_longest_one = annotation_stats(size_filter1, size_filter2)
  tcode_array, seqs_number2, tc_uni_500, tc_uni_200, tc_longest_one =
    testcode_stats(size_filter1, size_filter2)
  ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one = ncrna_stats(size_filter1, size_filter2)
  chimera_total, ch_uni_500, ch_uni_200, ch_longest_one, ch_db_usage =
    chimera_stats(size_filter1, size_filter2)

  # Putative chimeras are reported together with the annotated sequences.
  seqs_number1 += chimera_total.to_i
  no_db = seqs_number2 + ncrna_total.to_i
  total_seqs = seqs_number1 + no_db
  uni_500 = db_uni_500 + tc_uni_500 + nc_uni_500 + ch_uni_500
  uni_200 = db_uni_200 + tc_uni_200 + nc_uni_200 + ch_uni_200
  longest_one = [db_longest_one, tc_longest_one, nc_longest_one, ch_longest_one].max
  (0..2).each { |i| db_usage[i] += ch_db_usage[i] }

  coding, putative_coding, unknown = tcode_array

  File.open('fln_results/summary_stats.html', 'w') do |stats_file|
    # One report row: a leading newline, then every cell followed by " |\n".
    emit = ->(*cells) { stats_file.puts("\n" + cells.map { |cell| "#{cell} |\n" }.join) }
    # Percentage cell; the reference defaults to the total number of unigenes.
    pct = ->(part, whole = total_seqs) { ('%.2f' % (100 * part.to_f / whole.to_f)) + ' %' }

    stats_file.puts html_head

    if total_seqs > 0
      # ---- Status report table ------------------------------------------
      stats_file.puts html_st
      status_suma = 0
      status_array.each do |count, label|
        if label == 'Internal' || label == 'Misassembled'
          emit.call(label, count, pct.call(count))
        elsif label =~ /^Putative/
          emit.call('Putative', count, pct.call(count))
        else
          emit.call(label, 'Sure', count, pct.call(count))
        end
        status_suma += count
      end

      # Chimeric sequences.
      emit.call('Putative chimera', chimera_total, pct.call(chimera_total))
      status_suma += chimera_total

      # Coding and putative coding rows; the unknown ones are only added to
      # the running total here and printed further down.
      tcode_array.each do |count, label|
        case label
        when 'Coding'          then emit.call(label, 'Sure', count, pct.call(count))
        when 'Putative Coding' then emit.call('Putative', count, pct.call(count))
        end
        status_suma += count
      end

      # ncRNAs.
      emit.call('Putative ncRNA', ncrna_total, pct.call(ncrna_total))
      status_suma += ncrna_total

      # Unknown sequences.
      tcode_array.each do |count, label|
        emit.call(label, count, pct.call(count)) if label =~ /Unknown/i
      end

      # Grand total.
      emit.call('Total', status_suma, pct.call(status_suma))

      # ---- Unigene report table -----------------------------------------
      new_genes = coding[0] + putative_coding[0]
      total_uni = seqs_number1 + new_genes + ncrna_total + unknown[0]
      stats_file.puts html_uni
      emit.call('With orthologue in DBs', seqs_number1, pct.call(seqs_number1))
      emit.call('Putative New Genes', new_genes, pct.call(new_genes))
      emit.call('ncRNAs', ncrna_total, pct.call(ncrna_total))
      emit.call('Unknown', unknown[0], pct.call(unknown[0]))
      emit.call('Total', total_uni, pct.call(total_uni))

      # ---- Database usage table -----------------------------------------
      stats_file.puts html_db
      db_names = ['UserDB', 'SwissProt', 'TrEMBL']
      total_db = 0
      db_usage.each_with_index do |hits, i|
        total_db += hits
        emit.call(db_names[i], hits, pct.call(hits))
      end
      emit.call('None', no_db, pct.call(no_db))
      total_db += no_db
      emit.call('Total', total_db, pct.call(total_db))

      # ---- Report guiding assembly quality ------------------------------
      stats_file.puts html_as
      emit.call('Unigenes', total_seqs, pct.call(total_seqs))
      emit.call("Unigenes >#{size_filter2}pb", uni_500, pct.call(uni_500))
      emit.call("Unigenes >#{size_filter1}pb", uni_200, pct.call(uni_200))
      emit.call('Longest unigene', longest_one, '-')

      emit.call('With orthologue 1', seqs_number1, pct.call(seqs_number1))
      if seqs_number1 > 0
        # Sub-rows: 100% is the "With orthologue" count.
        emit.call('Different orthologue IDs', seq_uniq, pct.call(seq_uniq, seqs_number1))
        emit.call('Complete transcripts', status_array[0][0], pct.call(status_array[0][0], seqs_number1))
        emit.call('Different complete transcripts', complete_uniq, pct.call(complete_uniq, seqs_number1))
        emit.call('Misassembled', error_1_num, pct.call(error_1_num, seqs_number1))
        emit.call('Putative chimera', chimera_total, pct.call(chimera_total, seqs_number1))
      end

      # The percentage is computed from the same count that the row shows
      # (no_db), so that "With" + "Without" orthologue add up to 100%.
      emit.call('Without orthologue 1', no_db, pct.call(no_db))
      if no_db > 0 && seqs_number2 > 0
        # Sub-rows: 100% is the "Without orthologue" count.
        size_rows = lambda do |label, stats|
          emit.call("#{label} (all)", stats[0], pct.call(stats[0], no_db))
          emit.call("#{label} > #{size_filter1}bp", stats[2], pct.call(stats[2], no_db))
          emit.call("#{label} > #{size_filter2}bp", stats[3], pct.call(stats[3], no_db))
        end
        size_rows.call('Coding', coding)
        size_rows.call('Putative Coding', putative_coding)
        emit.call('Putative ncRNA', ncrna_total, pct.call(ncrna_total, no_db))
        size_rows.call('Unknown', unknown)
      end

      stats_file.puts '
1 Percents for subclassifications of this category were calculated using this line as 100% reference.'
    end

    stats_file.puts html_end
  end
  nil
end
# Returns the fixed text fragments of the summary page.
#
# @return [Array<String>] six fragments, in this order: page heading, header of
#   the status report, header of the unigene report, header of the database
#   usage report, header of the assembly quality report, and the page footer.
def html_code
  # All section headers share the two trailing columns; only the title and the
  # first column differ.
  section = lambda do |title, first_column|
    ['', title, first_column, 'Unigenes |', '% |', ''].join("\n")
  end

  page_top = "\nFLN Summary\nFull-LengtherNEXT Summary\n"
  page_bottom = "\n"

  [
    page_top,
    section.call('Status report', 'Status |'),
    section.call('Unigene report', '|'),
    section.call('Database usage', '|'),
    section.call('Report guiding assembly quality', '|'),
    page_bottom
  ]
end
# Collects counters from fln_results/dbannotated.txt, a tab-separated file
# with one sequence per line. The header line (starting with "Query_id<TAB>")
# and empty lines are skipped.
#
# Columns read (0-based): 1 = sequence length, 2 = accession, 3 = database
# name, 4 = status, 10 = messages.
#
# @param size_filter1 [Integer] sequences at least this long count in uni_200
# @param size_filter2 [Integer] sequences at least this long count in uni_500
# @return [Array] nine values, in this order:
#   status_array   - [count, label] pairs, always in the fixed label order below
#   db_usage       - hit counts as [UserDB, SwissProt, TrEMBL]
#   seqs_number    - number of data lines
#   error_1_num    - lines whose message column contains "ERROR#1"
#   unique accessions over all lines
#   unique accessions over lines with status 'Complete'
#   uni_500, uni_200, longest_one
def annotation_stats(size_filter1, size_filter2)
  status_labels = ['Complete', 'Putative Complete', 'C-terminus', 'Putative C-terminus',
                   'N-terminus', 'Putative N-terminus', 'Internal', 'Misassembled']
  status_counts = Hash.new(0)
  db_usage = [0, 0, 0] # UserDB, SwissProt, TrEMBL
  seqs_number = 0
  error_1_num = 0
  uni_500 = 0
  uni_200 = 0
  longest_one = 0
  all_accs = []
  complete_accs = []

  # File.foreach closes the file when the block finishes or raises.
  File.foreach('fln_results/dbannotated.txt') do |raw_line|
    line = raw_line.chomp
    next if line.empty? || line =~ /^Query_id\t/

    fields = line.split("\t")
    length = fields[1].to_i
    acc = fields[2]
    db_name = fields[3]
    status = fields[4]
    msgs = fields[10]

    seqs_number += 1
    longest_one = length if length > longest_one
    all_accs << acc

    # Anything that is neither SwissProt (sp_) nor TrEMBL (tr_) is counted
    # as the user database, including a missing database column.
    db_index = case db_name
               when /^sp_/ then 1
               when /^tr_/ then 2
               else 0
               end
    db_usage[db_index] += 1

    uni_200 += 1 if length >= size_filter1
    uni_500 += 1 if length >= size_filter2
    error_1_num += 1 if msgs =~ /ERROR#1/

    # Statuses outside status_labels are tallied but never reported.
    status_counts[status] += 1
    complete_accs << acc if status == 'Complete'
  end

  status_array = status_labels.map { |label| [status_counts[label], label] }
  [status_array, db_usage, seqs_number, error_1_num, all_accs.uniq.count,
   complete_accs.uniq.count, uni_500, uni_200, longest_one]
end
# Collects counters from fln_results/new_coding.txt, a tab-separated file with
# one sequence per line. The header line (starting with "Query_id<TAB>") and
# empty lines are skipped.
#
# Columns read (0-based): 1 = sequence length, 4 = status ('coding',
# 'putative_coding' or 'unknown'; any other status only counts towards the
# overall numbers).
#
# @param size_filter1 [Integer] first length threshold
# @param size_filter2 [Integer] second length threshold
# @return [Array] five values, in this order:
#   status_array - three entries (Coding, Putative Coding, Unknown), each
#                  [total, label, count longer than size_filter1,
#                   count longer than size_filter2]
#   seqs_number  - number of data lines
#   uni_500      - lines with length >= size_filter2
#   uni_200      - lines with length >= size_filter1
#   longest_one  - largest length seen
def testcode_stats(size_filter1, size_filter2)
  seqs_number = 0
  uni_500 = 0
  uni_200 = 0
  longest_one = 0
  # status in the file => [total, label, > size_filter1, > size_filter2]
  stats_by_status = {
    'coding'          => [0, 'Coding', 0, 0],
    'putative_coding' => [0, 'Putative Coding', 0, 0],
    'unknown'         => [0, 'Unknown', 0, 0]
  }

  # File.foreach closes the file when the block finishes or raises.
  File.foreach('fln_results/new_coding.txt') do |raw_line|
    line = raw_line.chomp
    next if line.empty? || line =~ /^Query_id\t/

    fields = line.split("\t")
    length = fields[1].to_i
    status = fields[4]

    seqs_number += 1
    longest_one = length if length > longest_one
    uni_200 += 1 if length >= size_filter1
    uni_500 += 1 if length >= size_filter2

    bucket = stats_by_status[status]
    next unless bucket

    bucket[0] += 1
    # NOTE: the per-status size counters use a strict ">" while the overall
    # uni_200 / uni_500 counters above use ">="; kept exactly as it was.
    bucket[2] += 1 if length > size_filter1
    bucket[3] += 1 if length > size_filter2
  end

  status_array = stats_by_status.values_at('coding', 'putative_coding', 'unknown')
  [status_array, seqs_number, uni_500, uni_200, longest_one]
end
# Collects counters from fln_results/nc_rnas.txt, a tab-separated file with one
# sequence per line. Only lines whose status column (0-based column 4) is
# exactly 'Putative ncRNA' are counted, so header or empty lines are ignored
# without a dedicated check.
#
# @param size_filter1 [Integer] sequences at least this long count in uni_200
# @param size_filter2 [Integer] sequences at least this long count in uni_500
# @return [Array<Integer>] [nc_total, uni_500, uni_200, longest_one]
def ncrna_stats(size_filter1, size_filter2)
  nc_total = 0
  uni_500 = 0
  uni_200 = 0
  longest_one = 0

  # File.foreach closes the file when the block finishes or raises.
  File.foreach('fln_results/nc_rnas.txt') do |raw_line|
    fields = raw_line.chomp.split("\t")
    next unless fields[4] == 'Putative ncRNA'

    length = fields[1].to_i
    longest_one = length if length > longest_one
    uni_200 += 1 if length >= size_filter1
    uni_500 += 1 if length >= size_filter2
    nc_total += 1
  end

  [nc_total, uni_500, uni_200, longest_one]
end
# Collects counters from fln_results/chimeric_sequences.txt, a tab-separated
# file. The file is optional: when it does not exist every counter is zero.
#
# Only lines whose status column (0-based column 4) is exactly
# 'Putative chimera' are counted. All counts except the longest length are
# halved with integer division before being returned.
# NOTE(review): the halving suggests each chimera is listed on two lines in
# this file - confirm against the code that writes it.
#
# @param size_filter1 [Integer] sequences at least this long count in uni_200
# @param size_filter2 [Integer] sequences at least this long count in uni_500
# @return [Array] [ch_total, uni_500, uni_200, longest_one, db_usage] where
#   db_usage holds the hit counts as [UserDB, SwissProt, TrEMBL]
def chimera_stats(size_filter1, size_filter2)
  path = 'fln_results/chimeric_sequences.txt'
  db_usage = [0, 0, 0] # UserDB, SwissProt, TrEMBL
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  return [0, 0, 0, 0, db_usage] unless File.exist?(path)

  ch_total = 0
  uni_500 = 0
  uni_200 = 0
  longest_one = 0

  # File.foreach closes the file when the block finishes or raises.
  File.foreach(path) do |raw_line|
    line = raw_line.chomp
    next if line.empty?

    fields = line.split("\t")
    next unless fields[4] == 'Putative chimera'

    length = fields[1].to_i
    longest_one = length if length > longest_one
    uni_200 += 1 if length >= size_filter1
    uni_500 += 1 if length >= size_filter2

    db_index = case fields[3]
               when /^sp_/ then 1
               when /^tr_/ then 2
               else 0
               end
    db_usage[db_index] += 1
    ch_total += 1
  end

  [ch_total / 2, uni_500 / 2, uni_200 / 2, longest_one, db_usage.map { |hits| hits / 2 }]
end
end