require 'types.rb'

module FlnStats

  # Counters tracked for the standard summary, in the order they are written out.
  def initialize_stats_hash
    stats_hash = {}
    ['input_seqs', 'output_seqs', 'failed', 'sequences_>200', 'sequences_>500',
     'longest_unigene', 'good_seqs', 'artifacts', 'misassembled', 'chimeras',
     'other_artifacts', 'unknown', 'unknown_>200', 'unknown_>500',
     'prot_annotated', 'complete', 'complete_sure', 'complete_putative',
     'n_terminal', 'n_terminal_sure', 'n_terminal_putative',
     'c_terminal', 'c_terminal_sure', 'c_terminal_putative', 'internal',
     'swissprot', 'trembl', 'userdb', 'ncrna',
     'coding', 'coding_sure', 'coding_putative', 'coding_>200', 'coding_>500',
     'different_orthologues', 'different_completes', 'BA_index'].each do |key|
      stats_hash[key] = 0
    end
    return stats_hash
  end

  # Derives a two-word organism name from a hit description line and
  # increments its count in the taxonomy hash.
  def get_taxonomy(name, taxonomy)
    organism = nil
    if name.include?('OS=')
      fields = name.split('OS=', 2)
      organism = fields.last.split(' GN=').first.strip
    elsif name[0..2] == 'sp=' || name[0..2] == 'tr='
      name =~ /(\w+ \w+) \(([\w ]+)\) \(([\w ]+)\)/
      if !$1.nil?
        organism = $1
      else
        name =~ /(\w+ \w+) \(([\w ]+)\)/
        if !$1.nil?
          organism = $1
        end
      end
    else
      organism = name.split(';', 2).last
      organism = organism.split('.', 2).first
      organism.gsub!(/\(\D+\)/, '')
      if organism.split(' ').length > 1
        organism.gsub!('.', '')
        organism.gsub!(/^ /, '')
        organism.gsub!(' ', '')
        organism.strip!
      end
    end
    if !organism.nil?
      organism = organism.split(' ')[0..1].join(' ')
      if taxonomy[organism].nil?
        taxonomy[organism] = 1
      else
        taxonomy[organism] += 1
      end
    end
  end
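
  # A minimal sketch of how get_taxonomy tallies organisms (the description
  # lines and counts below are made-up examples, not output from a real run):
  #
  #   taxonomy = {}
  #   get_taxonomy('sp|P12345|ALBU_HUMAN Serum albumin OS=Homo sapiens GN=ALB PE=1', taxonomy)
  #   get_taxonomy('tr|Q00001|Q00001_MOUSE Some protein OS=Mus musculus GN=Abc', taxonomy)
  #   get_taxonomy('sp|P67890|XYZ_HUMAN Another protein OS=Homo sapiens GN=XYZ', taxonomy)
  #   taxonomy #=> { 'Homo sapiens' => 2, 'Mus musculus' => 1 }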

  # Counters tracked for the representative transcriptome summary.
  def initialize_stats_hash_reptrans
    stats_hash = {}
    stats_hash['prot_annotated'] = 0
    stats_hash['est_annotated'] = 0
    stats_hash['coding_>1'] = 0
    stats_hash['coding_>0.94'] = 0
    stats_hash['coding_>0.84'] = 0
    stats_hash['coding_>0.73'] = 0
    stats_hash['coding_>0'] = 0
    return stats_hash
  end

  # Accumulates per-batch statistics over the classified sequences.
  def summary_stats(seqs, stats_hash, diff_ids_array, diff_ids_complete_array)
    low_limit = 200
    upper_limit = 500

    # All seqs
    #-----------
    stats_hash['output_seqs'] += seqs.length
    good_seqs = seqs.select{|s| s.type >= UNKNOWN}
    stats_hash['good_seqs'] += good_seqs.length

    # Longest unigene
    current_longest_unigene = seqs.map{|s| s.fasta_length}.max
    if current_longest_unigene > stats_hash['longest_unigene']
      stats_hash['longest_unigene'] = current_longest_unigene
    end

    # Load hit ids of protein-annotated sequences
    seqs.each do |s|
      diff_ids_array << s.hit.acc if s.type > UNKNOWN && s.type < NCRNA
    end
    diff_ids_array.uniq!

    # By length
    stats_hash['sequences_>200'] += good_seqs.select{|s| s.fasta_length > low_limit}.length
    stats_hash['sequences_>500'] += good_seqs.select{|s| s.fasta_length > upper_limit}.length
    stats_hash['failed'] += seqs.select{|s| s.type == FAILED}.length

    # Unknown
    #-----------------------------
    all_unknown = seqs.select{|s| s.type == UNKNOWN}
    stats_hash['unknown'] += all_unknown.length
    # By length
    stats_hash['unknown_>200'] += all_unknown.select{|s| s.fasta_length > low_limit}.length
    stats_hash['unknown_>500'] += all_unknown.select{|s| s.fasta_length > upper_limit}.length

    # Artifacts
    #----------------
    stats_hash['artifacts'] += seqs.select{|s| s.type < UNKNOWN && s.type > FAILED}.length
    stats_hash['misassembled'] += seqs.select{|s| s.type == MISASSEMBLED}.length
    stats_hash['chimeras'] += seqs.select{|s| s.type == CHIMERA && !s.seq_name.include?('_split_')}.length # avoid counting a split chimera more than once
    stats_hash['other_artifacts'] += seqs.select{|s| s.type == OTHER}.length

    # Annotated with protein
    #-----------------------
    prot_annotated = seqs.select{|s| s.type >= COMPLETE && s.type <= INTERNAL}
    stats_hash['prot_annotated'] += prot_annotated.length

    # By annotation
    stats_hash['internal'] += seqs.select{|s| s.type == INTERNAL}.length
    complete = seqs.select{|s| s.type == COMPLETE}
    n_terminal = seqs.select{|s| s.type == N_TERMINAL}
    c_terminal = seqs.select{|s| s.type == C_TERMINAL}
    stats_hash['complete'] += complete.length
    stats_hash['n_terminal'] += n_terminal.length
    stats_hash['c_terminal'] += c_terminal.length

    # Load hit ids of complete sequences
    complete.each{|s| diff_ids_complete_array << s.hit.acc}
    diff_ids_complete_array.uniq!

    # By status
    stats_hash['complete_sure'] += complete.select{|s| s.status}.length
    stats_hash['n_terminal_sure'] += n_terminal.select{|s| s.status}.length
    stats_hash['c_terminal_sure'] += c_terminal.select{|s| s.status}.length
    stats_hash['complete_putative'] += complete.select{|s| !s.status}.length
    stats_hash['n_terminal_putative'] += n_terminal.select{|s| !s.status}.length
    stats_hash['c_terminal_putative'] += c_terminal.select{|s| !s.status}.length

    # By database
    swissprot = prot_annotated.select{|s| s.db_name =~ /^sp_/}.length
    trembl = prot_annotated.select{|s| s.db_name =~ /^tr_/}.length
    stats_hash['swissprot'] += swissprot
    stats_hash['trembl'] += trembl
    stats_hash['userdb'] += prot_annotated.length - swissprot - trembl

    # ncRNA
    #----------------
    stats_hash['ncrna'] += seqs.select{|s| s.type == NCRNA}.length

    # Coding sequences
    #----------------
    coding = seqs.select{|s| s.type == CODING}
    stats_hash['coding'] += coding.length
    # By status
    stats_hash['coding_sure'] += coding.select{|s| s.status}.length
    stats_hash['coding_putative'] += coding.select{|s| !s.status}.length
    # By length
    stats_hash['coding_>200'] += coding.select{|s| s.fasta_length > low_limit}.length
    stats_hash['coding_>500'] += coding.select{|s| s.fasta_length > upper_limit}.length

    return stats_hash, diff_ids_array, diff_ids_complete_array
  end
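
  # BA index sketch, restating what last_stats computes below (the numbers are
  # made up purely to illustrate the arithmetic):
  #
  #   coef_anot_geom = (prot_annotated * complete) / (sequences_>500 * 10000)
  #   coef_mejora    = (different_orthologues + different_completes) /
  #                    (prot_annotated + complete)
  #   BA_index       = sqrt(coef_anot_geom * coef_mejora)
  #
  #   e.g. prot_annotated = 8000, complete = 5000, sequences_>500 = 9000,
  #        different_orthologues = 6000, different_completes = 4000
  #        -> coef_anot_geom ~ 0.44, coef_mejora ~ 0.77, BA_index ~ 0.58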

  # Final figures that can only be computed once every batch has been processed.
  def last_stats(stats_hash, diff_ids_array, diff_ids_complete_array)
    stats_hash['different_orthologues'] = diff_ids_array.length
    stats_hash['different_completes'] = diff_ids_complete_array.length
    # BA index
    if stats_hash['prot_annotated'] > 0 && stats_hash['complete'] > 0 &&
       stats_hash['sequences_>500'] > 0 && stats_hash['different_orthologues'] > 0 &&
       stats_hash['different_completes'] > 0
      coef_anot_geom = (stats_hash['prot_annotated'] * stats_hash['complete'] * 1.0) / (stats_hash['sequences_>500'] * 10000)
      coef_mejora = (stats_hash['different_orthologues'] * 1.0 + stats_hash['different_completes']) / (stats_hash['prot_annotated'] + stats_hash['complete'])
      stats_hash['BA_index'] = Math.sqrt(coef_anot_geom * coef_mejora)
    end
    return stats_hash
  end

  # Assigns a coding sequence to its test_code score bin.
  def coding_stats_reptrans(coding_seq, stats_hash)
    group = nil
    if coding_seq.t_code > 1
      group = 'coding_>1'
    elsif coding_seq.t_code > 0.95
      group = 'coding_>0.94'
    elsif coding_seq.t_code > 0.85
      group = 'coding_>0.84'
    elsif coding_seq.t_code > 0.73
      group = 'coding_>0.73'
    elsif coding_seq.t_code > 0
      group = 'coding_>0'
    end
    if !group.nil?
      stats_hash[group] += 1
    end
  end
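
  # Bucketing sketch for coding_stats_reptrans above (scores are invented):
  #   t_code = 1.20 -> 'coding_>1'
  #   t_code = 0.90 -> 'coding_>0.84'
  #   t_code = 0.50 -> 'coding_>0'
  #   t_code = 0    -> no bin is incremented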

  def write_summary_stats(stats_hash, stats_taxonomy, diff_ids_array, diff_ids_complete_array, txt_file, html_file)
    stats_hash = last_stats(stats_hash, diff_ids_array, diff_ids_complete_array)
    write_txt(stats_hash, txt_file)
    write_html(stats_hash, html_file, stats_taxonomy)
  end

  def write_reptrans_stats(stats_hash, html_file, txt_file)
    File.open(txt_file, 'w'){|txt| write_txt(stats_hash, txt)}
    File.open(html_file, 'w'){|html| write_html_reptrans(stats_hash, html)}
  end

  def write_html_reptrans(stats_hash, html_file)
    html_file.puts ''
    header(html_file)
    body_reptrans(html_file, stats_hash)
    html_file.puts ''
  end

  # One "<value>\t<key>" line per counter.
  def write_txt(stats_hash, file)
    stats_hash.each do |key, value|
      file.puts "#{value}\t#{key}"
    end
  end

  def write_html(stats_hash, html_file, stats_taxonomy)
    js_path = File.dirname(html_file.to_path)
    zip_path = File.join(File.dirname(__FILE__), '..', '..', 'expresscanvas.zip')
    system("unzip -qq #{zip_path} -d #{js_path}") if !File.exist?(File.join(js_path, 'expresscanvas'))
    html_file.puts ''
    html_header(html_file, stats_hash, stats_taxonomy)
    body(html_file, stats_hash)
    html_file.puts ''
  end

  # Page head with the report title.
  def header(html_file)
    html_file.puts '', 'FLN Summary', ''
  end

  def html_header(html_file, stats_hash, stats_taxonomy)
    structural_data_sure = []
    structural_data_sure << stats_hash['unknown']
    structural_data_sure << stats_hash['complete_sure']
    structural_data_sure << stats_hash['n_terminal_sure']
    structural_data_sure << stats_hash['c_terminal_sure']
    structural_data_sure << stats_hash['internal']
    structural_data_sure << stats_hash['ncrna']
    structural_data_sure << stats_hash['coding']

    structural_data_putative = []
    structural_data_putative << 0
    structural_data_putative << stats_hash['complete_putative']
    structural_data_putative << stats_hash['n_terminal_putative']
    structural_data_putative << stats_hash['c_terminal_putative']
    structural_data_putative << 0
    structural_data_putative << 0
    structural_data_putative << stats_hash['coding_putative']

    values_structural_sure = "[#{structural_data_sure.map{|stat| stat*100.0/stats_hash['good_seqs']}.join(', ')}]"
    values_structural_putative = "[#{structural_data_putative.map{|stat| stat*100.0/stats_hash['good_seqs']}.join(', ')}]"

    # Most frequent organisms in the taxonomy tally
    data = stats_taxonomy.to_a.sort{|s2, s1| s1.last <=> s2.last}[0..20]
    smps_taxonomy = "['#{data.map{|tax| tax.first}.join("', '")}']"
    values_taxonomy = "[#{data.map{|tax| tax.last}.join(', ')}]"

    # Page head: the report title and the chart setup that consumes the arrays
    # prepared above.
    html_file.puts ' FLN Summary '
  end

  def body_reptrans(html_file, stats_hash)
    html_file.puts '', ''  # Start body
    html_file.puts '', 'Full-LengtherNEXT Representative Transcriptome Summary', ''
    # TABLES
    html_file.puts ''
    reptrans_report(html_file, stats_hash, 'left')
    reptrans_acumulative_report(html_file, stats_hash, 'right')
    html_file.puts ''
    # END TABLES
    html_file.puts '', ''  # End body
  end

  def body(html_file, stats_hash)
    html_file.puts '', ''  # Start body
    html_file.puts '', 'Full-LengtherNEXT Summary', ''
    # TABLES
    html_file.puts ''
    general_report(html_file, stats_hash, 'left')
    assembly_report(html_file, stats_hash, 'right')
    html_file.puts ''
    html_file.puts ''
    status_graph(html_file, 'left')
    status_report(html_file, stats_hash, 'right')
    html_file.puts ''
    html_file.puts ''
    taxonomy_graph(html_file, 'left')
    database_report(html_file, stats_hash, 'right')
    html_file.puts ''
    # END TABLES
    html_file.puts '', ''  # End body
  end

  def reptrans_report(html_file, stats_hash, align)
    html = []
    all_seqs = stats_hash.values.inject(0){|sum, v| sum + v}
    html << ''
    html << table_title('Sequences info')
    html.concat(table_header(['', 'Sequences', '%'], 0))
    html.concat(single_row('Output', all_seqs, all_seqs))
    html.concat(single_row('Annotated with protein', stats_hash['prot_annotated'], all_seqs))
    html.concat(single_row('Annotated with EST', stats_hash['est_annotated'], all_seqs))
    html.concat(single_row('Coding test-code > 1', stats_hash['coding_>1'], all_seqs))
    html.concat(single_row('Coding test-code > 0.94', stats_hash['coding_>0.94'], all_seqs))
    html.concat(single_row('Coding test-code > 0.84', stats_hash['coding_>0.84'], all_seqs))
    html.concat(single_row('Coding test-code > 0.73', stats_hash['coding_>0.73'], all_seqs))
    html.concat(single_row('Coding test-code > 0', stats_hash['coding_>0'], all_seqs))
    html << ''
    html << ''
    write_array_html(html, html_file)
  end

  def reptrans_acumulative_report(html_file, stats_hash, align)
    html = []
    all_seqs = stats_hash.values.inject(0){|sum, v| sum + v}
    html << ''
    html << table_title('Sequences summary (Cumulative)')
    html.concat(table_header(['', 'Sequences', '%'], 0))
    acumulative = 0
    html.concat(single_row('Annotated with protein', stats_hash['prot_annotated'], all_seqs))
    acumulative += stats_hash['prot_annotated']
    html.concat(single_row('Annotated with EST', stats_hash['est_annotated'] + acumulative, all_seqs))
    acumulative += stats_hash['est_annotated']
    html.concat(single_row('Coding test-code > 1', stats_hash['coding_>1'] + acumulative, all_seqs))
    acumulative += stats_hash['coding_>1']
    html.concat(single_row('Coding test-code > 0.94', stats_hash['coding_>0.94'] + acumulative, all_seqs))
    acumulative += stats_hash['coding_>0.94']
    html.concat(single_row('Coding test-code > 0.84', stats_hash['coding_>0.84'] + acumulative, all_seqs))
    acumulative += stats_hash['coding_>0.84']
    html.concat(single_row('Coding test-code > 0.73', stats_hash['coding_>0.73'] + acumulative, all_seqs))
    html << ''
    html << ''
    write_array_html(html, html_file)
  end
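
  # Reading the cumulative table above (invented counts): with
  # prot_annotated = 50, est_annotated = 20 and coding_>1 = 10, the rows show
  # 50, 70 and 80, i.e. each line counts the sequences explained at that
  # evidence level or better.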

  def general_report(html_file, stats_hash, align)
    html = []
    html << ''
    html << table_title('General info')
    html.concat(table_header(['', 'Sequences', '%'], 0))
    html.concat(single_row('Input', stats_hash['input_seqs'], stats_hash['input_seqs']))
    html.concat(single_row('Failing sequences', stats_hash['failed'], stats_hash['output_seqs']))
    html.concat(single_row('Artifacts 1', stats_hash['artifacts'], stats_hash['output_seqs']))
    html.concat(single_row('Misassembled', stats_hash['misassembled'], stats_hash['artifacts'], true))
    html.concat(single_row('Chimeras', stats_hash['chimeras'], stats_hash['artifacts'], true))
    html.concat(single_row('Other', stats_hash['other_artifacts'], stats_hash['artifacts'], true))
    html.concat(single_row('Sequences with resolved chimeras', stats_hash['output_seqs'], stats_hash['input_seqs']))
    html.concat(single_row('Sequences without artifacts', stats_hash['good_seqs'], stats_hash['output_seqs']))
    html.concat(single_row('BA index', "%5.2f" % [stats_hash['BA_index']], nil)) if stats_hash['BA_index'] > 0
    html << ''
    html << ''
    write_array_html(html, html_file)
  end

  def taxonomy_graph(html_file, align)
    html_file.puts ''
    html_file.puts table_title('Taxonomy distribution of annotations')
    html_file.puts ''
  end

  def database_report(html_file, stats_hash, align)
    html = []
    html << ''
    html << table_title('Database usage')
    html.concat(table_header(['', 'Unigenes', '%'], 0))
    html.concat(single_row('UserDB', stats_hash['userdb'], stats_hash['good_seqs']))
    html.concat(single_row('SwissProt', stats_hash['swissprot'], stats_hash['good_seqs']))
    html.concat(single_row('TrEMBL', stats_hash['trembl'], stats_hash['good_seqs']))
    html.concat(single_row('ncRNA', stats_hash['ncrna'], stats_hash['good_seqs']))
    html.concat(single_row('None', stats_hash['coding'] + stats_hash['unknown'], stats_hash['good_seqs']))
    html.concat(single_row('Total', stats_hash['good_seqs'], stats_hash['good_seqs']))
    html << ''
    html << ''
    write_array_html(html, html_file)
  end

  def assembly_report(html_file, stats_hash, align)
    html = []
    html << ''
    html << table_title('Report guiding assembly quality')
    html.concat(table_header(['', 'Unigenes', '%'], 0))
    html.concat(single_row('Unigenes', stats_hash['good_seqs'], stats_hash['good_seqs']))
    html.concat(single_row('Unigenes >500bp', stats_hash['sequences_>500'], stats_hash['good_seqs']))
    html.concat(single_row('Unigenes >200bp', stats_hash['sequences_>200'], stats_hash['good_seqs']))
    html.concat(single_row('Longest unigene', stats_hash['longest_unigene'], nil))
    html.concat(single_row('With orthologue 1', stats_hash['prot_annotated'], stats_hash['good_seqs']))
    html.concat(single_row('Different orthologue IDs', stats_hash['different_orthologues'], stats_hash['prot_annotated'], true))
    html.concat(single_row('Complete transcripts', stats_hash['complete'], stats_hash['prot_annotated'], true))
    html.concat(single_row('Different complete transcripts', stats_hash['different_completes'], stats_hash['prot_annotated'], true))
    html.concat(single_row('ncRNA', stats_hash['ncrna'], stats_hash['good_seqs']))
    without_orthologue = stats_hash['coding'] + stats_hash['unknown']
    html.concat(single_row('Without orthologue 1', without_orthologue, stats_hash['good_seqs']))
    html.concat(single_row('Coding (all)', stats_hash['coding'], without_orthologue, true))
    html.concat(single_row('Coding > 200bp', stats_hash['coding_>200'], without_orthologue, true))
    html.concat(single_row('Coding > 500bp', stats_hash['coding_>500'], without_orthologue, true))
    html.concat(single_row('Unknown (all)', stats_hash['unknown'], without_orthologue, true))
    html.concat(single_row('Unknown > 200bp', stats_hash['unknown_>200'], without_orthologue, true))
    html.concat(single_row('Unknown > 500bp', stats_hash['unknown_>500'], without_orthologue, true))
    html << ''
    html << '1 Percentages for subclassifications of this category were calculated using this line as the 100% reference.'
    html << ''
    write_array_html(html, html_file)
  end
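
  # Percentage bases in the tables above (invented counts): top-level rows use
  # the main totals (e.g. good_seqs) as 100%, while rows added with the extra
  # `true` flag are indented sub-rows whose 100% is their parent category.
  # For instance, with artifacts = 40 and chimeras = 10, the 'Chimeras' row
  # shows 25.00%.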

  def status_graph(html_file, align)
    html_file.puts ''
    html_file.puts table_title('Structural profile')
    html_file.puts ''
  end

  def status_report(html_file, stats_hash, align)
    html = []
    html << ''
    html << table_title('Status report')
    html.concat(table_header(['Status', 'Unigenes', '%'], 2))
    html.concat(fused_row('Complete', stats_hash['complete_sure'], stats_hash['complete_putative'], stats_hash['good_seqs']))
    html.concat(fused_row('C-terminus', stats_hash['c_terminal_sure'], stats_hash['c_terminal_putative'], stats_hash['good_seqs']))
    html.concat(fused_row('N-terminus', stats_hash['n_terminal_sure'], stats_hash['n_terminal_putative'], stats_hash['good_seqs']))
    html.concat(composed_single_row('Internal', stats_hash['internal'], stats_hash['good_seqs']))
    html.concat(fused_row('Coding', stats_hash['coding_sure'], stats_hash['coding_putative'], stats_hash['good_seqs']))
    html.concat(composed_single_row('ncRNA', stats_hash['ncrna'], stats_hash['good_seqs']))
    html.concat(composed_single_row('Unknown', stats_hash['unknown'], stats_hash['good_seqs']))
    html.concat(composed_single_row('Total', stats_hash['good_seqs'], stats_hash['good_seqs']))
    html << ''
    html << ''
    write_array_html(html, html_file)
  end

  def table_title(title)
    html = '' + title + ''
    return html
  end

  def table_header(col_array, colspan)
    html = []
    html << ''  # Table header
    html << ''
    col_array.each_with_index do |col, i|
      if i == 0 && colspan > 0
        html << '' + col + ''  # first header cell spans `colspan` columns
      else
        html << '' + col + ''
      end
    end
    html << ''
    return html
  end

  def single_row(name, magnitude, total, space = false)
    if space
      name = '     ' + name
    end
    html = []
    html << ''
    html << '' + name + ''
    html.concat(sub_row(magnitude, total))
    html << ''
    return html
  end

  def fused_row(type, sure_magnitude, putative_magnitude, total)
    html = []
    html << '' + type + ''  # row label shared by the Sure/Putative sub-rows
    html << seq_status('Sure')
    html.concat(sub_row(sure_magnitude, total))
    html << ''
    html << ''
    html << seq_status('Putative')
    html.concat(sub_row(putative_magnitude, total))
    html << ''
    return html
  end

  def seq_status(status)
    html = '' + status + ''
    return html
  end

  def sub_row(magnitude, total)
    if !total.nil?
      perc_float = magnitude * 100.0 / total
      if !perc_float.nan?
        percentage = '%.2f' % perc_float
        percentage += '%'
      else
        percentage = '-'
      end
    else
      percentage = '-'
    end
    html = []
    html << '' + magnitude.to_s + ''
    html << '' + percentage + ''
    return html
  end

  def composed_single_row(type, magnitude, total)
    html = []
    html << ''
    html << '' + type + ''
    html.concat(sub_row(magnitude, total))
    html << ''
    return html
  end

  def write_array_html(html, html_file)
    html.each{|line| html_file.puts line}
  end

end
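
# Minimal usage sketch. The FakeHit/FakeSeq structs and every value below are
# invented stand-ins for the real Full-LengtherNEXT sequence objects; the type
# constants (COMPLETE, UNKNOWN, ...) are expected to come from types.rb.
if __FILE__ == $0
  include FlnStats

  FakeHit = Struct.new(:acc)
  FakeSeq = Struct.new(:type, :status, :fasta_length, :seq_name, :hit, :db_name)

  seqs = [
    FakeSeq.new(COMPLETE, true, 750, 'seq1', FakeHit.new('P12345'), 'sp_example'),
    FakeSeq.new(UNKNOWN, false, 250, 'seq2', nil, nil)
  ]

  stats = initialize_stats_hash
  stats['input_seqs'] += seqs.length
  ids = []
  complete_ids = []
  stats, ids, complete_ids = summary_stats(seqs, stats, ids, complete_ids)
  stats = last_stats(stats, ids, complete_ids)
  write_txt(stats, $stdout)
end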