require 'axml'
require 'hash_by'
require 'optparse'
require 'ostruct'
require 'spec_id'
require 'spec_id/precision'
#############################################################
# GLOBALS:
PRECISION_PROGRAM_BASE = 'precision'
DEF_PREFIX = "INV_"
DEF_PERCENT_FP = "5.0"
#############################################################
# @TODO: add group probability title (showin all group probabilities) for protein prob
#class String
# def margin
# self.gsub(/^\s*\|/,'')
# end
#end
class ProteinSummary
module HTML
def header
%Q{
}
end
def style
'
'
end
# an anchor and a title
def at(display, title)
"#{display}"
end
def trailer
%q{
}
end
def tr
"|
| #{yield}
|
\n".margin
end
def table
"|\n".margin
end
def tds(arr)
arr.map {|v| "#{v} | "}.join
end
def ths(arr)
str = arr.map {|v| "#{v} | "}.join
str << "\n"
end
end
end
class ProteinSummary
include ProteinSummary::HTML
def ref_html(gi, name)
"#{gi}"
end
# Takes the -prot.xml filename and grabs the png file (if available)
def error_info(prot_file_name)
img = prot_file_name.gsub('.xml', '.png')
img_bn = File.basename(img)
"![\"[](\"#{img_bn}\")
\n
"
end
# attempts to get the NCBI gi code
def accession(name)
if (name.include? '|') && (name[0,3] == 'gi|')
name.split('|')[1]
else
name
end
end
def flag_to_regex(flag, prefix=false)
if flag
if prefix
/^#{Regexp.escape(flag)}/
else
/#{Regexp.escape(flag)}/
end
else
nil
end
end
# given a list of proteins, output a tab delimited textfile with protein
# name and the total number of peptides found
def output_peptide_counts_file(prots, filename)
File.open(filename, "w") do |fh_out|
prots.each do |prot|
fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
end
end
end
# filters on the false positive regex and sorts by prot probability
def filter_and_sort(uniq_prots, flag=nil, prefix=false)
false_flag_re = flag_to_regex(flag, prefix)
sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
## filter on prefix
if prefix
sorted = sorted.reject {|prot| prot._protein_name =~ false_flag_re }
end
sorted
end
# assumes that these are sorted on probability
# desired_fppr is a float
# returns [number_of_prots, actual_fppr]
def num_prots_above_fppr(prots, desired_fppr)
current_fppr_rate_percent = 0.0
previous_fppr_rate_percent = 0.0
current_sum_one_minus_prob = 0.0
proteins_within_fppr = 0
actual_fppr = nil
already_found = false
prot_cnt = 0
prots.each do |prot|
prot_cnt += 1
# SUM(1-probX)/#prots
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
if current_fppr_rate_percent > desired_fppr && !already_found
actual_fppr = previous_fppr_rate_percent
proteins_within_fppr = prot_cnt
already_found = true
end
previous_fppr_rate_percent = current_fppr_rate_percent
end
[proteins_within_fppr, actual_fppr]
end
#### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
# returns a string of the table rows
# false_positive_rate (give as a %) is the cutoff mark
# returns the number of proteins at the desired_fppr (if given)
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
prot_cnt = 0
uniq_prots.map do |prot|
tr do
prot_cnt += 1
gi = accession(prot._protein_name)
tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), prot.annotation.first._protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
end
end.join
end
def print_html_pieces(file, *pieces)
File.open(file, "w") do |out|
pieces.each do |piece|
out.print piece
end
end
end
def file_info(file)
"Source File Information
File: #{File.expand_path(file)}
Last Modified: #{File.mtime(file)}
Size: #{File.size(file)/1000} KB
"
end
def bioworks_script_info(obj)
version = "3.2??"
if obj.version
version = obj.version
end
script_info{"Bioworks version #{version}"}
end
def protproph_script_info
begin
where = `which xinteract`
reply = `#{where}`
rescue Exception
reply = ""
end
prophet = "TPP (version unknown)" # put your version here if you can't get it dynamically
if reply =~ /xinteract.*?\((TPP .*)\)/
prophet = $1.dup
end
script_info { "ProteinProphet from: #{prophet}" }
end
def mspire_version
string = "mspire"
begin
if `gem list --local mspire` =~ /mspire \((.*?)\)/
string << (" v" + $1)
end
rescue Exception
end
string
end
def script_info
"Software Information
#{yield}
Ruby package: #{mspire_version}
Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}"
end
def proph_output(file, outfn, opt, fppr_output_as_html)
header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
num_cols = header_anchors.size
theaders = ths(header_anchors)
root = AXML.parse_file(file)
prots = []
## find the min_prob at a fppr of XX
min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
if opt.c
actual_percent_fp = opt.c.to_f
elsif opt.cut_at
actual_percent_fp = opt.cut_at.to_f
else
actual_percent_fp = nil
end
root.protein_group.each do |group|
group.protein.each do |prt|
prots << prt
end
end
uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f, opt.prefix)
## num proteins above cutoff (if opt.c)
num_prots_html = ''
if opt.c || opt.cut_at
(num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
end
if opt.cut_at
filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
end
output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
table_string = table do
tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
end
er_info = opt.precision ? error_info(file) : ""
html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
print_html_pieces(*html_pieces)
end # proph_output
# given a list of peptide sequences creates javascript to hide/show them
def peptide_cell(prot_num, peptide_sequences)
"#{peptide_sequences.size}#{peptide_sequences.join(', ')}
"
end
# takes spec_id object
# the outfn is the output filename
# opt is an OpenStruct that holds opt.f = the false prefix
def bioworks_output(spec_id, outfn, file=nil, false_flag_re=nil, fppr_output_as_html=nil)
fppr_output_as_html ||= ''
header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
num_cols = header_anchors.size
theaders = ths(header_anchors)
proteins = spec_id.prots
protein_num = 0
rows = ""
proteins.each do |prot|
if false_flag_re && prot.reference =~ false_flag_re
next
end
uniq_peps = Hash.new {|h,k| h[k] = true; }
protein_num += 1
prot.peps.each do |pep|
uniq_peps[pep.sequence.split('.')[1]] = true
end
pieces = prot.reference.split(' ')
long_prot_name = pieces.shift
annotation = pieces.join(' ')
accession = prot.accession
if accession == '0' ; accession = long_prot_name end
rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
end
table_string = table do
tr{theaders} + rows
end
print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(spec_id), table_string, trailer)
end # bioworks_output
def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
actual_cutoff = sprintf("%.3f", actual_cutoff)
desired_cutoff = sprintf("%.3f", desired_cutoff)
"False Positive Predictive Rate [ FP/(TP+FP) ]
Desired FPPR: #{desired_cutoff} %
Actual FPPR: #{actual_cutoff} %
Number of Proteins at Actual FPPR: #{num_proteins}
"
end
# transforms the output string of file_as_decoy into html
def file_as_decoy_to_html(string)
lines = string.split("\n")
#puts lines ?? is this supposed to be commented out?
lines = lines.reject do |obj| obj =~ /\*{10}/ end
lines.map! do |line| "#{line}
" end
"
Classification Analysis
#{lines.join("\n")}
"
end
# transforms the output string of file_as_decoy into html
def prefix_as_decoy_to_html(string)
"
Classification Analysis
" +
string
end
def create_from_command_line_args(argv)
@orig_argv = argv.dup
opt = OpenStruct.new
opt.f = DEF_PREFIX
opts = OptionParser.new do |op|
op.banner = "usage: #{File.basename(__FILE__)} [options] .xml ..."
op.separator " where file = bioworks -or- -prot (prophet output)"
op.separator " outputs: .summary.html"
op.separator ""
op.on("-f", "--false ", "ignore proteins with flag (def: #{DEF_PREFIX})") {|v| opt.f = v }
op.on("--prefix", "false flag for prefixes only") {|v| opt.prefix = v }
op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
op.separator(" if --precision then -f is used to specify a file or prefix")
op.separator(" that indicates the false positives.")
op.on("--peptide_count ", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
op.separator ""
op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
op.separator ""
op.separator "Specific to ProteinProphet (with no concatenated DB):"
op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
end
opts.parse!(argv)
if argv.size < 1
puts opts
return
end
fppr_output_as_html = ''
files = argv.to_a
files.each do |file|
outfn = file.sub(/\.xml$/, '.summary.html')
outfn = outfn.sub(/\.srg$/, '.summary.html')
## False Positive Rate Calculation:
if opt.precision
opt.o = outfn # won't actually be written over, but used
to_use_argv = create_precision_argv(file, opt)
(out_string, opt) = Prec.new.precision(to_use_argv)
fppr_output_as_html = prefix_as_decoy_to_html(out_string)
end
case SpecID.file_type(file)
when "protproph"
#spec_id = SpecID.new(file)
proph_output(file, outfn, opt, fppr_output_as_html)
when "bioworks"
spec_id = SpecID.new(file)
false_regex = flag_to_regex(opt.f, opt.prefix)
bioworks_output(spec_id, outfn, file, false_regex, fppr_output_as_html)
else
abort "filetype for #{file} not recognized!"
end
end
end # method create_from_command_line
def create_precision_argv(file, opt)
# include only those options specific
new_argv = [file]
if opt.prefix ; new_argv << '--prefix' end
if opt.f ; new_argv << '-f' << opt.f end
if opt.o ; new_argv << '-o' << opt.o end
new_argv
end
end # ProteinSummary
##################################################################
# MAIN
##################################################################