require 'ms/ident/pepxml'
require 'ms/ident/pepxml/spectrum_query'
require 'ms/ident/pepxml/search_result'
require 'ms/ident/pepxml/search_hit'
#require 'ms/msrun'
require 'ms/sequest/srf'
require 'ms/sequest/pepxml'
class Ms::Sequest::Srf
module Pepxml
# A hash with the following *symbol* keys may be set:
#
# Run Info
# *:ms_model*:: nil
# *:ms_ionization*:: 'ESI'
# *:ms_detector*:: 'UNKNOWN'
# *:ms_mass_analyzer*:: nil - typically extracted from the srf file and matched with ModelToMsAnalyzer
# *:ms_manufacturer*:: 'Thermo'
#
# Raw data
# *:mz_dir*:: nil - path to the mz[X]ML directory, defaults to the directory the srf file is contained in. mz[X]ML data must be available to embed retention times
# *:raw_data*:: \['.mzML', '.mzXML'\] - preferred extension for raw data
#
# Database
# *:db_seq_type*:: 'AA' - AA or NA
# *:db_dir*:: nil - the directory the fasta file used for the search is housed in. A valid pepxml file must point to a valid fasta file!
# *:db_residue_size*:: nil - An integer for the number of residues in the database. if true, calculates the size of the fasta database.
# *:db_name:: nil
# *:db_orig_url*:: nil
# *:db_release_date*:: nil
# *:db_release_id*:: nil
#
# Search Hits
# *:num_hits*:: 1 - the top number of hits to include
# *:retention_times*:: false - include retention times in the file (requires mz_dir to be set)
# *:deltacn_orig*:: false - when true, the original SEQUEST deltacn values are used. If false, Bioworks deltacn values are used which are derived by taking the original deltacn of the following hit. This gives the top ranking hit an informative deltacn but makes the deltacn meaningless for other hits.
#
# *:pepxml_version*:: Ms::Ident::Pepxml::DEFAULT_PEPXML_VERSION, - Integer to set the pepxml version. The converter and xml output attempts to produce xml specific to the version.
# *:verbose*:: true - set to false to quiet warnings
DEFAULT_OPTIONS = {
:ms_model => nil,
:ms_ionization => 'ESI',
:ms_detector => 'UNKNOWN',
:ms_mass_analyzer => nil,
:ms_manufacturer => 'Thermo',
:mz_dir => nil,
#:raw_data => [".mzXML", '.mzML'],
:raw_data => ['.mzML', '.mzXML'],
:db_seq_type => 'AA',
:db_dir => nil,
:db_residue_size => nil,
:db_name => nil,
:db_orig_url => nil,
:db_release_date => nil,
:db_release_id => nil,
:num_hits => 1,
:retention_times => false,
:deltacn_orig => false,
:pepxml_version => Ms::Ident::Pepxml::DEFAULT_PEPXML_VERSION,
:verbose => true,
}
# An array of regexp to string pairs. The regexps are matched against the
# model (srf.header.model) and the corresponding string will be used as
# the mass analyzer.
#
# /Orbitrap/:: 'Orbitrap'
# /LCQ Deca XP/:: 'Ion Trap'
# /LTQ/:: 'Ion Trap'
# /\w+/:: 'UNKNOWN'
ModelToMsAnalyzer = [
[/Orbitrap/, 'Orbitrap'],
[/LCQ Deca XP/, 'Ion Trap'],
[/LTQ/, 'Ion Trap'],
[/\w+/, 'UNKNOWN'],
]
# returns an Ms::Ident::Pepxml object. See that object for creating an
# xml string or writing to file.
def to_pepxml(opts={})
opt = DEFAULT_OPTIONS.merge(opts)
srf = self
# with newer pepxml version these are not required anymore
hidden_opts = {
# format of file storing the runner up peptides (if not present in
# pepXML) this was made optional after version 19
:out_data_type => "out", ## may be srf??
# runner up search hit data type extension (e.g. .tgz)
:out_data => ".srf",
}
opt.merge!(hidden_opts)
params = srf.params
header = srf.header
opt[:ms_model] ||= srf.header.model
unless opt[:ms_mass_analyzer]
ModelToMsAnalyzer.each do |regexp, val|
if opt[:ms_model].match(regexp)
opt[:ms_mass_analyzer] = val
break
end
end
end
# get the database name
db_filename = header.db_filename.sub(/\.hdr$/, '')
if opt[:db_dir]
db_filename = File.join(opt[:db_dir], db_filename.split(/[\/\\]+/).last)
end
if File.exist?(db_filename)
db_filename = File.expand_path(db_filename)
else
msg = ["!!! WARNING !!!"]
msg << "!!! Can't find database: #{db_filename}"
msg << "!!! pepxml *requires* that the db path be valid"
msg << "!!! make sure 1) the fasta file is available on this system"
msg << "!!! 2) you've specified a valid directory with --db-dir (or :db_dir)"
puts msg.join("\n") if opt[:verbose]
end
modifications_obj = Ms::Sequest::Pepxml::Modifications.new(params, srf.header.modifications)
mass_index = params.mass_index(:precursor)
h_plus = mass_index['h+']
opt[:mz_dir] ||= srf.resident_dir
found_ext = opt[:raw_data].find do |raw_data|
Dir[File.join(opt[:mz_dir], srf.base_name_noext + raw_data)].first
end
opt[:raw_data] = [found_ext] if found_ext
scan_to_ret_time =
if opt[:retention_times]
raise NotImplementedError, "will implement shortly"
#mz_file = Dir[File.join(opt[:mz_dir], srf.base_name_noext + opt[:raw_data].first)].first
#if mz_file
# Ms::Msrun.scans_to_times(mz_file)
#else
# warn "turning retention_times off since no valid mz[X]ML file was found!!!"
# opt[:retention_times] = false
# nil
#end
end
summary_xml_filename = srf.base_name_noext + '.xml'
pepxml = Ms::Ident::Pepxml.new do |msms_pipeline_analysis|
msms_pipeline_analysis.merge!(:summary_xml => summary_xml_filename, :pepxml_version => opt[:pepxml_version]) do |msms_run_summary|
# prep the sample enzyme and search_summary
msms_run_summary.merge!(
:base_name => File.join(opt[:mz_dir], srf.base_name_noext),
:ms_manufacturer => opt[:ms_manufacturer],
:ms_model => opt[:ms_model],
:ms_ionization => opt[:ms_ionization],
:ms_mass_analyzer => opt[:ms_mass_analyzer],
:ms_detector => opt[:ms_detector],
:raw_data => opt[:raw_data].first,
:raw_data_type => opt[:raw_data].first,
) do |sample_enzyme, search_summary, spectrum_queries|
sample_enzyme.merge!(params.sample_enzyme_hash)
sample_enzyme.name = opt[:enzyme] if opt[:enzyme]
search_summary.merge!(
:base_name=> srf.resident_dir + '/' + srf.base_name_noext,
:search_engine => 'SEQUEST',
:precursor_mass_type => params.precursor_mass_type,
:fragment_mass_type => params.fragment_mass_type,
:out_data_type => opt[:out_data_type],
:out_data => opt[:out_data],
) do |search_database, enzymatic_search_constraint, modifications_ar, parameters_hash|
search_database.merge!(:local_path => db_filename, :seq_type => opt[:db_seq_type], :database_name => opt[:db_name], :orig_database_url => opt[:db_orig_url], :database_release_date => opt[:db_release_date], :database_release_identifier => opt[:db_release_id])
case opt[:db_residue_size]
when Integer
search_database.size_of_residues = opt[:db_residue_size]
when true
search_database.set_size_of_residues!
end
enzymatic_search_constraint.merge!(
:enzyme => opt[:enzyme] ? opt[:enzyme] : params.enzyme,
:max_num_internal_cleavages => params.max_num_internal_cleavages,
:min_number_termini => params.min_number_termini,
)
modifications_ar.replace(modifications_obj.modifications)
parameters_hash.merge!(params.opts)
end
spec_queries = srf.dta_files.zip(srf.out_files, index).map do |dta_file,out_file,i_ar|
precursor_neutral_mass = dta_file.mh - h_plus
search_hits = out_file.hits[0,opt[:num_hits]].each_with_index.map do |pep,i|
(prev_aa, pure_aaseq, next_aa) = Ms::Ident::Peptide.prepare_sequence(pep.sequence)
calc_neutral_pep_mass = pep.mh - h_plus
sh = Ms::Ident::Pepxml::SearchHit.new(
:hit_rank => i+1,
:peptide => pure_aaseq,
:peptide_prev_aa => prev_aa,
:peptide_next_aa => next_aa,
:protein => pep.proteins.first.reference.split(' ')[0],
:num_tot_proteins => pep.proteins.size,
:num_matched_ions => pep.ions_matched,
:tot_num_ions => pep.ions_total,
:calc_neutral_pep_mass => calc_neutral_pep_mass,
:massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
:num_tol_term => sample_enzyme.num_tol_term(prev_aa, pure_aaseq, next_aa),
:num_missed_cleavages => sample_enzyme.num_missed_cleavages(pure_aaseq),
:modification_info => modifications_obj.modification_info(Ms::Ident::Peptide.split_sequence(pep.sequence)[1])
) do |search_scores|
if opt[:deltacn_orig]
deltacn = pep.deltacn_orig
deltacnstar = nil
else
deltacn = pep.deltacn
deltacn = 1.0 if deltacn == 1.1
deltacnstar = out_file.hits[i+1].nil? ? '1' : '0'
end
search_scores.merge!( :xcorr => pep.xcorr, :deltacn => deltacn,
:spscore => pep.sp, :sprank => pep.rsp)
search_scores[:deltacnstar] = deltacnstar if deltacnstar
end
end
sr = Ms::Ident::Pepxml::SearchResult.new(:search_hits => search_hits)
ret_time =
if opt[:retention_times]
(first_scan, last_scan) = i_ar[0,2]
if first_scan==last_scan
scan_to_ret_time[i_ar[0]]
else
times = ((i_ar[0])..(i_ar[1])).step(1).map {|i| scan_to_ret_time[i] }.compact
times.inject(&:+) / times.size.to_f
end
end
Ms::Ident::Pepxml::SpectrumQuery.new(
:spectrum => [srf.base_name_noext, *i_ar].join('.'), :start_scan => i_ar[0], :end_scan => i_ar[1],
:precursor_neutral_mass => dta_file.mh - h_plus, :assumed_charge => i_ar[2],
:retention_time_sec => ret_time,
:search_results => [sr],
)
end
spectrum_queries.replace(spec_queries)
end
end
end
pepxml
end # to_pepxml
end # Srf::Pepxml
include Pepxml
end # Srf
require 'trollop'
module Ms::Sequest::Srf::Pepxml
def self.commandline(argv, progname=$0)
opts = Trollop::Parser.new do
banner %Q{
usage: #{progname} [OPTIONS] .srf ...
output: .xml ...
}.lines.map(&:lstrip).join
text ""
text "major options:"
opt :db_dir, "The dir holding the DB if different than in Srf. (pepxml requires a valid database path)", :type => :string
opt :enzyme, "overide the enzyme name embedded in the params file", :type => :string
opt :mz_dir, "directory holding mz[X]ML files (defaults to the folder holding the srf file)", :type => :string
opt :retention_times, "include retention times (requires mz-dir)"
opt :deltacn_orig, "use original deltacn values created by SEQUEST. By default, the top hit gets the next hit's original deltacn."
opt :no_filter, "do not filter hits by peptide_mass_tolerance (per sequest params)"
opt :num_hits, "include N top hits", :default => 1
opt :outdirs, "list of output directories", :type => :strings
opt :quiet, "do not print warnings, etc."
text ""
text "minor options:"
opt :pepxml_version, 'schema version number to use', :default => Ms::Ident::Pepxml::DEFAULT_PEPXML_VERSION
opt :ms_model, 'mass spectrometer model', :type => :string
opt :ms_ionization, 'type of ms ionization', :default => 'ESI'
opt :ms_detector, 'ms detector', :default => 'UNKNOWN'
opt :ms_mass_analyzer, 'ms mass analyzer', :type => :string
opt :ms_manufacturer, 'ms manufacturer', :default => 'Thermo'
opt :raw_data, 'preferred extension for raw data', :default => '.mzML'
opt :db_seq_type, "'AA' or 'NA'", :default => 'AA'
opt :db_residue_size, 'calculate the size of the fasta file'
opt :db_name, 'the database name', :type => :string
opt :db_orig_url, 'original database url', :type => :string
opt :db_release_date, 'database release date', :type => :string
opt :db_release_id, 'the database release identifier', :type => :string
end
opt = opts.parse argv
opts.educate && exit if argv.empty?
Trollop.die :outdirs, "outdirs must be same size as number of input files" if opt.outdirs && opt.outdirs.size != argv.size
opt[:filter] = !opt.delete(:no_filter)
opt[:outdirs] ||= []
opt[:raw_data] = [opt[:raw_data]] if opt[:raw_data]
opt[:verbose] = !opt[:quiet]
argv.zip(opt.delete(:outdirs)) do |srf_file,outdir|
outdir ||= File.dirname(srf_file)
srf = Ms::Sequest::Srf.new(srf_file, :link_protein_hits => false, :filter_by_precursor_mass_tolerance => opt.delete(:filter))
pepxml = srf.to_pepxml(opt)
outfile = pepxml.to_xml(outdir)
puts "wrote file: #{outfile}" if opt[:verbose]
end
end
end