require 'sample_enzyme'
require 'spec/mzxml/parser'
require 'hash_by'
require 'set_from_hash'
require 'spec_id/bioworks'
require 'instance_var_set_from_hash'
require 'spec/msrun'
require 'spec_id/srf'
class Numeric
# returns a string with a + or - on the front
def to_plus_minus_string
if self >= 0
'+' << self.to_s
else
'-' << self.to_s
end
end
end
##########################################
# NEED TO ADD MODIFICATIONS and generally verify pepxml creation!!! :
# HERE's an excerpt from an example file from tpp 2.9.2 that I'm going to follow:
=begin
=end
# and a guy with modifications:
=begin
=end
# sequest.params option:
# diff_search_options = 15.994910 M 0.000000 C 0.000000 M 0.000000 X 0.000000 T 0.000000 Y
# permanent mods are at the bottom: ...
# add_A_Alanine = 0.0000 ; added to A
# add_S_Serine = 0.0000 ; added to S
# add_P_Proline = 0.0000 ; added to P
# add_V_Valine = 0.0000 ; added to V
# add_T_Threonine = 0.0000 ; added to T
# ...
module SpecID::Sequest; end
class SpecID::Sequest::PepXML; end
class SpecID::Sequest::PepXML::MSMSPipelineAnalysis
include SpecIDXML
# Version 1.2.3
attr_writer :date
attr_writer :xmlns, :xmlns_xsi, :xsi_schemaLocation
attr_accessor :summary_xml
# Version 2.3.4
attr_writer :xmlns, :xmlns_xsi, :xsi_schema_location
attr_accessor :pepxml_version
attr_accessor :msms_run_summary
# if block given, sets msms_run_summary to block
def initialize(hash=nil)
@xmlns = nil
@xmlns_xsi = nil
@xsi_schema_location = nil
if hash
self.set_from_hash(hash)
end
if block_given?
@msms_run_summary = yield
end
end
# if no date string given, then it will set to Time.now
def date
if @date ; @date
else
case SpecID::Sequest::PepXML.pepxml_version
when 18 ; tarr = Time.now.to_a ; tarr[3..5].reverse.join('-') + "T#{tarr[0..2].reverse.join(':')}"
when 0 ; Time.new.to_s
end
end
end
def xmlns
if @xmlns ; @xmlns
else ; "http://regis-web.systemsbiology.net/pepXML"
end
end
def xmlns_xsi
if @xmlns_xsi ; @xmlns_xsi
else ; "http://www.w3.org/2001/XMLSchema-instance"
end
end
def xsi_schema_location
if @xsi_schema_location ; @xsi_schema_location
else ; "http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd"
end
end
def to_pepxml
case SpecID::Sequest::PepXML.pepxml_version
when 0
element_xml(:msms_pipeline_analysis, [:date, :summary_xml]) do
@msms_run_summary.to_pepxml
end
when 18
element_xml_and_att_string(:msms_pipeline_analysis, "date=\"#{date}\" xmlns=\"#{xmlns}\" xmlns:xsi=\"#{xmlns_xsi}\" xsi:schemaLocation=\"#{xsi_schema_location}\" summary_xml=\"#{summary_xml}\"") do
@msms_run_summary.to_pepxml
end
else
abort "Don't know how to deal with version: #{SpecID::Sequest::PepXML.pepxml_version}"
end
end
end
class SpecID::Sequest::PepXML::MSMSRunSummary
include SpecIDXML
# the version of TPP you are using (determines xml output)
# The name of the pep xml file (without extension) (but this is a long
# filename!!!)
attr_accessor :base_name
# The name of the mass spec manufacturer
attr_accessor :ms_manufacturer
attr_accessor :ms_model
attr_accessor :ms_mass_analyzer
attr_accessor :ms_detector
attr_accessor :raw_data_type
attr_accessor :raw_data
attr_accessor :ms_ionization
attr_accessor :pepxml_version
# A SampleEnzyme object (responds to: name, cut, no_cut, sense)
attr_accessor :sample_enzyme
# A SearchSummary object
attr_accessor :search_summary
# An array of spectrum_queries
attr_accessor :spectrum_queries
# takes a hash of name, value pairs
# if block given, spectrum_queries (should be array of spectrum queries) is
# set to the return value of the block
def initialize(hash=nil)
@spectrum_queries = []
if hash
instance_var_set_from_hash(hash)
end
if block_given? ; @spectrum_queries = yield end
end
def to_pepxml
case SpecID::Sequest::PepXML.pepxml_version
when 18
element_xml_and_att_string(:msms_run_summary, "base_name=\"#{base_name}\" msManufacturer=\"#{ms_manufacturer}\" msModel=\"#{ms_model}\" msIonization=\"#{ms_ionization}\" msMassAnalyzer=\"#{ms_mass_analyzer}\" msDetector=\"#{ms_detector}\" raw_data_type=\"#{raw_data_type}\" raw_data=\"#{raw_data}\"") do
sample_enzyme.to_pepxml +
search_summary.to_pepxml +
spectrum_queries.map {|sq| sq.to_pepxml }.join
end
when 0
# element_xml(:msms_run_summary, [:base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme]) do
# element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type]) do
# [
# @params.short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]),
# @params.short_element_xml(:sequence_search_constraint, [:sequence]),
# @params.short_element_xml(:sequence_search_constraint, [:sequence]),
# @params.pepxml_parameters(:peptide_mass_tol, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter)
# ].join("\n")
# end + "\n" +
# @spectrum_queries.collect {|result| result.to_pepxml }.join("\n")
# end
end
end
end
class SpecID::Sequest::PepXML
include SpecIDXML
## CREATE a default version for the entire class
class << self
attr_accessor :pepxml_version
end
DEF_VERSION = 18
self.pepxml_version = DEF_VERSION # default version
attr_accessor :pepxml_version, :msms_pipeline_analysis
## the full path name (no extension)
attr_accessor :base_name
attr_accessor :h_plus
attr_accessor :avg_parent
#attr_accessor :spectrum_queries, :params, :base_name, :search_engine, :database, :raw_data_type, :raw_data, :out_data_type, :out_data, :sample_enzyme, :pepxml_version
# returns an array of spectrum queries
def spectrum_queries
msms_pipeline_analysis.msms_run_summary.spectrum_queries
end
# msms_pipeline_analysis is set to the result of the yielded block
# and set_mono_or_avg is called with params if given
def initialize(pepxml_version=DEF_VERSION, sequest_params_obj=nil)
self.class.pepxml_version = pepxml_version
if sequest_params_obj
set_mono_or_avg(sequest_params_obj)
end
if block_given?
@msms_pipeline_analysis = yield
@base_name = @msms_pipeline_analysis.msms_run_summary.base_name
end
end
# sets @h_plus and @avg_parent from the sequest params object
def set_mono_or_avg(sequest_params_obj)
case sequest_params_obj.precursor_mass_type
when "monoisotopic" ; @avg_parent = false
else ; @avg_parent = true
end
case @avg_parent
when true ; @h_plus = SpecID::AVG[:h_plus]
when false ; @h_plus = SpecID::MONO[:h_plus]
end
end
def date
Time.new.to_s
end
def xml_version
'' + "\n"
end
# for pepxml_version == 0
def doctype
'' + "\n"
end
def style_sheet
case self.class.pepxml_version
when 0
'' + "\n"
when 18
''
end
end
def header
case self.class.pepxml_version
when 0 ; xml_version + doctype + style_sheet
when 18 ; xml_version + style_sheet
end
end
# updates the private attrs _num_prots and _first_prot on bioworks pep
# objects. Ideally, we'd like these attributes to reside elsewhere, but for
# memory concerns, this is best for now.
def self._prot_num_and_first_prot_by_pep(pep_array)
pep_array.hash_by(:sequence).each do |seq, pep_arr|
prots = pep_arr.collect { |pep| pep.prot }
prots.uniq!
_size = prots.size
pep_arr.each do |pep|
pep._num_prots = _size.to_s
pep._first_prot = prots.first
end
end
end
Default_Options = {
:out_path => nil,
:backup_db_path => '/project/marcotte/marcotte/ms/database',
# a PepXML option
:pepxml_version => DEF_VERSION,
## MSMSRunSummary options:
# string must be recognized in sample_enzyme.rb
# or create your own SampleEnzyme object
:sample_enzyme => 'trypsin',
:ms_manufacturer => 'ThermoFinnigan',
:ms_model => 'LCQ Deca XP',
:ms_ionization => 'ESI',
:ms_mass_analyzer => 'Ion Trap',
:ms_detector => 'UNKNOWN',
:raw_data_type => "raw",
:raw_data => ".mzXML", ## even if you don't have it?
## SearchSummary options:
:out_data_type => "out", ## may be srf?? don't think pepxml recognizes this yet
:out_data => ".tgz" ## may be srf??
}
# will dynamically set :ms_model and :ms_mass_analyzer from srf info
# (ignoring defaults or anything passed in) for LTQ Orbitrap
# and LCQ Deca XP
# See SRF::Sequest::PepXML::Default_Options hash for defaults
# unless given, the out_path will be given as the path of the srf_file
def self.new_from_srf(srf_file, opts={})
opts = Default_Options.merge(opts)
## set the outpath
out_path = opts.delete(:out_path)
unless out_path
out_path = File.dirname(srf_file)
end
## read the srf file
srf = SRF.new(srf_file)
params = srf.params
## check to see if we need backup_db
backup_db_path = opts.delete(:backup_db_path)
unless File.exist? params.database
params.database_path = backup_db_path
end
#######################################################################
# PREPARE THE OPTIONS:
#######################################################################
## remove items from the options hash that don't belong to
ppxml_version = opts.delete(:pepxml_version)
out_data_type = opts.delete(:out_data_type)
out_data = opts.delete(:out_data)
## Extract meta info from srf
bn_noext = base_name_noext(srf.header.raw_filename)
opts[:ms_model] = srf.header.model
case opts[:ms_model]
when /Orbitrap/
opts[:ms_mass_analyzer] = 'Orbitrap'
when /LCQ Deca XP/
opts[:ms_mass_analyzer] = 'Ion Trap'
end
## Create the base name
full_base_name_no_ext = make_base_name( File.expand_path(out_path), bn_noext)
opts[:base_name] = full_base_name_no_ext
## Create the search summary:
search_summary_options = {
:search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params),
:base_name => full_base_name_no_ext,
:out_data_type => out_data_type,
:out_data => out_data
}
opts[:search_summary] = SpecID::Sequest::PepXML::SearchSummary.new( params, search_summary_options)
## Create the SampleEnzyme object if necessary
unless opts[:sample_enzyme].is_a? SampleEnzyme
opts[:sample_enzyme] = SampleEnzyme.new(opts[:sample_enzyme])
end
## Create the pepxml obj
pepxml_obj = SpecID::Sequest::PepXML.new(ppxml_version, params)
## name some common variables we'll need
h_plus = pepxml_obj.h_plus
avg_parent = pepxml_obj.avg_parent
#######################################################################
# CREATE the spectrum_queries_ar
#######################################################################
srf_index = srf.index
out_files = srf.out_files
spectrum_queries_arr = Array.new(srf.dta_files.size)
files_with_hits_index = 0 ## will end up being 1 indexed
srf.dta_files.each_with_index do |dta_file,i|
next if out_files[i].num_hits == 0
files_with_hits_index += 1
# Sort the hits
hits = out_files[i].hits
arr = hits.sort_by{|v| v.xcorr }
# Get proper deltacn and deltacnstar
# Prophet deltacn is not the same as the native Sequest deltacn
# It is the deltacn of the second best hit!
top_hit = arr.pop
second_hit = arr.last
if second_hit
top_hit[1] = second_hit[1]
deltacnstar = '0'
else
top_hit[1] = '1.0'
deltacnstar = '1'
end
## mass calculations:
precursor_neutral_mass = dta_file.mh - h_plus
calc_neutral_pep_mass = top_hit[0] - h_plus
massdiff = precursor_neutral_mass - calc_neutral_pep_mass
if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
else ; massdiff = massdiff.to_s end
(start_scan, end_scan, charge) = srf_index[i]
sq_hash = {
:spectrum => [bn_noext, start_scan, end_scan, charge].join('.'),
:start_scan => start_scan,
:end_scan => end_scan,
:precursor_neutral_mass => precursor_neutral_mass,
:assumed_charge => charge,
:pepxml_version => ppxml_version,
:index => files_with_hits_index,
}
# NEED TO MODIFY SPLIT SEQUENCE TO DO MODS!
## THIS IS ALL INNER LOOP, so we make every effort at speed here:
(prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.prepare_sequence(top_hit[8])
# ind_keys = {:mh => 0, :deltacn => 1, :sp => 2, :xcorr => 3, :id => 4, :rsp => 5, :ions_matched => 6, :ions_total => 7, :peptide => 8, :reference => 9 }
sh_hash = {
:hit_rank => "1",
:peptide => pepseq,
:peptide_prev_aa => prevaa,
:peptide_next_aa => nextaa,
:protein => top_hit[9].split(" ").first,
:num_tot_proteins => top_hit[10],
:num_matched_ions => top_hit[6],
:tot_num_ions => top_hit[7],
:calc_neutral_pep_mass => calc_neutral_pep_mass,
:massdiff => massdiff,
:num_tol_term => SpecID::Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_hit[8]),
:num_missed_cleavages => SpecID::Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_hit[8]),
:is_rejected => '0',
# These are search score attributes:
:xcorr => top_hit[3],
:deltacn => top_hit[1],
:deltacnstar => deltacnstar,
:spscore => top_hit[2],
:sprank => top_hit[5],
}
spectrum_queries_arr[files_with_hits_index] = SpecID::Sequest::PepXML::SpectrumQuery.new(sq_hash) do
search_result = SpecID::Sequest::PepXML::SearchResult.new do
[ SpecID::Sequest::PepXML::SearchHit.new(sh_hash) ] # there can be multiple hits
end # SearchResult
[search_result] # can be multiple
end
end
spectrum_queries_arr.compact!
#######################################################################
# ADD the pipeline analysis
#######################################################################
pipeline = SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=> bn_noext +'.xml'}) do
SpecID::Sequest::PepXML::MSMSRunSummary.new(opts) { spectrum_queries_arr }
end
pepxml_obj.msms_pipeline_analysis = pipeline
pepxml_obj.base_name = pipeline.msms_run_summary.base_name
pepxml_obj
end
# Takes bioworks 3.2/3.3 xml output (with no filters)
# Returns a list of PepXML objects
# msdata = path to mzXML files (or .timeIndex files) (or @TODO: path to sqt file(s))
# params = sequest.params file
# bioworks = bioworks.xml exported multi-consensus view file
# pepxml_version = 0 for tpp 1.2.3
# pepxml_version = 18 for tpp 2.8.2, 2.8.3, 2.9.2
def self.set_from_bioworks(params, bioworks, msdata, out_path, pepxml_version=18, sample_enzyme='trypsin', ms_manufacturer='ThermoFinnigan', ms_model='LCQ Deca XP Plus', ms_ionization='ESI', ms_mass_analyzer='Ion Trap', ms_detector='UNKNOWN', raw_data_type="raw", raw_data=".mzXML", out_data_type="out", out_data=".tgz")
supported_versions = [0,18]
unless supported_versions.include?(pepxml_version)
abort "pepxml_version: #{pepxml_version} not currently supported. Current support is for versions #{supported_versions.join(', ')}"
end
## Turn params and bioworks_obj into objects if necessary:
# Params:
if params.class == SpecID::Sequest::Params # OK!
elsif params.class == String ; params = SpecID::Sequest::Params.new(params)
else ; abort "Don't recognize #{params} as object or string!"
end
# Bioworks:
if bioworks.class == SpecID::Bioworks # OK!
elsif bioworks.class == String ; bioworks = SpecID.new(bioworks)
else ; abort "Don't recognize #{bioworks} as object or string!"
end
#puts "bioworks.peps.size: #{bioworks.peps.size}"; #puts "bioworks.prots.size: #{bioworks.prots.size}"; #puts "Bioworks.version: #{bioworks.version}"
## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
## bioworks
#bioworks.peps.each { |pep| if pep.class != SpecID::Bioworks::Pep ; puts "trying to pass as pep: "; p pep; abort "NOT a pep!" end }
## Start
split_bio_objs = []
## Create a hash by pep object containing num_tot_proteins
## This is only valid if all hits are present (no previous thresholding)
self._prot_num_and_first_prot_by_pep(bioworks.peps)
## (num_prots_by_pep, prot_by_pep) =
#num_prots_by_pep.each do |k,v| puts "k: #{k} v: #{v}\n"; break end ; prot_by_pep.each do |k,v| puts "k: #{k} v: #{v}" ; break end ; abort "HERE"
## Create a hash of spectrum_query arrays by filename (this very big block):
spectrum_queries_by_base_name = {}
pepxml_objs_by_base_name = {}
# Hash by the filenames to split into filenames:
bioworks.peps.hash_by(:base_name).each do |base_name, pep_arr|
prec_mz_arr = nil
case x = bioworks.version
when /3.2/
calc_prec_by = :prec_mz_arr
# get the precursor_mz array for this filename
inner__full_base_name_no_ext = File.join(msdata, base_name)
prec_mz_arr = Spec::MSRun.precursor_mz_by_scan(inner__full_base_name_no_ext)
when /3.3/
calc_prec_by = :deltamass
else
abort "invalid BioworksBrowser version: #{x}"
end
pepxml_obj = SpecID::Sequest::PepXML.new(pepxml_version, params)
pepxml_objs_by_base_name[base_name] = pepxml_obj
spectrum_queries_ar = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
# Sort_by_rank and take the top hit (to mimick out2summary):
arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
top_pep = arr.pop
second_hit = arr.last # needed for deltacnstar
case calc_prec_by
when :prec_mz_arr
precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, pepxml_obj.avg_parent)
when :deltamass
precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, pepxml_obj.avg_parent)
end
calc_neutral_pep_mass = (top_pep.mass.to_f - pepxml_obj.h_plus)
massdiff = precursor_neutral_mass - calc_neutral_pep_mass
if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
else ; massdiff = massdiff.to_s end #already has a -
# deltacn & star:
# (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
if second_hit
#top_pep.deltacn = second_hit.deltacn
deltacnstar = '0'
else
top_pep.deltacn = '1.0'
deltacnstar = '1'
end
# Create the nested structure of queries{results{hits}}
# (Ruby's blocks work beautifully for things like this)
spec_query = SpecID::Sequest::PepXML::SpectrumQuery.new({
:spectrum => [top_pep.base_name, top_pep.first_scan, top_pep.last_scan, top_pep.charge].join("."),
:start_scan => top_pep.first_scan,
:end_scan => top_pep.last_scan,
:precursor_neutral_mass => precursor_neutral_mass.to_s,
:assumed_charge => top_pep.charge,
:pepxml_version => pepxml_version,
}) do
search_result = SpecID::Sequest::PepXML::SearchResult.new do
## Calculate some interdependent values;
# NOTE: the bioworks mass is really M+H if two or more scans went
# into the search_hit; calc_neutral_pep_mass is simply the avg of
# precursor masses adjusted to be neutral
(prevaa, pepseq, nextaa) = SpecID::Sequest::PepXML::SearchHit.prepare_sequence(top_pep.sequence)
(num_matched_ions, tot_num_ions) = SpecID::Sequest::PepXML::SearchHit.split_ions(top_pep.ions)
search_hit = SpecID::Sequest::PepXML::SearchHit.new({
:hit_rank => "1",
:peptide => pepseq,
:peptide_prev_aa => prevaa,
:peptide_next_aa => nextaa,
:protein => top_pep._first_prot.reference.split(" ").first,
:num_tot_proteins => top_pep._num_prots,
:num_matched_ions => num_matched_ions,
:tot_num_ions => tot_num_ions,
:calc_neutral_pep_mass => calc_neutral_pep_mass.to_s,
:massdiff => massdiff,
:num_tol_term => SpecID::Sequest::PepXML::SearchHit.calc_num_tol_term(params, top_pep.sequence).to_s,
:num_missed_cleavages => SpecID::Sequest::PepXML::SearchHit.calc_num_missed_cleavages(params, top_pep.sequence).to_s,
:is_rejected => "0",
# These are search score attributes:
:xcorr => top_pep.xcorr,
:deltacn => top_pep.deltacn,
:deltacnstar => deltacnstar,
:spscore => top_pep.sp,
:sprank => top_pep.rsp,
})
[search_hit] # there can be multiple search hits
end # SearchResult
[search_result] # can be multiple search_results
end # SpectrumQuery
end # Collects the spectrum queries
# create an index by spectrum as results end up typically in out2summary
# (I really dislike this order, however)
spectrum_queries_ar = spectrum_queries_ar.sort_by {|pep| pep.spectrum }
spectrum_queries_ar.each_with_index {|res,index| res.index = "#{index + 1}" }
spectrum_queries_by_base_name[base_name] = spectrum_queries_ar
end
modifications_string = bioworks.modifications
spectrum_queries_by_base_name.collect do |base_name, spectrum_queries_ar|
case pepxml_version
when 18
pipeline = SpecID::Sequest::PepXML::MSMSPipelineAnalysis.new({:date=>nil,:summary_xml=>base_name+'.xml'}) do
full_base_name_no_ext = self.make_base_name( File.expand_path(out_path), base_name)
SpecID::Sequest::PepXML::MSMSRunSummary.new({
:base_name => full_base_name_no_ext,
:ms_manufacturer => ms_manufacturer,
:ms_model => ms_model,
:ms_ionization => ms_ionization,
:ms_mass_analyzer => ms_mass_analyzer,
:ms_detector => ms_detector,
:raw_data_type => raw_data_type,
:raw_data => raw_data,
:sample_enzyme => SampleEnzyme.new(sample_enzyme),
:search_summary => SpecID::Sequest::PepXML::SearchSummary.new(params, modifications_string, {:search_database => SpecID::Sequest::PepXML::SearchDatabase.new(params), :base_name => full_base_name_no_ext, :out_data_type => out_data_type, :out_data => out_data}),
}) { spectrum_queries_ar }
end
pepxml_obj = pepxml_objs_by_base_name[base_name]
pepxml_obj.msms_pipeline_analysis = pipeline
pepxml_obj.base_name = pipeline.msms_run_summary.base_name
pepxml_obj
when 0
## @TODO: NEED TO REVAMP THIS:
# SpecID::Sequest::PepXML.new(pepxml_version).set_from_hash({
# :params => params,
# :search_results => spectrum_queries_arr,
# :base_name => self.make_base_name( File.expand_path(out_path), base_name),
# :search_engine => params.search_engine,
# :database => params.database,
# :raw_data_type => "mzXML",
# :raw_data => ".mzXML",
# :out_data_type => "out",
# :out_data => ".tgz",
# :sample_enzyme => params.enzyme,
# })
end
end # collects the pepxml objects
end
def summary_xml
base_name + ".xml"
end
def precursor_mass_type
@params.precursor_mass_type
end
def fragment_mass_type
@params.fragment_mass_type
end
# combines filename in a manner consistent with the path
def self.make_base_name(path, filename)
sep = "/"
if path.split("/").size < path.split("\\").size
sep = "\\"
end
if path.split("").last == sep
return path + File.basename(filename)
else
return path + sep + File.basename(filename)
end
end
# outputs pepxml, (to file if given)
def to_pepxml(file=nil)
string = header
string << @msms_pipeline_analysis.to_pepxml
if file
File.open(file, "w") do |fh| fh.print string end
end
string
end
# given any kind of filename (from windows or whatever)
# returns the base of the filename with no file extension
def self.base_name_noext(file)
file.gsub!("\\", '/')
File.basename(file).sub(/\.[\w^\.]+$/, '')
end
end # PepXML
##
# In the future, this guy should accept any version of bioworks params file
# and spit out any param queried.
class SpecID::Sequest::Params
include SpecIDXML
# current attributes supported are:
# bioworks 3.2:
@@param_re = / = ?/o
@@param_two_split = ';'
# opts are the general options
# mods are the weights added to amino acids
attr_accessor :opts, :mods
# all keys and values stored as strings!
def initialize(file=nil)
if file
parse(file)
end
end
# returns hash of params for continuous lines of non-whitespace
def grab_params(fh)
hash = {}
while line = fh.gets
if line =~ /[^\s]/
one,two = line.split @@param_re
two,comment = two.split @@param_two_split
hash[one] = two.rstrip
# it is necessary to add this break so that params files inside srf
# files can be read. This will terminate the reading at the end of
# the file even though there are more lines
if line =~ /added to U/ || line =~ /digest_mass_range/## Will only work on bioworks 3.2 & 3.3 (bioworks 3.1 last line => Elastase/Tryp...)
break
end
if line =~ /digest_mass_range/ # there is no space in the srf params files
break
end
else
break
end
end
hash
end
# returns self
def parse_handle(fh)
sequest_line = fh.gets #[SEQUEST]
@opts = grab_params(fh)
@opts["search_engine"] = "SEQUEST"
@mods = grab_params(fh)
## this gets rid of the .hdr postfix on indexed databases
@opts["first_database_name"] = @opts["first_database_name"].sub(/\.hdr$/, '')
self
end
## parses file
## and drops the .hdr behind indexed fasta files
## returns self
def parse(file)
File.open(file) do |fh|
parse_handle(fh)
end
self
end
# returns( split_after, except_before)
def enzyme_specificity
if version == "3.2"
arr = enzyme_info.split(/\s+/)[3,2]
arr.collect! do |str|
if str && str.class == String ; str
else ; ""
end
end
return *arr
end
end
# Returns the version of the sequest.params file
# Returns String "3.2" if contains "enyzme_info"
# Returns String "3.1" if contains "enzyme_number"
def version
if @opts["enzyme_info"] ; return "3.2"
elsif @opts["enzyme_number"] ; return "3.1"
end
end
####################################################
# TO PEPXML
####################################################
# In some ways, this is merely translating to the older Bioworks
# sequest.params files
# I'm not sure if this is the right mapping for sequence_search_constraint?
def sequence
pseq = @opts['partial_sequence']
if !pseq || pseq == "" ; pseq = "0" end
pseq
end
# Returns xml in the form for list of symbols
def pepxml_parameters
keys_as_symbols = @opts.sort.map do |k,v| k.to_s end
params_xml(*keys_as_symbols)
# (:peptide_mass_tol, :peptide_mass_units, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter)
end
def precursor_mass_type
case @opts['mass_type_parent']
when '0' ; "average"
when '1' ; "monoisotopic"
else ; abort "error in mass_type_parent in sequest!"
end
end
def fragment_mass_type
fmtype =
case @opts['mass_type_fragment']
when '0' ; "average"
when '1' ; "monoisotopic"
else ; abort "error in mass_type_fragment in sequest!"
end
end
def method_missing(name, *args)
string = name.to_s
if @opts.key?(string) ; return @opts[string]
elsif @mods.key?(string) ; return @mods[string]
else ; return nil
end
end
## We only need to define values if they are different than sequest.params
## The method_missing will look them up in the hash!
# Returns a system independent basename
# Splits on "\" or "/"
def _sys_ind_basename(file)
return file.split(/[\\\/]/)[-1]
end
# changes the path of the database
def database_path=(newpath)
db = @opts["first_database_name"]
newpath = File.join(newpath, _sys_ind_basename(db))
@opts["first_database_name"] = newpath
end
def database
@opts["first_database_name"]
end
# returns the appropriate aminoacid mass lookup table (in spec_id.rb SpecID::MONO or
# SpecID::AVG based on precursor_mass_type
def mass_table
case precursor_mass_type
when 'average'
SpecID::AVG
when 'monoisotopic'
SpecID::MONO
end
end
# at least in Bioworks 3.2, the First number after the enzyme
# is the indication of the enzymatic end stringency (required):
# 1 = Fully enzymatic
# 2 = Either end
# 3 = N terminal only
# 4 = C terminal only
# So, to get min_number_termini we map like this:
# 1 => 2
# 2 => 1
def min_number_termini
termini_number = @opts["enzyme_info"].split(" ")[1]
if termini_number == "1"
return "2"
elsif termini_number == "2"
return "1"
else
puts "WARNING: Enzyme termini info might be imprecise!"
return "1"
end
end
def enzyme
#if @opts["enzyme_info"] =~ /Trypsin/ ; return "tryptic"
#else ; return @opts["enzyme_info"].split('(')[0] end
return @opts["enzyme_info"].split('(')[0]
end
def max_num_internal_cleavages
@opts["max_num_internal_cleavage_sites"]
end
def peptide_mass_tol
if @opts["peptide_mass_units"] != "0"
puts "WARNING: peptide_mass_tol units need to be adjusted!"
end
@opts["peptide_mass_tolerance"]
end
def fragment_ion_tol
@opts["fragment_ion_tolerance"]
end
def max_num_differential_AA_per_mod
@opts["max_num_differential_per_peptide"]
end
## @TODO: We could add some of the parameters not currently being asked for to be more complete
## @TODO: We could always add the Bioworks 3.2 specific params as params
####################################################
####################################################
end
class SpecID::Sequest::PepXML::SearchResult
include SpecIDXML
# an array of search_hits
attr_accessor :search_hits
# if block given, then search_hits set to return value
def initialize
if block_given? ; @search_hits = yield
else ; @search_hits = [] end
end
def to_pepxml
element_xml_no_atts(:search_result) do
@search_hits.map {|sh| sh.to_pepxml }.join
end
end
end
class SpecID::Sequest::PepXML::SearchSummary
include SpecIDXML
attr_accessor :params
attr_accessor :base_name
attr_accessor :out_data_type
attr_accessor :out_data
attr_accessor :modifications
# A SearchDatabase object (responds to :local_path and :type)
attr_accessor :search_database
# if given a sequest params object, then will set the following attributes:
# args is a hash of parameters
# modifications_string -> See Modifications
def initialize(params, modifications_string='', args=nil)
@search_id = nil
@params = params
@modifications = SpecID::Sequest::PepXML::Modifications.new(params, modifications_string)
if args ; set_from_hash(args) end
end
def method_missing(symbol, *args)
if @params ; @params.send(symbol, *args) end
end
def search_id
if @search_id ; @search_id
else ; '1' end
end
def to_pepxml
element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
search_database.to_pepxml +
short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) +
@modifications.to_pepxml +
@params.pepxml_parameters
end
end
end
class SpecID::Sequest::PepXML::Modifications
include SpecIDXML
# sequest params object
attr_accessor :params
# array holding AAModifications
attr_accessor :aa_mods
# array holding TerminalModifications
attr_accessor :term_mods
# a hash of all differential modifications present by aa_one_letter_symbol
# and special_symbol. This is NOT the mass difference but the total mass {
# 'M*' => 155.5, 'S@' => 190.3 }. NOTE: Since the termini are dependent on
# the amino acid sequence, they are give the *differential* mass. The
# termini are given the special symbol as in sequest e.g. '[' => 12.22, #
# cterminus ']' => 14.55 # nterminus
attr_accessor :masses_by_diff_mod_hash
# a hash, key is [AA_one_letter_symbol.to_sym, difference.to_f]
# values are the special_symbols
attr_accessor :mod_symbols_hash
# The modification symbols string looks like this:
# (M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000)
# ct is cterminal peptide (differential)
# nt is nterminal peptide (differential)
# the C is just cysteine
# will set_modifications and masses_by_diff_mod hash
def initialize(params, modification_symbols_string='')
@params = params
set_modifications(params, modification_symbols_string)
end
# set the masses_by_diff_mod and mod_symbols_hash from
def set_hashes(modification_symbols_string)
@mod_symbols_hash = {}
@masses_by_diff_mod = {}
if modification_symbols_string == nil || modification_symbols_string == ''
return nil
end
table = @params.mass_table
modification_symbols_string.split(/\)\s+\(/).each do |mod|
if mod =~ /\(?(\w{1,2})(.) (.[\d\.]+)\)?/
aa_as_sym = $1.to_sym,
@mod_symbols_hash[[aa_as_sym, $3.to_f]] = $2.dup
if $1 == 'ct' || $1 == 'nt'
@masses_by_diff_mod[$2] = $3.to_f
else
@masses_by_diff_mod[$1+$2] = $3.to_f + table[aa_as_sym]
end
end
end
end
# given a bare peptide (no end pieces) returns a ModificationInfo object
# e.g. given "]PEPT*IDE", NOT 'K.PEPTIDE.R'
# if there are no modifications, returns nil
def modification_info(peptide)
if @masses_by_diff_mod.size == 0
return nil
end
hash[:modified_peptide] = peptide.dup
hash = {}
hsh = @masses_by_diff_mod
table = @params.mass_table
h = table[:h] # this? or h_plus ??
oh = table[:o] + h
## only the termini can match a single char
if hsh.key? peptide[0,1]
# AA + H + differential_mod
hash[:mod_nterm_mass] = table[peptide[1,1].to_sym] + h + hsh[peptide[0,1]]
peptide.slice!( 1..-1 )
end
if hsh.key? peptide[-1,1]
# AA + OH + differential_mod
hash[:mod_cterm_mass] = table[peptide[-2,1].to_sym] + oh + hsh[peptide[-1,1]]
peptide.slice!( 0..-2 )
end
mod_array = []
(0...peptide.size).each do |i|
if hsh.key? peptide[i,2]
mod_array << [ i+1 , hsh[peptide[i,2]] ]
end
end
if mod_array.size > 0
hash[:mod_aminoacid_mass_array] = mod_array
end
if hash.size > 0
SpecID::Sequest::PepXML::SearchHit::ModificationInfo.new(hash)
else
nil
end
end
# 1. sets aa_mods and term_mods from a sequest params object
# 2. sets @params
# 3. sets @masses_by_diff_mod
def set_modifications(params, modification_symbols_string)
@params = params
set_hashes(modification_symbols_string)
####################################
## static mods
####################################
static_mods = [] # [[one_letter_amino_acid.to_sym, add_amount.to_f], ...]
static_terminal_mods = [] # e.g. [add_Cterm_peptide, amount.to_f]
params.mods.each do |k,v|
v_to_f = v.to_f
if v_to_f != 0.0
if k =~ /add_(\w)_/
static_mods << [$1.to_sym, v_to_f]
else
static_terminal_mods << [k, v_to_f]
end
end
end
aa_hash = params.mass_table
## Create the static_mods objects
static_mods.map! do |mod|
hash = {
:aminoacid => mod[0].to_s,
:massdiff => mod[1].to_plus_minus_string,
:mass => aa_hash[mod[0]] + mod[1],
:variable => 'N',
:binary => 'Y',
}
SpecID::Sequest::PepXML::AAModification.new(hash)
end
## Create the static_terminal_mods objects
static_terminal_mods.map! do |mod|
terminus = if mod[0] =~ /Cterm/ ; 'c'
else ; 'n' # only two possible termini
end
protein_terminus = case mod[0]
when /Nterm_protein/ ; 'n'
when /Cterm_protein/ ; 'c'
else nil
end
# create the hash
hash = {
:terminus => terminus,
:massdiff => mod[1].to_plus_minus_string,
:variable => 'N',
:description => mod[0],
}
hash[:protein_terminus] = protein_terminus if protein_terminus
SpecID::Sequest::PepXML::TerminalModification.new(hash)
end
#################################
# Variable Mods:
#################################
arr = params.diff_search_options.rstrip.split(/\s+/)
# [aa.to_sym, diff.to_f]
variable_mods = []
(0...arr.size).step(2) do |i|
if arr[i].to_f != 0.0
variable_mods << [arr[i+1].to_sym, arr[i].to_f]
end
end
variable_mods.map! do |mod|
hash = {
:aminoacid => mod[0].to_s,
:massdiff => mod[1].to_plus_minus_string,
:mass => aa_hash[mod[0]] + mod[1],
:variable => 'Y',
:binary => 'N',
:symbol => @mod_symbols_hash[mod],
}
SpecID::Sequest::PepXML::AAModification.new(hash)
end
#################################
# TERMINAL Variable Mods:
#################################
# These are always peptide, not protein termini (for sequest)
(nterm_diff, cterm_diff) = params.term_diff_search_options.rstrip.split(/\s+/).map{|v| v.to_f }
to_add = []
if nterm_diff != 0.0
to_add << ['n',nterm_diff.to_plus_minus_string, @mod_symbols_hash[:nt, nterm_diff]]
end
if cterm_diff != 0.0
to_add << ['c', cterm_diff.to_plus_minus_string, @mod_symbols_hash[:ct, cterm_diff]]
end
variable_terminal_mods = to_add.map do |term, mssdiff, symb|
hash = {
:terminus => term,
:massdiff => mssdiff,
:variable => 'Y',
:symbol => symb,
}
SpecID::Sequest::PepXML::TerminalModification.new(hash)
end
#########################
# COLLECT THEM
#########################
@aa_mods = static_mods + variable_mods
@term_mods = static_terminal_mods + variable_terminal_mods
end
## Generates the pepxml for static and differential amino acid mods based on
## sequest object
def to_pepxml
st = ''
if @aa_mods
st << @aa_mods.map {|v| v.to_pepxml }.join
end
if @term_mods
st << @term_mods.map {|v| v.to_pepxml }.join
end
st
end
end
# Modified aminoacid, static or variable
# unless otherwise stated, all attributes can be anything
class SpecID::Sequest::PepXML::AAModification
include SpecIDXML
# The amino acid (one letter code)
attr_accessor :aminoacid
# Must be a string!!!!
# Mass difference with respect to unmodified aminoacid, must begin with
# either + (nonnegative) or - [e.g. +1.05446 or -2.3342]
# consider Numeric#to_plus_minus_string at top
attr_accessor :massdiff
# Mass of modified aminoacid
attr_accessor :mass
# Y if both modified and unmodified aminoacid could be present in the
# dataset, N if only modified aminoacid can be present
attr_accessor :variable
# whether modification can reside only at protein terminus (specified 'n',
# 'c', or 'nc')
attr_accessor :peptide_terminus
# Special symbol used by search engine to designate this modification
attr_accessor :symbol
# Y if each peptide must have only modified or unmodified aminoacid, N if a
# peptide may contain both modified and unmodified aminoacid
attr_accessor :binary
def initialize(hash=nil)
instance_var_set_from_hash(hash) if hash # can use unless there are weird methods
end
def to_pepxml
short_element_xml_from_instance_vars("aminoacid_modification")
end
end
# Modified aminoacid, static or variable
class SpecID::Sequest::PepXML::TerminalModification
include SpecIDXML
# n for N-terminus, c for C-terminus
attr_accessor :terminus
# Mass difference with respect to unmodified terminus
attr_accessor :massdiff
# Mass of modified terminus
attr_accessor :mass
# Y if both modified and unmodified terminus could be present in the
# dataset, N if only modified terminus can be present
attr_accessor :variable
# Special symbol used by search engine to designate this modification
attr_accessor :symbol
# whether modification can reside only at protein terminus (specified n or
# c)
attr_accessor :protein_terminus
attr_accessor :description
def initialize(hash=nil)
instance_var_set_from_hash(hash) if hash # can use unless there are weird methods
end
def to_pepxml
short_element_xml_from_instance_vars("terminal_modification")
end
end
class SpecID::Sequest::PepXML::SearchDatabase
include SpecIDXML
attr_accessor :local_path
attr_writer :seq_type
# Takes a SequestParams object
# Sets :local_path from the params object attr :database
def initialize(params=nil, args=nil)
@seq_type = nil
if params
@local_path = params.database
end
if args ; set_from_hash(args) end
end
def seq_type
if @seq_type ; @seq_type
else
if @local_path =~ /\.fasta/
'AA'
else
abort "Don't recognize type from your database local path: #{@local_path}"
end
end
end
def to_pepxml
short_element_xml_and_att_string(:search_database, "local_path=\"#{local_path}\" type=\"#{seq_type}\"")
end
end
class SpecID::Sequest::PepXML::SpectrumQuery
include SpecIDXML
# basename_noext.first_scan.last_scan.charge
attr_accessor :spectrum
attr_accessor :start_scan
attr_accessor :end_scan
attr_accessor :precursor_neutral_mass
attr_accessor :index
attr_accessor :search_results
# this is a string
attr_accessor :assumed_charge
attr_accessor :pepxml_version
# sets the search_results array
# if block given, sets search_results to return value
def initialize(hash=nil)
if block_given? ; @search_results = yield
else ; @search_results = []
end
if hash ; set_from_hash(hash) end
end
############################################################
# FOR PEPXML:
############################################################
def to_pepxml
case SpecID::Sequest::PepXML.pepxml_version
when 18
element_xml("spectrum_query", [:spectrum, :start_scan, :end_scan, :precursor_neutral_mass, :assumed_charge, :index]) do
@search_results.collect { |sr| sr.to_pepxml }.join
end
when 0
#element_xml("search_result", [:spectrum, :start_scan, :end_scan, :precursor_neutral_mass, :assumed_charge, :index]) do
# @search_results.collect { |search_result|
# search_result.to_pepxml
# }.join("\n")
#end
end
end
# Returns the precursor_neutral based on the scans and an array indexed by
# scan numbers. first and last scan and charge should be integers.
# This is the precursor_mz - h_plus!
# by=:prec_mz_arr|:deltamass
# if prec_mz_arr then the following arguments must be supplied:
# :first_scan = int, :last_scan = int, :prec_mz_arr = array with the precursor
# m/z for each product scan, :charge = int
# if deltamass then the following arguments must be supplied:
# m_plus_h = float, deltamass = float
# For both flavors, a final additional argument 'average_weights'
# can be used. If true (default), average weights will be used, if false,
# monoisotopic weights (currently this is simply the mass of the proton)
def self.calc_precursor_neutral_mass(by, *args)
average_weights = true
case by
when :prec_mz_arr
(first_scan, last_scan, prec_mz_arr, charge, average_weights) = args
when :deltamass
(m_plus_h, deltamass, average_weights) = args
end
if average_weights
mass_h_plus = SpecID::AVG[:h_plus]
else
mass_h_plus = SpecID::MONO[:h_plus]
end
case by
when :prec_mz_arr
mz = nil
if first_scan != last_scan
sum = 0.0
tot_num = 0
(first_scan..last_scan).each do |scan|
val = prec_mz_arr[scan]
if val # if the scan is not an mslevel 2
sum += val.to_f
tot_num += 1
end
end
mz = sum/tot_num.to_f
else
mz = prec_mz_arr[first_scan].to_f
end
charge * (mz - mass_h_plus)
when :deltamass
m_plus_h - mass_h_plus + deltamass
else
abort "don't recognize 'by' in calc_precursor_neutral_mass: #{by}"
end
end
end
# This object inherits from Array. As such, it is very memory efficient
# (compared to a normal object). However, certain operations when used on
# these objects will produce undesirable results: An array of these objects
# will be flattened (becoming a long list of attributes) when 'flatten' is
# called on them, which is not the behavior we want! other odd behavior is
# possible. Possible fixes are to use a delegate class or redefine the way
# this responds to flatten (so that it won't flatten).
class SpecID::Sequest::PepXML::SearchHit < Array
include SpecIDXML
Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
# num_tot_proteins = "Number of unique proteins in search database containing peptide"
#attr_accessor 0:hit_rank, 1:peptide, 2:peptide_prev_aa, 3:peptide_next_aa, 4:protein, 5:num_tot_proteins, 6:num_matched_ions, 7:tot_num_ions, 8:calc_neutral_pep_mass, 9:massdiff, 10:num_tol_term, 11:num_missed_cleavages, 12:is_rejected
#attr_accessor 13:deltacnstar
#attr_accessor 14:xcorr, 15:deltacn, 16:spscore, 17:sprank
ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
ind_keys = {:hit_rank => 0, :peptide => 1, :peptide_prev_aa => 2, :peptide_next_aa => 3, :protein => 4, :num_tot_proteins => 5, :num_matched_ions => 6, :tot_num_ions => 7, :calc_neutral_pep_mass => 8, :massdiff => 9, :num_tol_term => 10, :num_missed_cleavages => 11, :is_rejected => 12, :deltacnstar => 13, :xcorr => 14, :deltacn => 15, :spscore => 16, :sprank => 17}
@@methods = ind_keys.keys
def hit_rank ; self[0] end ; def hit_rank=(oth) ; self[0] = oth end
def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
@@arr_size = ind_keys.size
ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
ind_keys.merge!(ind_keys_w_eq)
ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
# These are all search_score elements:
# 1 if there is no second ranked hit, 0 otherwise
def initialize(hash=nil)
super(@@arr_size)
self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
self
#if hash ; set_from_hash(hash) end
end
# remove_non_amino_acids && split_sequence
def self.prepare_sequence(val)
nv = remove_non_amino_acids(val)
split_sequence(nv)
end
# Returns prev, peptide, next from sequence. Parse errors return
# nil,nil,nil
# R.PEPTIDE.A # -> R, PEPTIDE, A
# R.PEPTIDE.- # -> R, PEPTIDE, -
# PEPTIDE.A # -> -, PEPTIDE, A
# A.PEPTIDE # -> A, PEPTIDE, -
# PEPTIDE # -> nil,nil,nil
def self.split_sequence(val)
peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
pieces = val.split('.')
case pieces.size
when 3
peptide_prev_aa, peptide, peptide_next_aa = *pieces
when 2
if pieces[0].size > 1 ## N termini
peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
else ## C termini
peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
end
when 1 ## this must be a parse error!
peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
when 0
peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
end
return peptide_prev_aa, peptide, peptide_next_aa
end
# removes nonstandard chars with Non_standard_amino_acid_char_re
# preserves A-Z and '.
def self.remove_non_amino_acids(sequence)
sequence.gsub(Non_standard_amino_acid_char_re, '')
end
def inspect
var = @@methods.map do |m| "#{m}:#{self.send(m)}" end.join(" ")
"#"
end
# requires Params object and full sequence (with heads and tails)
def self.calc_num_missed_cleavages(params, sequence)
num_missed = 0
split_after, except_before = params.enzyme_specificity
first, middle, last = self.split_sequence(sequence)
arr = middle.scan(/[#{split_after}][^#{except_before}]/)
return arr.size
end
# requires Params object and full sequence (with heads and tails)
def self.calc_num_tol_term(params, sequence)
num_tol = 0
split_after, except_before = params.enzyme_specificity
first, middle, last = self.split_sequence(sequence)
last_of_middle = middle[-1,1]
first_of_middle = middle[0,1]
if ( split_after.include?(first) && !except_before.include?(first_of_middle) ) || first == '-'
num_tol += 1
end
if split_after.include?(last_of_middle) && !except_before.include?(last) || last == '-'
num_tol += 1
end
return num_tol
end
# Takes ions in the form XX/YY and returns XX, YY
def self.split_ions(ions)
return *(ions.split("/"))
end
def search_score_xml(symbol)
"#{tabs}"
end
def search_scores_xml(*symbol_list)
symbol_list.collect do |sy|
search_score_xml(sy)
end.join("\n") + "\n"
end
def to_pepxml
element_xml("search_hit", [:hit_rank, :peptide, :peptide_prev_aa, :peptide_next_aa, :protein, :num_tot_proteins, :num_matched_ions, :tot_num_ions, :calc_neutral_pep_mass, :massdiff, :num_tol_term, :num_missed_cleavages, :is_rejected]) do
search_scores_xml(:xcorr, :deltacn, :deltacnstar, :spscore, :sprank)
end
end
end
# Positions and masses of modifications
class SpecID::Sequest::PepXML::SearchHit::ModificationInfo
include SpecIDXML
## Should be something like this:
#
#
#
# Mass of modified N terminus<
attr_accessor :mod_nterm_mass
# Mass of modified C terminus<
attr_accessor :mod_cterm_mass
# Peptide sequence (with indicated modifications) I'm assuming that the
# native sequest indicators are OK here
attr_accessor :modified_peptide
## A few main types:
# this should be an array of arrays: [[position, modified_mass], ...]
# position ranges from 1 to peptide length
attr_accessor :mod_aminoacid_mass_array
def initialize(hash=nil)
instance_var_set_from_hash(hash)
end
# Will escape any xml special chars in modified_peptide
def to_pepxml
## Collect the modifications:
mod_strings = []
if @mod_aminoacid_mass_array
mod_strings = @mod_aminoacid_mass_array.map do |ar|
"position=\"#{ar[0]}\" mass=\"#{ar[1]}\""
end
end
## Create the attribute string:
att_parts = []
if @mod_nterm_mass
att_parts << "mod_nterm_mass=\"#{@mod_nterm_mass}\""
end
if @mod_cterm_mass
att_parts << "mod_cterm_mass=\"#{@mod_cterm_mass}\""
end
if @modified_peptide
att_parts << "modified_peptide=\"#{escape_special_chars(@modified_peptide)}\""
end
element_xml_and_att_string('modification_info', att_parts.join(" ")) do
mod_strings.map {|st| short_element_xml_and_att_string('mod_aminoacid_mass', st) }.join
end
end
##
#
#
#
#
end