require 'spec_helper'
require 'mspire/mass'
require 'mspire/mass/aa'
require 'mspire/ident/pepxml'
require 'mspire/ident/pepxml/modifications'
require 'mspire/ident/pepxml/spectrum_query'
require 'mspire/ident/pepxml/search_result'
require 'mspire/ident/pepxml/search_hit'
require 'mspire/ident/pepxml/search_hit/modification_info'
describe "creating an Mspire::Ident::Pepxml" do
it "can be creating in a nested fashion reflecting internal structure" do
tags_that_should_be_present = %w(msms_pipeline_analysis msms_run_summary sample_enzyme search_summary spectrum_query search_result search_hit modification_info mod_aminoacid_mass search_score)
pepxml = Mspire::Ident::Pepxml.new do |msms_pipeline_analysis|
msms_pipeline_analysis.merge!(:summary_xml => "020.xml") do |msms_run_summary|
# prep the sample enzyme and search_summary
msms_run_summary.merge!(
:base_name => '/home/jtprince/dev/mspire/020',
:ms_manufacturer => 'Thermo',
:ms_model => 'LTQ Orbitrap',
:ms_ionization => 'ESI',
:ms_mass_analyzer => 'Ion Trap',
:ms_detector => 'UNKNOWN'
) do |sample_enzyme, search_summary, spectrum_queries|
sample_enzyme.merge!(:name=>'Trypsin',:cut=>'KR',:no_cut=>'P',:sense=>'C')
search_summary.merge!(
:base_name=>'/path/to/file/020',
:search_engine => 'SEQUEST',
:precursor_mass_type =>'monoisotopic',
:fragment_mass_type => 'average'
) do |search_database, enzymatic_search_constraint, modifications, parameters|
search_database.merge!(:local_path => '/path/to/db.fasta', :seq_type => 'AA') # note seq_type == type
enzymatic_search_constraint.merge!(
:enzyme => 'Trypsin',
:max_num_internal_cleavages => 2,
:min_number_termini => 2
)
modifications << Mspire::Ident::Pepxml::AminoacidModification.new(
:aminoacid => 'M', :massdiff => 15.9994, :mass => Mspire::Mass::AA::MONO['M']+15.9994,
:variable => 'Y', :symbol => '*')
# invented, for example, a protein terminating mod
modifications << Mspire::Ident::Pepxml::TerminalModification.new(
:terminus => 'c', :massdiff => 23.3333, :mass => Mspire::Mass::MONO['oh'] + 23.3333,
:variable => 'Y', :symbol => '[', :protein_terminus => 'c',
:description => 'leave protein_terminus off if not protein mod'
)
modifications << Mspire::Ident::Pepxml::TerminalModification.new(
:terminus => 'c', :massdiff => 25.42322, :mass => Mspire::Mass::MONO['h+'] + 25.42322,
:variable => 'N', :symbol => ']', :description => 'example: c term mod'
)
parameters.merge!(
:fragment_ion_tolerance => 1.0000,
:digest_mass_range => '600.0 3500.0',
:enzyme_info => 'Trypsin(KR/P) 1 1 KR P', # etc....
)
end
spectrum_query1 = Mspire::Ident::Pepxml::SpectrumQuery.new(
:spectrum => '020.3.3.1', :start_scan => 3, :end_scan => 3,
:precursor_neutral_mass => 1120.93743421875, :assumed_charge => 1
) do |search_results|
search_result1 = Mspire::Ident::Pepxml::SearchResult.new do |search_hits|
modpositions = [[1, 243.1559], [6, 167.0581], [7,181.085]].map do |pair|
Mspire::Ident::Pepxml::SearchHit::ModificationInfo::ModAminoacidMass.new(*pair)
end
# order(modified_peptide, mod_aminoacid_masses, :mod_nterm_mass, :mod_cterm_mass)
# or can be set by hash
mod_info = Mspire::Ident::Pepxml::SearchHit::ModificationInfo.new('Y#RLGGS#T#K', modpositions)
search_hit1 = Mspire::Ident::Pepxml::SearchHit.new(
:hit_rank=>1, :peptide=>'YRLGGSTK', :peptide_prev_aa => "R", :peptide_next_aa => "K",
:protein => "gi|16130113|ref|NP_416680.1|", :num_tot_proteins => 1, :num_matched_ions => 5,
:tot_num_ions => 35, :calc_neutral_pep_mass => 1120.93163442, :massdiff => 0.00579979875010395,
:num_tol_term => 2, :num_missed_cleavages => 1, :is_rejected => 0,
:modification_info => mod_info) do |search_scores|
search_scores.merge!(:xcorr => 0.12346, :deltacn => 0.7959, :deltacnstar => 0,
:spscore => 29.85, :sprank => 1)
end
search_hits << search_hit1
end
search_results << search_result1
end
spectrum_queries << spectrum_query1
end
end
end
xml = pepxml.to_xml
tags_that_should_be_present.each do |tag|
xml.should match(/<#{tag} ?/)
end
xml.should match( /<\?xml version="1.0" encoding="UTF-8"\?>/ )
xml.should match( %r{<\?xml-stylesheet type="text/xsl" href="/tools/bin/TPP/tpp/schema/pepXML_std.xsl"\?>} )
end
end
=begin
# splits string on ' 'and matches the line found by find_line_regexp in
# lines
def match_modline_pieces(lines, find_line_regexp, string)
pieces = string.split(' ').map {|v| /#{Regexp.escape(v)}/ }
lines.each do |line|
if line =~ find_line_regexp
pieces.each do |piece|
line.should =~ piece
end
end
end
end
it 'gets modifications right in real run' do
@out_files.each do |fn|
fn.exist_as_a_file?.should be_true
beginning = IO.read(fn)
lines = beginning.split("\n")
[
[/aminoacid="M"/, ''],
].each do |a,b|
match_modline_pieces(lines, a, b)
end
[
'',
'',
'',
'',
''
].each do |line|
beginning.should =~ /#{Regexp.escape(line)}/ # "a modification info for a peptide")
end
end
end
end
end
=begin
describe "Mspire::Ident::Pepxml created from small bioworks.xml" do
spec_large do
before(:all) do
tf_mzxml_path = Tfiles_l + "/yeast_gly_mzXML"
tf_params = Tfiles + "/bioworks32.params"
tf_bioworks_xml = Tfiles + "/bioworks_small.xml"
out_path = Tfiles
@pepxml_objs = Sequest::Pepxml.set_from_bioworks(tf_bioworks_xml, :params => tf_params, :ms_data => tf_mzxml_path, :out_path => out_path)
end
it 'gets some spectrum queries' do
@pepxml_objs.each do |obj|
(obj.spectrum_queries.size > 2).should be_true
(obj.spectrum_queries.first.search_results.first.search_hits.size > 0).should be_true
end
#@pepxml_objs.each do |pep| puts pep.to_pepxml end
end
end
end
describe Sequest::Pepxml, " created from large bioworks.xml" do
# assert_equal_by_pairs (really any old array)
def assert_equal_pairs(obj, arrs)
arrs.each do |arr|
#if obj.send(arr[1]) != arr[0]
# puts "HELLO"
# puts "OBJ answer"
# p obj.send(arr[1])
# puts "ar0"
# p arr[0]
# puts "ar1"
# p arr[1]
#end
if arr[0].is_a? Float
obj.send(arr[1]).should be_close(arr[0], 0.0000000001)
else
obj.send(arr[1]).should == arr[0]
end
end
end
#swap the first to guys first
def assert_equal_pairs_swapped(obj, arrs)
arrs.each do |arr|
arr[0], arr[1] = arr[1], arr[0]
end
assert_equal_pairs(obj, arrs)
end
spec_large do
before(:all) do
st = Time.new
params = Tfiles + "/opd1/sequest.3.2.params"
bioworks_xml = Tfiles_l + "/opd1/bioworks.000.oldparams.xml"
mzxml_path = Tfiles_l + "/opd1"
out_path = Tfiles
@pepxml_version = 18
@pepxml_objs = Sequest::Pepxml.set_from_bioworks_xml(bioworks_xml, params, {:ms_data => mzxml_path, :out_path => out_path, :pepxml_version => @pepxml_version})
puts "- takes #{Time.new - st} secs"
end
it 'extracts MSMSPipelineAnalysis' do
######## HMMMMM...
Sequest::Pepxml.pepxml_version.should == @pepxml_version
# MSMSPipelineAnalysis
po = @pepxml_objs.first
msms_pipeline = po.msms_pipeline_analysis
msms_pipeline.xmlns.should == 'http://regis-web.systemsbiology.net/pepXML'
msms_pipeline.xmlns_xsi.should == 'http://www.w3.org/2001/XMLSchema-instance'
msms_pipeline.xsi_schema_location.should == 'http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v18.xsd'
msms_pipeline.summary_xml.should == '000.xml'
end
it 'extracts MSmSRunSummary' do
# MSMSRunSummary
rs = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary
rs.base_name.should =~ /\/000/
assert_equal_pairs(rs, [ ['ThermoFinnigan', :ms_manufacturer], ['LCQ Deca XP Plus', :ms_model], ['ESI', :ms_ionization], ['Ion Trap', :ms_mass_analyzer], ['UNKNOWN', :ms_detector], ['raw', :raw_data_type], ['.mzXML', :raw_data], ])
end
it 'extracts SampleEnzyme' do
# SampleEnzyme
se = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.sample_enzyme
assert_equal_pairs(se, [ ['Trypsin', :name], ['KR', :cut], [nil, :no_cut], ['C', :sense], ])
end
it 'extracts SearchSummary' do
# SearchSummary
ss = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary
ss.is_a?(Sequest::Pepxml::SearchSummary).should be_true
ss.base_name.should =~ /\/000/
ss.peptide_mass_tol.should =~ /1\.500/
assert_equal_pairs_swapped(ss, [ # normal attributes
[:search_engine, "SEQUEST"], [:precursor_mass_type, "average"], [:fragment_mass_type, "average"], [:out_data_type, "out"], [:out_data, ".tgz"], [:search_id, "1"],
# enzymatic_search_constraint
[:enzyme, 'Trypsin'], [:max_num_internal_cleavages, '2'], [:min_number_termini, '2'],
# parameters
[:fragment_ion_tol, "1.0000"], [:ion_series, "0 1 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0"], [:max_num_differential_AA_per_mod, "3"], [:nucleotide_reading_frame, "0"], [:num_output_lines, "10"], [:remove_precursor_peak, "0"], [:ion_cutoff_percentage, "0.0000"], [:match_peak_count, "0"], [:match_peak_allowed_error, "1"], [:match_peak_tolerance, "1.0000"], [:protein_mass_filter, "0 0"],
])
end
it 'extracts SearchDatabase' do
# SearchDatabase
sd = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.search_summary.search_database
sd.is_a?(Sequest::Pepxml::SearchDatabase).should be_true
assert_equal_pairs_swapped(sd, [ [:local_path, "C:\\Xcalibur\\database\\ecoli_K12.fasta"], [:seq_type, 'AA'], ])
end
it 'returns SpectrumQueries' do
# SpectrumQueries
sq = @pepxml_objs.first.msms_pipeline_analysis.msms_run_summary.spectrum_queries
spec = sq.first
assert_equal_pairs_swapped(spec, [
[:spectrum, "000.100.100.1"], [:start_scan, "100"], [:end_scan, "100"],
#[:precursor_neutral_mass, "1074.5920"], # out2summary
[:precursor_neutral_mass, 1074.666926], # mine
[:assumed_charge, 1], [:index, "1"],
])
sh = spec.search_results.first.search_hits.first
assert_equal_pairs_swapped(sh, [
# normal attributes
[:hit_rank, 1],
[:peptide, "SIYFRNFK"],
[:peptide_prev_aa, "R"],
[:peptide_next_aa, "G"],
[:protein, "gi|16130084|ref|NP_416651.1|"],
[:num_tot_proteins, 1],
[:num_matched_ions, 4],
[:tot_num_ions, 14],
#[:calc_neutral_pep_mass, "1074.1920"], # out2summary
[:calc_neutral_pep_mass, 1074.23261], # mine
#[:massdiff, "+0.400000"], # out2summary
[:massdiff, 0.434316000000081], # mine
[:num_tol_term, 2], [:num_missed_cleavages, 1], [:is_rejected, 0],
# search_score
[:xcorr, 0.4], [:deltacn, 0.023], [:deltacnstar, "0"], [:spscore, 78.8], [:sprank, 1],
])
spec = sq[1]
assert_equal_pairs_swapped(spec, [
[:spectrum, "000.1000.1000.1"], [:start_scan, "1000"], [:end_scan, "1000"], #[:precursor_neutral_mass, "663.1920"], # out2summary
[:precursor_neutral_mass, 663.206111], # mine
[:assumed_charge, 1], [:index, "2"],
])
sh = spec.search_results.first.search_hits.first
assert_equal_pairs_swapped(sh, [
# normal attributes
[:hit_rank, 1], [:peptide, "ALADFK"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "S"], [:protein, "gi|16128765|ref|NP_415318.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 10],
[:num_tol_term, 2], [:num_missed_cleavages, 0], [:is_rejected, 0],
#[:massdiff, "-0.600000"], # out2summary
[:massdiff, -0.556499000000031], # mine
#[:calc_neutral_pep_mass, 663.7920], # out2summary
[:calc_neutral_pep_mass, 663.76261], # mine
# search_score
[:xcorr, 0.965], [:deltacn, 0.132], [:deltacnstar, "0"], [:spscore, 81.1], [:sprank, 1],
])
spec = sq[9]
assert_equal_pairs_swapped(spec, [
[:spectrum, "000.1008.1008.2"], [:start_scan, "1008"], [:end_scan, "1008"], [:assumed_charge, 2],
#[:precursor_neutral_mass, "691.0920"], # out2summary
[:precursor_neutral_mass, 691.150992], # mine
])
sh = spec.search_results.first.search_hits.first
assert_equal_pairs_swapped(sh, [
# normal attributes
[:hit_rank, 1], [:peptide, "RLFTR"], [:peptide_prev_aa, "R"], [:peptide_next_aa, "A"], [:protein, "gi|16130457|ref|NP_417027.1|"], [:num_tot_proteins, 1], [:num_matched_ions, 5], [:tot_num_ions, 8], [:num_tol_term, 2],
#[:num_missed_cleavages, "0"], # out2summary misses this!
[:num_missed_cleavages, 1],
[:is_rejected, 0],
#[:calc_neutral_pep_mass, "691.7920"], # out2summary
[:calc_neutral_pep_mass, 691.82261], # mine
#[:massdiff, "-0.700000"], # out2summary
[:massdiff, -0.67161800000008], # mine
# search_score
[:xcorr, 0.903], [:deltacn, 0.333], [:deltacnstar, "0"], [:spscore, 172.8], [:sprank, 1],
])
end
it 'can generate correct pepxml file' do
## IF OUR OBJECT IS CORRECT, THEN WE GET THE OUTPUT:
string = @pepxml_objs.first.to_pepxml
ans_lines = IO.read(Tfiles + "/opd1/000.my_answer.100lines.xml").split("\n")
base_name_re = /base_name=".*?files\//o
date_re = /date=".*?"/
string.split("\n").each_with_index do |line,i|
if i > 99 ; break end
ans, exp =
if i == 1
[line.sub(date_re,''), ans_lines[i].sub(date_re,'')]
elsif i == 2
[line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t")]
elsif i == 6
[line.sub(base_name_re,''), ans_lines[i].sub(base_name_re, '').sub(/^\s+/, "\t\t")]
else
[line, ans_lines[i]]
end
#ans.split('').zip(exp.split('')) do |l,a|
# if l != a
# puts line
# puts ans_lines[i]
# puts l
# puts a
# end
#end
if ans != exp
puts ans
puts exp
end
ans.should == exp
#line.sub(base_name_re,'').should == ans_lines[i].sub(base_name_re,'')
end
end
end
end
describe Sequest::Pepxml::Modifications do
before(:each) do
tf_params = Tfiles + "/bioworks32.params"
@params = Sequest::Params.new(tf_params)
# The params object here is completely unnecessary for this test, except
# that it sets up the mass table
@obj = Sequest::Pepxml::Modifications.new(@params, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
end
it 'creates a mod_symbols_hash' do
answ = {[:C, 12.0]=>"^", [:S, 80.0]=>"@", [:M, 29.0]=>"#", [:M, 15.9]=>"*", [:ct, 12.33]=>"[", [:nt, 14.2]=>"]"}
@obj.mod_symbols_hash.should == answ
## need more here
end
it 'creates a ModificationInfo object given a special peptide sequence' do
mod_string = "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) "
@params.diff_search_options = "15.90000 M 29.00000 M 80.00000 S 12.00000 C"
@params.term_diff_search_options = "14.20000 12.33000"
mod = Sequest::Pepxml::Modifications.new(@params, mod_string)
## no mods
peptide = "PEPTIDE"
mod.modification_info(peptide).should be_nil
peptide = "]M*EC^S@IDM#M*EMSCM["
modinfo = mod.modification_info(peptide)
modinfo.modified_peptide.should == peptide
modinfo.mod_nterm_mass.should be_close(146.40054, 0.000001)
modinfo.mod_cterm_mass.should be_close(160.52994, 0.000001)
end
end
describe Sequest::Pepxml::SearchHit::ModificationInfo do
before(:each) do
modaaobjs = [[3, 150.3], [6, 345.2]].map do |ar|
Sequest::Pepxml::SearchHit::ModificationInfo::ModAminoacidMass.new(ar)
end
hash = {
:mod_nterm_mass => 520.2,
:modified_peptide => "MOD*IFI^E&D",
:mod_aminoacid_masses => modaaobjs,
}
#answ = "\n\t\n\t\n\n"
@obj = Sequest::Pepxml::SearchHit::ModificationInfo.new(hash)
end
def _re(st)
/#{Regexp.escape(st)}/
end
it 'can produce pepxml' do
answ = @obj.to_pepxml
answ.should =~ _re('")
end
end
=end