lib/spec_id/sequest.rb in mspire-0.1.3 vs lib/spec_id/sequest.rb in mspire-0.1.5
- old
+ new
@@ -3,10 +3,11 @@
require 'spec/mzxml/parser'
require 'hash_by'
require 'set_from_hash'
require 'spec_id/bioworks'
require 'instance_var_set_from_hash'
+require 'spec/msrun'
module SpecID::Sequest; end
class SpecID::Sequest::PepXML; end
class SpecID::Sequest::PepXML::MSMSPipelineAnalysis
@@ -60,15 +61,15 @@
def to_pepxml
case SpecID::Sequest::PepXML.pepxml_version
when 0
element_xml(:msms_pipeline_analysis, [:date, :summary_xml]) do
- @msms_run_summary.to_pepxml
+ @msms_run_summary.to_pepxml
end
when 18
element_xml_and_att_string(:msms_pipeline_analysis, "date=\"#{date}\" xmlns=\"#{xmlns}\" xmlns:xsi=\"#{xmlns_xsi}\" xsi:schemaLocation=\"#{xsi_schema_location}\" summary_xml=\"#{summary_xml}\"") do
- @msms_run_summary.to_pepxml
+ @msms_run_summary.to_pepxml
end
else
abort "Don't know how to deal with version: #{SpecID::Sequest::PepXML.pepxml_version}"
end
end
@@ -231,46 +232,65 @@
## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
## bioworks
#bioworks.peps.each { |pep| if pep.class != SpecID::Bioworks::Pep ; puts "trying to pass as pep: "; p pep; abort "NOT a pep!" end }
- ## DOING THIS BY INDIVIDUAL FILE for when multiple files are in the path!
- ## Get hash by filename containing arrays of precursor_mz indexed by scan num
- #pre_mz_arr_hash = Spec::MzXML::Parser.new.precursor_mz_by_scan_for_path(msdata, /(\.mzXML)|(\.timeIndex)|(\.[Rr][Aa][Ww])$/)
## Start
split_bio_objs = []
## Create a hash by pep object containing num_tot_proteins
## This is only valid if all hits are present (no previous thresholding)
self._prot_num_and_first_prot_by_pep(bioworks.peps)
## (num_prots_by_pep, prot_by_pep) =
#num_prots_by_pep.each do |k,v| puts "k: #{k} v: #{v}\n"; break end ; prot_by_pep.each do |k,v| puts "k: #{k} v: #{v}" ; break end ; abort "HERE"
- mzxml_parser = Spec::MzXML::Parser.new
## Create a hash of spectrum_query arrays by filename (this very big block):
spectrum_queries_by_base_name = {}
# Hash by the filenames to split into filenames:
bioworks.peps.hash_by(:base_name).each do |base_name, pep_arr|
+ prec_mz_arr = nil
+ case x = bioworks.version
+ when /3.2/
+ calc_prec_by = :prec_mz_arr
# get the precursor_mz array for this filename
inner__full_base_name_no_ext = File.join(msdata, base_name)
- pre_mz_arr = mzxml_parser.precursor_mz_by_scan(inner__full_base_name_no_ext)
+ prec_mz_arr = Spec::MSRun.precursor_mz_by_scan(inner__full_base_name_no_ext)
+ when /3.3/
+ calc_prec_by = :deltamass
+ else
+ abort "invalid BioworksBrowser version: #{x}"
+ end
spectrum_queries = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
# Sort_by_rank and take the top hit (to mimick out2summary):
arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
top_pep = arr.pop
second_hit = arr.last # needed for deltacnstar
+
+ case params.precursor_mass_type
+ when "monoisotopic" ; avg_parent = false
+ else ; avg_parent = true
+ end
+
+ case avg_parent
+ when true ; h_plus = SpecID::AVG[:h_plus]
+ when false ; h_plus = SpecID::MONO[:h_plus]
+ end
+
+ case calc_prec_by
+ when :prec_mz_arr
+ precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, avg_parent)
+ when :deltamass
+ precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, avg_parent)
+ end
- ## Calculate values with interdependency:
- # mass values:
- precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(top_pep.first_scan.to_i, top_pep.last_scan.to_i, pre_mz_arr, top_pep.charge.to_i)
- calc_neutral_pep_mass = (top_pep.mass.to_f - SpecID::AMU_HYDROGEN)
+ calc_neutral_pep_mass = (top_pep.mass.to_f - h_plus)
massdiff = precursor_neutral_mass - calc_neutral_pep_mass
if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
else ; massdiff = massdiff.to_s end
# deltacn & star:
# (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
@@ -369,47 +389,47 @@
# :sample_enzyme => params.enzyme,
# })
end
end # collects the pepxml objects
-end
+ end
-def summary_xml
- base_name + ".xml"
-end
+ def summary_xml
+ base_name + ".xml"
+ end
-def precursor_mass_type
- @params.precursor_mass_type
-end
+ def precursor_mass_type
+ @params.precursor_mass_type
+ end
-def fragment_mass_type
- @params.fragment_mass_type
-end
+ def fragment_mass_type
+ @params.fragment_mass_type
+ end
-# combines filename in a manner consistent with the path
-def self.make_base_name(path, filename)
- sep = "/"
- if path.split("/").size < path.split("\\").size
- sep = "\\"
+ # combines filename in a manner consistent with the path
+ def self.make_base_name(path, filename)
+ sep = "/"
+ if path.split("/").size < path.split("\\").size
+ sep = "\\"
+ end
+ if path.split("").last == sep
+ return path + File.basename(filename)
+ else
+ return path + sep + File.basename(filename)
+ end
end
- if path.split("").last == sep
- return path + File.basename(filename)
- else
- return path + sep + File.basename(filename)
- end
-end
-# outputs pepxml, (to file if given)
-def to_pepxml(file=nil)
- string = header
- string << @msms_pipeline_analysis.to_pepxml
+ # outputs pepxml, (to file if given)
+ def to_pepxml(file=nil)
+ string = header
+ string << @msms_pipeline_analysis.to_pepxml
- if file
- File.open(file, "w") do |fh| fh.print string end
+ if file
+ File.open(file, "w") do |fh| fh.print string end
+ end
+ string
end
- string
-end
end # PepXML
##
# In the future, this guy should accept any version of bioworks params file
@@ -446,17 +466,21 @@
end
end
hash
end
+ ## parses file
+ ## and drops the .hdr behind indexed fasta files
def parse(file)
File.open(file) do |fh|
sequest_line = fh.gets #[SEQUEST]
@opts = grab_params(fh)
@opts["search_engine"] = "SEQUEST"
@mods = grab_params(fh)
end
+ ## this gets rid of the .hdr postfix on indexed databases
+ @opts["first_database_name"] = @opts["first_database_name"].sub(/\.hdr$/, '')
end
# returns( split_after, except_before)
def enzyme_specificity
if version == "3.2"
@@ -492,12 +516,14 @@
pseq
end
# Returns xml in the form <parameter name="#{method_name}"
# value="#{method_value}"/> for list of symbols
- def pepxml_parameters(*sym_list)
- params_xml(*sym_list)
+ def pepxml_parameters
+ keys_as_symbols = @opts.sort.map do |k,v| k.to_s end
+ params_xml(*keys_as_symbols)
+ # (:peptide_mass_tol, :peptide_mass_units, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter)
end
def precursor_mass_type
case @opts['mass_type_parent']
when '0' ; "average"
@@ -578,10 +604,11 @@
if @opts["peptide_mass_units"] != "0"
puts "WARNING: peptide_mass_tol units need to be adjusted!"
end
@opts["peptide_mass_tolerance"]
end
+
def fragment_ion_tol
@opts["fragment_ion_tolerance"]
end
def max_num_differential_AA_per_mod
@opts["max_num_differential_per_peptide"]
@@ -640,11 +667,11 @@
def to_pepxml
element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
search_database.to_pepxml +
short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) +
- params_xml(:peptide_mass_tol, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter)
+ @params.pepxml_parameters
end
end
end
@@ -713,103 +740,140 @@
end
end
# Returns the precursor_neutral based on the scans and an array indexed by
# scan numbers. first and last scan and charge should be integers.
- # This is the precursor_mz - SpecID::AMU_HYDROGEN!
- def self.calc_precursor_neutral_mass(first_scan, last_scan, prec_mz_arr, charge)
- mz = nil
- if first_scan != last_scan
- sum = 0.0
- tot_num = 0
- (first_scan..last_scan).each do |scan|
- val = prec_mz_arr[scan]
- if val # if the scan is not an mslevel 2
- sum += val.to_f
- tot_num += 1
+ # This is the precursor_mz - h_plus!
+ # by=:prec_mz_arr|:deltamass
+ # if prec_mz_arr then the following arguments must be supplied:
+ # :first_scan = int, :last_scan = int, :prec_mz_arr = array with the precursor
+ # m/z for each product scan, :charge = int
+ # if deltamass then the following arguments must be supplied:
+ # m_plus_h = float, deltamass = float
+ # For both flavors, a final additional argument 'average_weights'
+ # can be used. If true (default), average weights will be used, if false,
+ # monoisotopic weights (currently this is simply the mass of the proton)
+ def self.calc_precursor_neutral_mass(by, *args)
+ average_weights = true
+ case by
+ when :prec_mz_arr
+ (first_scan, last_scan, prec_mz_arr, charge, average_weights) = args
+ when :deltamass
+ (m_plus_h, deltamass, average_weights) = args
+ end
+
+ if average_weights
+ mass_h_plus = SpecID::AVG[:h_plus]
+ else
+ mass_h_plus = SpecID::MONO[:h_plus]
+ end
+
+ case by
+ when :prec_mz_arr
+ mz = nil
+ if first_scan != last_scan
+ sum = 0.0
+ tot_num = 0
+ (first_scan..last_scan).each do |scan|
+ val = prec_mz_arr[scan]
+ if val # if the scan is not an mslevel 2
+ sum += val.to_f
+ tot_num += 1
+ end
end
+ mz = sum/tot_num.to_f
+ else
+ mz = prec_mz_arr[first_scan].to_f
end
- mz = sum/tot_num.to_f
+ charge * (mz - mass_h_plus)
+ when :deltamass
+ m_plus_h - mass_h_plus + deltamass
else
- mz = prec_mz_arr[first_scan].to_f
+ abort "don't recognize 'by' in calc_precursor_neutral_mass: #{by}"
end
- return (mz*charge) - (SpecID::AMU_HYDROGEN*charge)
end
end
+# This object inherits from Array. As such, it is very memory efficient
+# (compared to a normal object). However, certain operations when used on
+# these objects will produce undesirable results: An array of these objects
+# will be flattened (becoming a long list of attributes) when 'flatten' is
+# called on them, which is not the behavior we want! other odd behavior is
+# possible. Possible fixes are to use a delegate class or redefine the way
+# this responds to flatten (so that it won't flatten).
class SpecID::Sequest::PepXML::SearchHit < Array
include SpecIDXML
#attr_accessor 0:hit_rank, 1:peptide, 2:peptide_prev_aa, 3:peptide_next_aa, 4:protein, 5:num_tot_proteins, 6:num_matched_ions, 7:tot_num_ions, 8:calc_neutral_pep_mass, 9:massdiff, 10:num_tol_term, 11:num_missed_cleavages, 12:is_rejected
#attr_accessor 13:deltacnstar
#attr_accessor 14:xcorr, 15:deltacn, 16:spscore, 17:sprank
ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
ind_keys = {:hit_rank => 0, :peptide => 1, :peptide_prev_aa => 2, :peptide_next_aa => 3, :protein => 4, :num_tot_proteins => 5, :num_matched_ions => 6, :tot_num_ions => 7, :calc_neutral_pep_mass => 8, :massdiff => 9, :num_tol_term => 10, :num_missed_cleavages => 11, :is_rejected => 12, :deltacnstar => 13, :xcorr => 14, :deltacn => 15, :spscore => 16, :sprank => 17}
@@methods = ind_keys.keys
def hit_rank ; self[0] end ; def hit_rank=(oth) ; self[0] = oth end
- def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
- def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
- def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
- def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
- def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
- def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
- def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
- def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
- def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
- def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
- def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
- def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
- def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
- def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
- def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
- def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
- def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
+def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
+def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
+def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
+def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
+def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
+def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
+def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
+def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
+def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
+def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
+def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
+def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
+def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
+def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
+def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
+def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
+def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
- @@arr_size = ind_keys.size
- ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
- ind_keys.merge!(ind_keys_w_eq)
- ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
+@@arr_size = ind_keys.size
+ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
+ind_keys.merge!(ind_keys_w_eq)
+ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
- # These are all search_score elements:
+# These are all search_score elements:
- # 1 if there is no second ranked hit, 0 otherwise
+# 1 if there is no second ranked hit, 0 otherwise
- def initialize(hash=nil)
- super(@@arr_size)
- self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
- self
- #if hash ; set_from_hash(hash) end
- end
+def initialize(hash=nil)
+ super(@@arr_size)
+ self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
+ self
+ #if hash ; set_from_hash(hash) end
+end
- # Returns prev, peptide, next from sequence. Parse errors return
- # nil,nil,nil
- # R.PEPTIDE.A # -> R, PEPTIDE, A
- # R.PEPTIDE.- # -> R, PEPTIDE, -
- # PEPTIDE.A # -> -, PEPTIDE, A
- # A.PEPTIDE # -> A, PEPTIDE, -
- # PEPTIDE # -> nil,nil,nil
- def self.split_sequence(val)
- peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
- pieces = val.split(".")
- case pieces.size
- when 3
- peptide_prev_aa, peptide, peptide_next_aa = *pieces
- when 2
- if pieces[0].size > 1 ## N termini
- peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
- else ## C termini
- peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
- end
- when 1 ## this must be a parse error!
- peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
- when 0
- peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
+# Returns prev, peptide, next from sequence. Parse errors return
+# nil,nil,nil
+# R.PEPTIDE.A # -> R, PEPTIDE, A
+# R.PEPTIDE.- # -> R, PEPTIDE, -
+# PEPTIDE.A # -> -, PEPTIDE, A
+# A.PEPTIDE # -> A, PEPTIDE, -
+# PEPTIDE # -> nil,nil,nil
+def self.split_sequence(val)
+ peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
+ pieces = val.split(".")
+ case pieces.size
+ when 3
+ peptide_prev_aa, peptide, peptide_next_aa = *pieces
+ when 2
+ if pieces[0].size > 1 ## N termini
+ peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
+ else ## C termini
+ peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
end
- return peptide_prev_aa, peptide, peptide_next_aa
+ when 1 ## this must be a parse error!
+ peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
+ when 0
+ peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
end
+ return peptide_prev_aa, peptide, peptide_next_aa
+end
- def inspect
+def inspect
"#<SearchHit #{@@methods.map do |m| "#{m}:#{self.send(m)}" end.join(" ")}>"
end
# requires Params object and full sequence (with heads and tails)
def self.calc_num_missed_cleavages(params, sequence)