lib/spec_id/sequest.rb in mspire-0.1.3 vs lib/spec_id/sequest.rb in mspire-0.1.5

- old
+ new

@@ -3,10 +3,11 @@ require 'spec/mzxml/parser' require 'hash_by' require 'set_from_hash' require 'spec_id/bioworks' require 'instance_var_set_from_hash' +require 'spec/msrun' module SpecID::Sequest; end class SpecID::Sequest::PepXML; end class SpecID::Sequest::PepXML::MSMSPipelineAnalysis @@ -60,15 +61,15 @@ def to_pepxml case SpecID::Sequest::PepXML.pepxml_version when 0 element_xml(:msms_pipeline_analysis, [:date, :summary_xml]) do - @msms_run_summary.to_pepxml + @msms_run_summary.to_pepxml end when 18 element_xml_and_att_string(:msms_pipeline_analysis, "date=\"#{date}\" xmlns=\"#{xmlns}\" xmlns:xsi=\"#{xmlns_xsi}\" xsi:schemaLocation=\"#{xsi_schema_location}\" summary_xml=\"#{summary_xml}\"") do - @msms_run_summary.to_pepxml + @msms_run_summary.to_pepxml end else abort "Don't know how to deal with version: #{SpecID::Sequest::PepXML.pepxml_version}" end end @@ -231,46 +232,65 @@ ## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from ## bioworks #bioworks.peps.each { |pep| if pep.class != SpecID::Bioworks::Pep ; puts "trying to pass as pep: "; p pep; abort "NOT a pep!" end } - ## DOING THIS BY INDIVIDUAL FILE for when multiple files are in the path! - ## Get hash by filename containing arrays of precursor_mz indexed by scan num - #pre_mz_arr_hash = Spec::MzXML::Parser.new.precursor_mz_by_scan_for_path(msdata, /(\.mzXML)|(\.timeIndex)|(\.[Rr][Aa][Ww])$/) ## Start split_bio_objs = [] ## Create a hash by pep object containing num_tot_proteins ## This is only valid if all hits are present (no previous thresholding) self._prot_num_and_first_prot_by_pep(bioworks.peps) ## (num_prots_by_pep, prot_by_pep) = #num_prots_by_pep.each do |k,v| puts "k: #{k} v: #{v}\n"; break end ; prot_by_pep.each do |k,v| puts "k: #{k} v: #{v}" ; break end ; abort "HERE" - mzxml_parser = Spec::MzXML::Parser.new ## Create a hash of spectrum_query arrays by filename (this very big block): spectrum_queries_by_base_name = {} # Hash by the filenames to split into filenames: bioworks.peps.hash_by(:base_name).each do |base_name, pep_arr| + prec_mz_arr = nil + case x = bioworks.version + when /3.2/ + calc_prec_by = :prec_mz_arr # get the precursor_mz array for this filename inner__full_base_name_no_ext = File.join(msdata, base_name) - pre_mz_arr = mzxml_parser.precursor_mz_by_scan(inner__full_base_name_no_ext) + prec_mz_arr = Spec::MSRun.precursor_mz_by_scan(inner__full_base_name_no_ext) + when /3.3/ + calc_prec_by = :deltamass + else + abort "invalid BioworksBrowser version: #{x}" + end spectrum_queries = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr| # Sort_by_rank and take the top hit (to mimick out2summary): arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending top_pep = arr.pop second_hit = arr.last # needed for deltacnstar + + case params.precursor_mass_type + when "monoisotopic" ; avg_parent = false + else ; avg_parent = true + end + + case avg_parent + when true ; h_plus = SpecID::AVG[:h_plus] + when false ; h_plus = SpecID::MONO[:h_plus] + end + + case calc_prec_by + when :prec_mz_arr + precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, avg_parent) + when :deltamass + precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, avg_parent) + end - ## Calculate values with interdependency: - # mass values: - precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(top_pep.first_scan.to_i, top_pep.last_scan.to_i, pre_mz_arr, top_pep.charge.to_i) - calc_neutral_pep_mass = (top_pep.mass.to_f - SpecID::AMU_HYDROGEN) + calc_neutral_pep_mass = (top_pep.mass.to_f - h_plus) massdiff = precursor_neutral_mass - calc_neutral_pep_mass if massdiff >= 0 ; massdiff = "+" + massdiff.to_s else ; massdiff = massdiff.to_s end # deltacn & star: # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.) @@ -369,47 +389,47 @@ # :sample_enzyme => params.enzyme, # }) end end # collects the pepxml objects -end + end -def summary_xml - base_name + ".xml" -end + def summary_xml + base_name + ".xml" + end -def precursor_mass_type - @params.precursor_mass_type -end + def precursor_mass_type + @params.precursor_mass_type + end -def fragment_mass_type - @params.fragment_mass_type -end + def fragment_mass_type + @params.fragment_mass_type + end -# combines filename in a manner consistent with the path -def self.make_base_name(path, filename) - sep = "/" - if path.split("/").size < path.split("\\").size - sep = "\\" + # combines filename in a manner consistent with the path + def self.make_base_name(path, filename) + sep = "/" + if path.split("/").size < path.split("\\").size + sep = "\\" + end + if path.split("").last == sep + return path + File.basename(filename) + else + return path + sep + File.basename(filename) + end end - if path.split("").last == sep - return path + File.basename(filename) - else - return path + sep + File.basename(filename) - end -end -# outputs pepxml, (to file if given) -def to_pepxml(file=nil) - string = header - string << @msms_pipeline_analysis.to_pepxml + # outputs pepxml, (to file if given) + def to_pepxml(file=nil) + string = header + string << @msms_pipeline_analysis.to_pepxml - if file - File.open(file, "w") do |fh| fh.print string end + if file + File.open(file, "w") do |fh| fh.print string end + end + string end - string -end end # PepXML ## # In the future, this guy should accept any version of bioworks params file @@ -446,17 +466,21 @@ end end hash end + ## parses file + ## and drops the .hdr behind indexed fasta files def parse(file) File.open(file) do |fh| sequest_line = fh.gets #[SEQUEST] @opts = grab_params(fh) @opts["search_engine"] = "SEQUEST" @mods = grab_params(fh) end + ## this gets rid of the .hdr postfix on indexed databases + @opts["first_database_name"] = @opts["first_database_name"].sub(/\.hdr$/, '') end # returns( split_after, except_before) def enzyme_specificity if version == "3.2" @@ -492,12 +516,14 @@ pseq end # Returns xml in the form <parameter name="#{method_name}" # value="#{method_value}"/> for list of symbols - def pepxml_parameters(*sym_list) - params_xml(*sym_list) + def pepxml_parameters + keys_as_symbols = @opts.sort.map do |k,v| k.to_s end + params_xml(*keys_as_symbols) + # (:peptide_mass_tol, :peptide_mass_units, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter) end def precursor_mass_type case @opts['mass_type_parent'] when '0' ; "average" @@ -578,10 +604,11 @@ if @opts["peptide_mass_units"] != "0" puts "WARNING: peptide_mass_tol units need to be adjusted!" end @opts["peptide_mass_tolerance"] end + def fragment_ion_tol @opts["fragment_ion_tolerance"] end def max_num_differential_AA_per_mod @opts["max_num_differential_per_peptide"] @@ -640,11 +667,11 @@ def to_pepxml element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do search_database.to_pepxml + short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) + - params_xml(:peptide_mass_tol, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter) + @params.pepxml_parameters end end end @@ -713,103 +740,140 @@ end end # Returns the precursor_neutral based on the scans and an array indexed by # scan numbers. first and last scan and charge should be integers. - # This is the precursor_mz - SpecID::AMU_HYDROGEN! - def self.calc_precursor_neutral_mass(first_scan, last_scan, prec_mz_arr, charge) - mz = nil - if first_scan != last_scan - sum = 0.0 - tot_num = 0 - (first_scan..last_scan).each do |scan| - val = prec_mz_arr[scan] - if val # if the scan is not an mslevel 2 - sum += val.to_f - tot_num += 1 + # This is the precursor_mz - h_plus! + # by=:prec_mz_arr|:deltamass + # if prec_mz_arr then the following arguments must be supplied: + # :first_scan = int, :last_scan = int, :prec_mz_arr = array with the precursor + # m/z for each product scan, :charge = int + # if deltamass then the following arguments must be supplied: + # m_plus_h = float, deltamass = float + # For both flavors, a final additional argument 'average_weights' + # can be used. If true (default), average weights will be used, if false, + # monoisotopic weights (currently this is simply the mass of the proton) + def self.calc_precursor_neutral_mass(by, *args) + average_weights = true + case by + when :prec_mz_arr + (first_scan, last_scan, prec_mz_arr, charge, average_weights) = args + when :deltamass + (m_plus_h, deltamass, average_weights) = args + end + + if average_weights + mass_h_plus = SpecID::AVG[:h_plus] + else + mass_h_plus = SpecID::MONO[:h_plus] + end + + case by + when :prec_mz_arr + mz = nil + if first_scan != last_scan + sum = 0.0 + tot_num = 0 + (first_scan..last_scan).each do |scan| + val = prec_mz_arr[scan] + if val # if the scan is not an mslevel 2 + sum += val.to_f + tot_num += 1 + end end + mz = sum/tot_num.to_f + else + mz = prec_mz_arr[first_scan].to_f end - mz = sum/tot_num.to_f + charge * (mz - mass_h_plus) + when :deltamass + m_plus_h - mass_h_plus + deltamass else - mz = prec_mz_arr[first_scan].to_f + abort "don't recognize 'by' in calc_precursor_neutral_mass: #{by}" end - return (mz*charge) - (SpecID::AMU_HYDROGEN*charge) end end +# This object inherits from Array. As such, it is very memory efficient +# (compared to a normal object). However, certain operations when used on +# these objects will produce undesirable results: An array of these objects +# will be flattened (becoming a long list of attributes) when 'flatten' is +# called on them, which is not the behavior we want! other odd behavior is +# possible. Possible fixes are to use a delegate class or redefine the way +# this responds to flatten (so that it won't flatten). class SpecID::Sequest::PepXML::SearchHit < Array include SpecIDXML #attr_accessor 0:hit_rank, 1:peptide, 2:peptide_prev_aa, 3:peptide_next_aa, 4:protein, 5:num_tot_proteins, 6:num_matched_ions, 7:tot_num_ions, 8:calc_neutral_pep_mass, 9:massdiff, 10:num_tol_term, 11:num_missed_cleavages, 12:is_rejected #attr_accessor 13:deltacnstar #attr_accessor 14:xcorr, 15:deltacn, 16:spscore, 17:sprank ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {} ind_keys = {:hit_rank => 0, :peptide => 1, :peptide_prev_aa => 2, :peptide_next_aa => 3, :protein => 4, :num_tot_proteins => 5, :num_matched_ions => 6, :tot_num_ions => 7, :calc_neutral_pep_mass => 8, :massdiff => 9, :num_tol_term => 10, :num_missed_cleavages => 11, :is_rejected => 12, :deltacnstar => 13, :xcorr => 14, :deltacn => 15, :spscore => 16, :sprank => 17} @@methods = ind_keys.keys def hit_rank ; self[0] end ; def hit_rank=(oth) ; self[0] = oth end - def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end - def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end - def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end - def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end - def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end - def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end - def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end - def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end - def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end - def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end - def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end - def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end - def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end - def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end - def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end - def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end - def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end +def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end +def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end +def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end +def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end +def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end +def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end +def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end +def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end +def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end +def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end +def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end +def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end +def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end +def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end +def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end +def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end +def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end - @@arr_size = ind_keys.size - ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v } - ind_keys.merge!(ind_keys_w_eq) - ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v} +@@arr_size = ind_keys.size +ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v } +ind_keys.merge!(ind_keys_w_eq) +ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v} - # These are all search_score elements: +# These are all search_score elements: - # 1 if there is no second ranked hit, 0 otherwise +# 1 if there is no second ranked hit, 0 otherwise - def initialize(hash=nil) - super(@@arr_size) - self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]] - self - #if hash ; set_from_hash(hash) end - end +def initialize(hash=nil) + super(@@arr_size) + self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]] + self + #if hash ; set_from_hash(hash) end +end - # Returns prev, peptide, next from sequence. Parse errors return - # nil,nil,nil - # R.PEPTIDE.A # -> R, PEPTIDE, A - # R.PEPTIDE.- # -> R, PEPTIDE, - - # PEPTIDE.A # -> -, PEPTIDE, A - # A.PEPTIDE # -> A, PEPTIDE, - - # PEPTIDE # -> nil,nil,nil - def self.split_sequence(val) - peptide_prev_aa = ""; peptide = ""; peptide_next_aa = "" - pieces = val.split(".") - case pieces.size - when 3 - peptide_prev_aa, peptide, peptide_next_aa = *pieces - when 2 - if pieces[0].size > 1 ## N termini - peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1] - else ## C termini - peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-' - end - when 1 ## this must be a parse error! - peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil - when 0 - peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil +# Returns prev, peptide, next from sequence. Parse errors return +# nil,nil,nil +# R.PEPTIDE.A # -> R, PEPTIDE, A +# R.PEPTIDE.- # -> R, PEPTIDE, - +# PEPTIDE.A # -> -, PEPTIDE, A +# A.PEPTIDE # -> A, PEPTIDE, - +# PEPTIDE # -> nil,nil,nil +def self.split_sequence(val) + peptide_prev_aa = ""; peptide = ""; peptide_next_aa = "" + pieces = val.split(".") + case pieces.size + when 3 + peptide_prev_aa, peptide, peptide_next_aa = *pieces + when 2 + if pieces[0].size > 1 ## N termini + peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1] + else ## C termini + peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-' end - return peptide_prev_aa, peptide, peptide_next_aa + when 1 ## this must be a parse error! + peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil + when 0 + peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil end + return peptide_prev_aa, peptide, peptide_next_aa +end - def inspect +def inspect "#<SearchHit #{@@methods.map do |m| "#{m}:#{self.send(m)}" end.join(" ")}>" end # requires Params object and full sequence (with heads and tails) def self.calc_num_missed_cleavages(params, sequence)