sequest.rb in mspire-0.1.5

- old
+ new

@@ -3,10 +3,11 @@
 require 'spec/mzxml/parser'
 require 'hash_by'
 require 'set_from_hash'
 require 'spec_id/bioworks'
 require 'instance_var_set_from_hash'
+require 'spec/msrun'
 
 module SpecID::Sequest; end
 class SpecID::Sequest::PepXML; end
 
 class SpecID::Sequest::PepXML::MSMSPipelineAnalysis
@@ -60,15 +61,15 @@
 
   def to_pepxml
     case SpecID::Sequest::PepXML.pepxml_version
     when 0
       element_xml(:msms_pipeline_analysis, [:date, :summary_xml]) do
-      @msms_run_summary.to_pepxml
+        @msms_run_summary.to_pepxml
       end
     when 18
       element_xml_and_att_string(:msms_pipeline_analysis, "date=\"#{date}\" xmlns=\"#{xmlns}\" xmlns:xsi=\"#{xmlns_xsi}\" xsi:schemaLocation=\"#{xsi_schema_location}\" summary_xml=\"#{summary_xml}\"") do
-      @msms_run_summary.to_pepxml
+        @msms_run_summary.to_pepxml
       end
     else
       abort "Don't know how to deal with version: #{SpecID::Sequest::PepXML.pepxml_version}"
     end
   end
@@ -231,46 +232,65 @@
 
     ## TURN THIS ON IF YOU THINK YOU MIGHT NOT BE GETTING PEPTIDES from
     ## bioworks
     #bioworks.peps.each { |pep| if pep.class != SpecID::Bioworks::Pep ; puts "trying to pass as pep: "; p pep; abort "NOT a pep!" end }
 
-    ## DOING THIS BY INDIVIDUAL FILE for when multiple files are in the path!
-    ## Get hash by filename containing arrays of precursor_mz indexed by scan num
-    #pre_mz_arr_hash = Spec::MzXML::Parser.new.precursor_mz_by_scan_for_path(msdata, /(\.mzXML)|(\.timeIndex)|(\.[Rr][Aa][Ww])$/)
 
     ## Start
     split_bio_objs = []
 
     ## Create a hash by pep object containing num_tot_proteins
     ## This is only valid if all hits are present (no previous thresholding)
     self._prot_num_and_first_prot_by_pep(bioworks.peps)
     ## (num_prots_by_pep, prot_by_pep) = 
     #num_prots_by_pep.each do |k,v| puts "k: #{k} v: #{v}\n"; break end ; prot_by_pep.each do |k,v| puts "k: #{k} v: #{v}" ; break end ; abort "HERE"
-    mzxml_parser = Spec::MzXML::Parser.new
 
     ## Create a hash of spectrum_query arrays by filename (this very big block):
     spectrum_queries_by_base_name = {}
     # Hash by the filenames to split into filenames:
     bioworks.peps.hash_by(:base_name).each do |base_name, pep_arr|
 
+      prec_mz_arr = nil
+      case x = bioworks.version
+      when /3.2/ 
+        calc_prec_by = :prec_mz_arr
         # get the precursor_mz array for this filename
         inner__full_base_name_no_ext = File.join(msdata, base_name)
-        pre_mz_arr = mzxml_parser.precursor_mz_by_scan(inner__full_base_name_no_ext)
+        prec_mz_arr = Spec::MSRun.precursor_mz_by_scan(inner__full_base_name_no_ext)
+      when /3.3/
+        calc_prec_by = :deltamass
+      else
+        abort "invalid BioworksBrowser version: #{x}"
+      end
 
       spectrum_queries = pep_arr.hash_by(:first_scan, :last_scan, :charge).collect do |key,arr|
 
 
         # Sort_by_rank and take the top hit (to mimick out2summary):
         arr = arr.sort_by {|pep| pep.xcorr.to_f } # ascending
         top_pep = arr.pop
         second_hit = arr.last # needed for deltacnstar
+
+        case params.precursor_mass_type
+        when "monoisotopic" ; avg_parent = false
+        else ; avg_parent = true
+        end
+
+        case avg_parent
+        when true ; h_plus = SpecID::AVG[:h_plus]
+        when false ; h_plus = SpecID::MONO[:h_plus]
+        end
+
         
+        case calc_prec_by
+        when :prec_mz_arr
+          precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.first_scan.to_i, top_pep.last_scan.to_i, prec_mz_arr, top_pep.charge.to_i, avg_parent)
+        when :deltamass
+          precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(calc_prec_by, top_pep.mass.to_f, top_pep.deltamass.to_f, avg_parent)
+        end
 
-        ## Calculate values with interdependency:
-        # mass values:
-        precursor_neutral_mass = SpecID::Sequest::PepXML::SpectrumQuery.calc_precursor_neutral_mass(top_pep.first_scan.to_i, top_pep.last_scan.to_i, pre_mz_arr, top_pep.charge.to_i)
-        calc_neutral_pep_mass = (top_pep.mass.to_f - SpecID::AMU_HYDROGEN)
+        calc_neutral_pep_mass = (top_pep.mass.to_f - h_plus)
         massdiff = precursor_neutral_mass - calc_neutral_pep_mass
         if massdiff >= 0 ; massdiff = "+" + massdiff.to_s
         else ; massdiff = massdiff.to_s end
         # deltacn & star:
         # (NOTE: OLD?? out2summary wants the deltacn of the 2nd best hit.)
@@ -369,47 +389,47 @@
         #          :sample_enzyme => params.enzyme,
         #        })
       end
     end # collects the pepxml objects
 
-end
+  end
 
-def summary_xml
-  base_name + ".xml"
-end
+  def summary_xml
+    base_name + ".xml"
+  end
 
-def precursor_mass_type
-  @params.precursor_mass_type
-end
+  def precursor_mass_type
+    @params.precursor_mass_type
+  end
 
-def fragment_mass_type
-  @params.fragment_mass_type
-end
+  def fragment_mass_type
+    @params.fragment_mass_type
+  end
 
-# combines filename in a manner consistent with the path
-def self.make_base_name(path, filename)
-  sep = "/"
-  if path.split("/").size < path.split("\\").size
-    sep = "\\"
+  # combines filename in a manner consistent with the path
+  def self.make_base_name(path, filename)
+    sep = "/"
+    if path.split("/").size < path.split("\\").size
+      sep = "\\"
+    end
+    if path.split("").last == sep
+      return path + File.basename(filename)
+    else
+      return path + sep + File.basename(filename)
+    end
   end
-  if path.split("").last == sep
-    return path + File.basename(filename)
-  else
-    return path + sep + File.basename(filename)
-  end
-end
 
-# outputs pepxml, (to file if given)
-def to_pepxml(file=nil)
-  string = header
-  string << @msms_pipeline_analysis.to_pepxml
+  # outputs pepxml, (to file if given)
+  def to_pepxml(file=nil)
+    string = header
+    string << @msms_pipeline_analysis.to_pepxml
 
-  if file
-    File.open(file, "w") do |fh| fh.print string end
+    if file
+      File.open(file, "w") do |fh| fh.print string end
+    end
+    string
   end
-  string
-end
 
 end # PepXML
 
 ##
 # In the future, this guy should accept any version of bioworks params file
@@ -446,17 +466,21 @@
       end
     end
     hash
   end
 
+  ## parses file
+  ## and drops the .hdr behind indexed fasta files
   def parse(file)
     File.open(file) do |fh|
       sequest_line = fh.gets #[SEQUEST]
       @opts = grab_params(fh)
       @opts["search_engine"] = "SEQUEST"
       @mods = grab_params(fh)
     end
+    ## this gets rid of the .hdr postfix on indexed databases
+    @opts["first_database_name"] = @opts["first_database_name"].sub(/\.hdr$/, '')
   end
 
   # returns( split_after, except_before)
   def enzyme_specificity
     if version == "3.2"
@@ -492,12 +516,14 @@
     pseq
   end
 
   # Returns xml in the form <parameter name="#{method_name}"
   # value="#{method_value}"/> for list of symbols
-  def pepxml_parameters(*sym_list)
-    params_xml(*sym_list)
+  def pepxml_parameters
+    keys_as_symbols = @opts.sort.map do |k,v| k.to_s end
+    params_xml(*keys_as_symbols)
+    # (:peptide_mass_tol, :peptide_mass_units, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter)
   end
 
   def precursor_mass_type
     case @opts['mass_type_parent']
     when '0' ; "average" 
@@ -578,10 +604,11 @@
     if @opts["peptide_mass_units"] != "0"
       puts "WARNING: peptide_mass_tol units need to be adjusted!"
     end
     @opts["peptide_mass_tolerance"]
   end
+
   def fragment_ion_tol
     @opts["fragment_ion_tolerance"]
   end
   def max_num_differential_AA_per_mod
     @opts["max_num_differential_per_peptide"]
@@ -640,11 +667,11 @@
 
   def to_pepxml
     element_xml(:search_summary, [:base_name, :search_engine, :precursor_mass_type, :fragment_mass_type, :out_data_type, :out_data, :search_id]) do
       search_database.to_pepxml +
         short_element_xml(:enzymatic_search_constraint, [:enzyme, :max_num_internal_cleavages, :min_number_termini]) +
-        params_xml(:peptide_mass_tol, :fragment_ion_tol, :ion_series, :max_num_differential_AA_per_mod, :nucleotide_reading_frame, :num_output_lines, :remove_precursor_peak, :ion_cutoff_percentage, :match_peak_count, :match_peak_allowed_error, :match_peak_tolerance, :protein_mass_filter, :sequence_header_filter)
+        @params.pepxml_parameters
     end
   end
 
 end
 
@@ -713,103 +740,140 @@
     end
   end
 
   # Returns the precursor_neutral based on the scans and an array indexed by
   # scan numbers.  first and last scan and charge should be integers.
-  # This is the precursor_mz - SpecID::AMU_HYDROGEN!
-  def self.calc_precursor_neutral_mass(first_scan, last_scan, prec_mz_arr, charge)
-    mz = nil
-    if first_scan != last_scan
-      sum = 0.0
-      tot_num = 0
-      (first_scan..last_scan).each do |scan|
-        val = prec_mz_arr[scan]
-        if val  # if the scan is not an mslevel 2
-          sum += val.to_f
-          tot_num += 1
+  # This is the precursor_mz - h_plus!
+  # by=:prec_mz_arr|:deltamass
+  # if prec_mz_arr then the following arguments must be supplied:
+  # :first_scan = int, :last_scan = int, :prec_mz_arr = array with the precursor
+  # m/z for each product scan, :charge = int
+  # if deltamass then the following arguments must be supplied:
+  # m_plus_h = float, deltamass = float
+  # For both flavors, a final additional argument 'average_weights'
+  # can be used.  If true (default), average weights will be used, if false, 
+  # monoisotopic weights (currently this is simply the mass of the proton)
+  def self.calc_precursor_neutral_mass(by, *args)
+    average_weights = true
+    case by
+    when :prec_mz_arr
+      (first_scan, last_scan, prec_mz_arr, charge, average_weights) = args
+    when :deltamass
+      (m_plus_h, deltamass, average_weights) = args
+    end
+
+    if average_weights 
+      mass_h_plus = SpecID::AVG[:h_plus] 
+    else
+      mass_h_plus = SpecID::MONO[:h_plus] 
+    end
+
+    case by
+    when :prec_mz_arr
+      mz = nil
+      if first_scan != last_scan
+        sum = 0.0
+        tot_num = 0
+        (first_scan..last_scan).each do |scan|
+          val = prec_mz_arr[scan]
+          if val  # if the scan is not an mslevel 2
+            sum += val.to_f
+            tot_num += 1
+          end
         end
+        mz = sum/tot_num.to_f
+      else
+        mz = prec_mz_arr[first_scan].to_f
       end
-      mz = sum/tot_num.to_f
+      charge * (mz - mass_h_plus)
+    when :deltamass
+      m_plus_h - mass_h_plus + deltamass
     else
-      mz = prec_mz_arr[first_scan].to_f
+      abort "don't recognize 'by' in calc_precursor_neutral_mass: #{by}"
     end
-    return (mz*charge) - (SpecID::AMU_HYDROGEN*charge)
   end
 
 end
 
+# This object inherits from Array.  As such, it is very memory efficient
+# (compared to a normal object).  However, certain operations when used on
+# these objects will produce undesirable results: An array of these objects
+# will be flattened (becoming a long list of attributes) when 'flatten' is
+# called on them, which is not the behavior we want!  other odd behavior is
+# possible.  Possible fixes are to use a delegate class or redefine the way
+# this responds to flatten (so that it won't flatten).
 class SpecID::Sequest::PepXML::SearchHit < Array
   include SpecIDXML
   #attr_accessor 0:hit_rank, 1:peptide, 2:peptide_prev_aa, 3:peptide_next_aa, 4:protein, 5:num_tot_proteins, 6:num_matched_ions, 7:tot_num_ions, 8:calc_neutral_pep_mass, 9:massdiff, 10:num_tol_term, 11:num_missed_cleavages, 12:is_rejected
   #attr_accessor 13:deltacnstar
   #attr_accessor 14:xcorr, 15:deltacn, 16:spscore, 17:sprank
   ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
 
   ind_keys = {:hit_rank => 0, :peptide => 1, :peptide_prev_aa => 2, :peptide_next_aa => 3, :protein => 4, :num_tot_proteins => 5, :num_matched_ions => 6, :tot_num_ions => 7, :calc_neutral_pep_mass => 8, :massdiff => 9, :num_tol_term => 10, :num_missed_cleavages => 11, :is_rejected => 12, :deltacnstar  => 13, :xcorr => 14, :deltacn => 15, :spscore => 16, :sprank => 17}
   @@methods = ind_keys.keys
   def hit_rank ; self[0] end ; def hit_rank=(oth) ; self[0] = oth end
-  def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
-  def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
-  def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
-  def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
-  def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
-  def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
-  def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
-  def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
-  def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
-  def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
-  def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
-  def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
-  def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
-  def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
-  def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
-  def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
-  def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
+def peptide ; self[1] end ; def peptide=(oth) ; self[1] = oth end
+def peptide_prev_aa ; self[2] end ; def peptide_prev_aa=(oth) ; self[2] = oth end
+def peptide_next_aa ; self[3] end ; def peptide_next_aa=(oth) ; self[3] = oth end
+def protein ; self[4] end ; def protein=(oth) ; self[4] = oth end
+def num_tot_proteins ; self[5] end ; def num_tot_proteins=(oth) ; self[5] = oth end
+def num_matched_ions ; self[6] end ; def num_matched_ions=(oth) ; self[6] = oth end
+def tot_num_ions ; self[7] end ; def tot_num_ions=(oth) ; self[7] = oth end
+def calc_neutral_pep_mass ; self[8] end ; def calc_neutral_pep_mass=(oth) ; self[8] = oth end
+def massdiff ; self[9] end ; def massdiff=(oth) ; self[9] = oth end
+def num_tol_term ; self[10] end ; def num_tol_term=(oth) ; self[10] = oth end
+def num_missed_cleavages ; self[11] end ; def num_missed_cleavages=(oth) ; self[11] = oth end
+def is_rejected ; self[12] end ; def is_rejected=(oth) ; self[12] = oth end
+def deltacnstar ; self[13] end ; def deltacnstar=(oth) ; self[13] = oth end
+def xcorr ; self[14] end ; def xcorr=(oth) ; self[14] = oth end
+def deltacn ; self[15] end ; def deltacn=(oth) ; self[15] = oth end
+def spscore ; self[16] end ; def spscore=(oth) ; self[16] = oth end
+def sprank ; self[17] end ; def sprank=(oth) ; self[17] = oth end
 
-  @@arr_size = ind_keys.size
-  ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
-  ind_keys.merge!(ind_keys_w_eq)
-  ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
+@@arr_size = ind_keys.size
+ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
+ind_keys.merge!(ind_keys_w_eq)
+ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
 
-  # These are all search_score elements:
+# These are all search_score elements:
 
-  # 1 if there is no second ranked hit, 0 otherwise
+# 1 if there is no second ranked hit, 0 otherwise
 
-  def initialize(hash=nil)
-    super(@@arr_size)
-    self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
-    self
-    #if hash ; set_from_hash(hash) end
-  end
+def initialize(hash=nil)
+  super(@@arr_size)
+  self[0,18] = [hash[:hit_rank], hash[:peptide], hash[:peptide_prev_aa], hash[:peptide_next_aa], hash[:protein], hash[:num_tot_proteins], hash[:num_matched_ions], hash[:tot_num_ions], hash[:calc_neutral_pep_mass], hash[:massdiff], hash[:num_tol_term], hash[:num_missed_cleavages], hash[:is_rejected], hash[:deltacnstar], hash[:xcorr], hash[:deltacn], hash[:spscore], hash[:sprank]]
+  self
+  #if hash ; set_from_hash(hash) end
+end
 
-  # Returns prev, peptide, next from sequence.  Parse errors return
-  # nil,nil,nil
-  #   R.PEPTIDE.A  # -> R, PEPTIDE, A
-  #   R.PEPTIDE.-  # -> R, PEPTIDE, -
-  #   PEPTIDE.A    # -> -, PEPTIDE, A
-  #   A.PEPTIDE    # -> A, PEPTIDE, -
-  #   PEPTIDE      # -> nil,nil,nil
-  def self.split_sequence(val)
-    peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
-    pieces = val.split(".") 
-    case pieces.size
-    when 3
-      peptide_prev_aa, peptide, peptide_next_aa = *pieces
-    when 2
-      if pieces[0].size > 1  ## N termini
-        peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
-      else  ## C termini
-        peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
-      end
-    when 1  ## this must be a parse error!
-      peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
-    when 0
-      peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
+# Returns prev, peptide, next from sequence.  Parse errors return
+# nil,nil,nil
+#   R.PEPTIDE.A  # -> R, PEPTIDE, A
+#   R.PEPTIDE.-  # -> R, PEPTIDE, -
+#   PEPTIDE.A    # -> -, PEPTIDE, A
+#   A.PEPTIDE    # -> A, PEPTIDE, -
+#   PEPTIDE      # -> nil,nil,nil
+def self.split_sequence(val)
+  peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
+  pieces = val.split(".") 
+  case pieces.size
+  when 3
+    peptide_prev_aa, peptide, peptide_next_aa = *pieces
+  when 2
+    if pieces[0].size > 1  ## N termini
+      peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
+    else  ## C termini
+      peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
     end
-    return peptide_prev_aa, peptide, peptide_next_aa
+  when 1  ## this must be a parse error!
+    peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
+  when 0
+    peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
   end
+  return peptide_prev_aa, peptide, peptide_next_aa
+end
 
-  def inspect
+def inspect
     "#<SearchHit #{@@methods.map do |m| "#{m}:#{self.send(m)}" end.join(" ")}>"
   end
 
   # requires Params object and full sequence (with heads and tails)
   def self.calc_num_missed_cleavages(params, sequence)