bioworks.rb in mspire-0.2.0

- old
+ new

@@ -4,29 +4,32 @@
 require 'xmlparser'
 require 'spec_id'
 require 'zlib'
 require 'hash_by'
 require 'set_from_hash'
+require 'array_class'
 
 ## have to pre-declare some guys
-class SpecID; end
-class SpecID::Prot; end
+module SpecID; end
+module SpecID::Prot; end
+module SpecID::Pep; end
 module SpecIDXML; end
 
-class SpecID::Bioworks
+class Bioworks
+  include SpecID
+
   # Regular expressions
   @@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
   @@modifications_re = /<modifications>(.*)<\/modifications>/o
   @@protein_re = /<protein>/o
   @@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
   @@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
 
 
-  attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath
+  attr_accessor :peps, :prots, :version, :global_filename, :origfilename, :origfilepath
   # a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
   attr_accessor :modifications
-  attr_writer :peps
 
   def hi_prob_best ; false end
 
   # -> prints to file filename1.sqt, filename2.sqt
   # @TODO: sqt file output
@@ -125,65 +128,50 @@
   # returns (peptides, proteins) where peptides is the unique list of peps
   # and proteins is a parallel array of arrays of represented proteins
   # note that each pep will contain its original prot it belongs to, even
   # though the parallel protein actually represents the proteins it belongs
   # to.
+  # assumes that each peptide points to all its proteins in pep.prots
   def _uniq_peps_by_sequence_charge(peps)
     new_arr = []
     prot_arr = []
     index_accounted_for = []
     (0...peps.size).each do |i|
       next if index_accounted_for.include?(i)
       new_arr << peps[i]
-      prot_arr.push( [peps[i].prot] )
+      prot_arr.push( peps[i].prots )
       ((i+1)...peps.size).each do |j|
         pep1, pep2 = peps[i], peps[j]
         if pep1.sequence == pep2.sequence && pep1.charge == pep2.charge
-          prot_arr.last.push pep2.prot
+          prot_arr.last.push( *(pep2.prots) )
           index_accounted_for << j
         end
       end
     end
     return new_arr, prot_arr
   end
 
   def initialize(file=nil)
     @peps = nil
     if file
+      @filename = file
       parse_xml(file)
       #parse_xml_by_xmlparser(file)
     end
   end
 
   def parse_xml_by_xmlparser(file)
-    parser = SpecID::Bioworks::XMLParser.new
+    parser = Bioworks::XMLParser.new
     File.open(file) do |fh|
       #3.times do fh.gets end  ## TEMPFIX
       parser.parse(fh)
     end
     #puts "ETETWSST"
     #p parser.prots
     @prots = parser.prots
   end
 
-
-  # Returns the list of all peptide hits.  A given sequence/charge or scan
-  # may be redundant!
-  def peps
-    if @peps
-      return @peps
-    else
-      @peps = []
-      prots.each do |prot|
-        prot.peps.each do |pep|
-          @peps << pep
-        end
-      end
-      return @peps
-    end
-  end
-
   # This is highly specific to Bioworks 3.2 xml export.  In other words,
   # unless the newlines, etc. are duplicated, this parser will fail! Not
   # robust, but it is faster than xmlparser (which is based on the speedy
   # expat)
   def parse_xml(file)
@@ -198,25 +186,27 @@
     if @origfilename
       @global_filename = @origfilename.gsub(File.extname(@origfilename), "")
     end
     @version = get_regex_val(fh, @@bioworksinfo_re)
     @modifications = get_regex_val(fh, @@modifications_re)
-    @prots = get_prots(fh, self)
+    @prots, @peps = get_prots_from_xml_stream(fh)
     fh.close
   end
 
-  def get_prots(fh, bioworks)
+  ## returns proteins and peptides
+  def get_prots_from_xml_stream(fh)
+    uniq_pephit_hash = {}
     prots = []
     while line = fh.gets
       if line =~ @@protein_re
-        prot =  SpecID::Bioworks::Prot.new
+        prot =  Bioworks::Prot.new
         prot.bioworks = self
-        prot.set_from_xml_stream(fh, bioworks)
+        prot.set_from_xml_stream(fh, uniq_pephit_hash)
         prots << prot
       end
     end
-    prots
+    [prots, uniq_pephit_hash.values] 
   end
 
   # gets the regex and stops (and rewinds if it hits a protein)
   # if no regex is found, returns nil and rewinds the filehandle
   def get_regex_val(fh, regex)
@@ -244,11 +234,11 @@
 
 end
 
 # Implements fast parsing via XMLParser (wrapper around Expat)
 # It is actually slower (about %25 slower) than regular expression parsing
-class SpecID::Bioworks::XMLParser < XMLParser
+class Bioworks::XMLParser < XMLParser
   @@at = '@'
   attr_accessor :prots
 
   def initialize
     @current_obj = nil
@@ -260,22 +250,22 @@
 
   def startElement(name, attrs)
     case name
     when "peptide"
       curr_prot = @current_obj
-      if @current_obj.class == SpecID::Bioworks::Prot
+      if @current_obj.class == Bioworks::Prot
         @current_obj.set_from_xml_hash_xmlparser(@current_hash)
       else
         curr_prot = @current_obj.prot  ## unless previous was a peptide
       end
-      peptide = SpecID::Bioworks::Pep.new
+      peptide = Bioworks::Pep.new
       peptide.prot = curr_prot
       curr_prot.peps << peptide
       @current_obj = peptide
       @current_hash = {}
     when "protein"
-      @current_obj = SpecID::Bioworks::Prot.new
+      @current_obj = Bioworks::Prot.new
       @current_hash = {}
       @prots << @current_obj
     else
       @current_name = name  
     end
@@ -295,17 +285,18 @@
     @current_data = data
   end
 
 end
 
-module SpecID::Bioworks::XML
+module Bioworks::XML
   # The regular expression to grab attributes from the bioworks xml format
   @@att_re = /<([\w]+)>(.*)<\/[\w]+>/o
 end
 
-class SpecID::Bioworks::Prot < SpecID::Prot
-  include SpecID::Bioworks::XML
+class Bioworks::Prot
+  include SpecID::Prot
+  include Bioworks::XML
 
   @@end_prot_re = /<\/protein>/o
   @@pep_re = /<peptide>/o
   @@atts = %w(reference protein_probability consensus_score sf unified_score coverage pi weight accession peps) 
   attr_accessor :reference, :protein_probability, :consensus_score, :sf, :unified_score, :coverage, :pi, :weight, :accession, :peps, :bioworks, :peptide_hit_counts
@@ -321,19 +312,36 @@
     args.collect do |arg|
       send(arg)
     end
   end
 
-  def set_from_xml_stream(fh, bioworks)
+  def set_from_xml_stream(fh, uniq_pephit_hash)
     hash = {}
+    @peps = []
     while line = fh.gets
       if line =~ @@att_re
         hash[$1] = $2
       elsif line =~ @@pep_re
-        pep = SpecID::Bioworks::Pep.new.set_from_xml_stream(fh, self)
-        pep.prot = self
+        ## Could do a look ahead to grab the file and sequence to check
+        ## uniqueness to increase speed here.
+        pep = Bioworks::Pep.new.set_from_xml_stream(fh)
+        # normal search results files have a global filename
+        # while multi-consensus do not
+        pep[12] ||= bioworks.global_filename
+
+        ## figure out uniqueness 
+        ky = [pep.base_name, pep.first_scan, pep.charge, pep.sequence]
+        if uniq_pephit_hash.key? ky
+          pep = uniq_pephit_hash[ky]
+        else
+          ## insert the new protein
+          pep.prots = []
+          uniq_pephit_hash[ky] = pep
+        end
+        pep.prots << self
         @peps << pep
+        
       elsif line =~ @@end_prot_re
         set_from_xml_hash(hash)
         break
       else
         puts "Bad parsing on: #{line}"
@@ -365,70 +373,38 @@
     @weight = hash["weight"]
     @accession = hash["accession"]
   end
 end
 
+Bioworks::Pep = ArrayClass.new( %w(sequence mass deltamass charge xcorr deltacn sp rsp ions count tic prots base_name first_scan last_scan peptide_probability file _num_prots _first_prot aaseq) )
+# 0=sequence 1=mass 2=deltamass 3=charge 4=xcorr 5=deltacn 6=sp 7=rsp 8=ions 9=count 10=tic 11=prots 12=base_name 13=first_scan 14=last_scan 15=peptide_probability 16=file 17=_num_prots 18=_first_prot 19=aaseq
 
-class SpecID::Bioworks::Pep < Array 
-  include SpecID::Bioworks::XML
+class Bioworks::Pep
+  include SpecID::Pep
+  include Bioworks::XML
   include SpecIDXML
 
   @@file_split_first_re = /, /o
   @@file_split_second_re = / - /o
   #@@att_re = /<(.*)>(.*)<\/(.*)>/
   @@end_pep_re = /<\/peptide>/o
   @@file_one_scan_re = /(.*), (\d+)/o
   @@file_mult_scan_re = /(.*), (\d+) - (\d+)/o
   ## NOTE! the mass is really the theoretical MH+!!!!
   ## NOTE! ALL values stored as strings, except peptide_probability!
-  ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
-  ind_keys = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prot => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18 }
 
-
-  def sequence ; self[0] end ; def sequence=(oth) ; self[0] = oth end
-  def mass ; self[1] end ; def mass=(oth) ; self[1] = oth end
-  def deltamass ; self[2] end ; def deltamass=(oth) ; self[2] = oth end
-  def charge ; self[3] end ; def charge=(oth) ; self[3] = oth end
-  def xcorr ; self[4] end ; def xcorr=(oth) ; self[4] = oth end
-  def deltacn ; self[5] end ; def deltacn=(oth) ; self[5] = oth end
-  def sp ; self[6] end ; def sp=(oth) ; self[6] = oth end
-  def rsp ; self[7] end ; def rsp=(oth) ; self[7] = oth end
-  def ions ; self[8] end ; def ions=(oth) ; self[8] = oth end
-  def count ; self[9] end ; def count=(oth) ; self[9] = oth end
-  def tic ; self[10] end ; def tic=(oth) ; self[10] = oth end
-  def prot ; self[11] end ; def prot=(oth) ; self[11] = oth end
-  def base_name ; self[12] end ; def base_name=(oth) ; self[12] = oth end
-  def first_scan ; self[13] end ; def first_scan=(oth) ; self[13] = oth end
-  def last_scan ; self[14] end ; def last_scan=(oth) ; self[14] = oth end
-  def peptide_probability ; self[15] end ; def peptide_probability=(oth) ; self[15] = oth end
-  def file ; self[16] end   # we define a writer below
-  def _num_prots ; self[17] end ; def _num_prots=(oth) ; self[17] = oth end
-  def _first_prot ; self[18] end ; def _first_prot=(oth) ; self[18] = oth end
-
   ## other accessors:
   def probability ; self[15] end
+  def mh ; self[1] end
 
-  #ind_keys.keys do |k|
-  #  self.module_eval( "def #{k} ; self[#{ind_keys[k]}] end ; def #{k}=(oth) ; self[#{ind_keys[k]} = oth end ", __FILE__, __LINE__ )
-  #end
-  @@arr_size = ind_keys.size
-  ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
-  ind_keys.merge!(ind_keys_w_eq)
-  ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
-
-  def initialize(args=nil)
-    super(@@arr_size.size)
-    if args
-      if args.is_a? Hash
-        args.each do |k,v|
-          self[@@ind[k]] = v
-        end
-      end
-    end
+  # This is not a true ppm since it should be divided by the actual mh instead
+  # of the theoretical (but it is as close as we can get for this object)
+  def ppm 
+    1.0e6 * (self[2].abs/self[1])
+    #1.0e6 * (self.deltamass.abs/self.mh)
   end
 
-
   # returns array of values of the attributes given (as symbols)
   def get(*args)
     args.collect do |arg|
       send(arg)
     end
@@ -461,38 +437,42 @@
       last_scan = first_scan
     end
     [base_name, first_scan, last_scan]
   end
 
+  tmp_verb = $VERBOSE
+  $VERBOSE = nil
   def file=(arg)
     ## Set these vals by index:
     #puts "AERRG: #{arg}"
     self[16] = arg
     self[12,3] = self.class.extract_file_info(arg)
   end
+  $VERBOSE = tmp_verb
   
   def inspect
-    "<SpecID::Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn} prot: #{prot} base_name: #{base_name} first_scan: #{first_scan} last_scan: #{last_scan} file: #{file} peptide_probability: #{peptide_probability}>"
+    "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
+
+
   end
 
   def set_from_hash(hash)
     self[0,11] = [hash["sequence"], hash["mass"], hash["deltamass"], hash["charge"], hash["xcorr"], hash["deltacn"], hash["sp"], hash["rsp"], hash["ions"], hash["count"], hash["tic"]]
     self.file = hash["file"]
     self[15] = hash["peptide_probability"].to_f
+    self[19] = SpecID::Pep.sequence_to_aaseq(self[0])  ## aaseq
   end
 
-  def set_from_xml_stream(fh, prot)
-    self[11] = prot
+  def set_from_xml_stream(fh)
     hash = {}
     while line = fh.gets
       if line =~ @@att_re
         #hash[$1] = $2.dup
         hash[$1] = $2
         #puts "IN PEP: " + $1 + ": " + $2
       elsif line =~ @@end_pep_re
         set_from_hash(hash)
         #puts "SELF[12]: #{self[12]}"
-        unless self[12] then self[12] = prot.bioworks.global_filename end
         #puts "SELF[12]: #{self[12]}"
         break
       else
         puts "Bad parsing on: #{line}"
         puts "EXITING!"