lib/spec_id/bioworks.rb in mspire-0.1.7 vs lib/spec_id/bioworks.rb in mspire-0.2.0

- old
+ new

@@ -4,29 +4,32 @@ require 'xmlparser' require 'spec_id' require 'zlib' require 'hash_by' require 'set_from_hash' +require 'array_class' ## have to pre-declare some guys -class SpecID; end -class SpecID::Prot; end +module SpecID; end +module SpecID::Prot; end +module SpecID::Pep; end module SpecIDXML; end -class SpecID::Bioworks +class Bioworks + include SpecID + # Regular expressions @@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o @@modifications_re = /<modifications>(.*)<\/modifications>/o @@protein_re = /<protein>/o @@origfilename_re = /<origfilename>(.*)<\/origfilename>/o @@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o - attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath + attr_accessor :peps, :prots, :version, :global_filename, :origfilename, :origfilepath # a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) " attr_accessor :modifications - attr_writer :peps def hi_prob_best ; false end # -> prints to file filename1.sqt, filename2.sqt # @TODO: sqt file output @@ -125,65 +128,50 @@ # returns (peptides, proteins) where peptides is the unique list of peps # and proteins is a parallel array of arrays of represented proteins # note that each pep will contain its original prot it belongs to, even # though the parallel protein actually represents the proteins it belongs # to. + # assumes that each peptide points to all its proteins in pep.prots def _uniq_peps_by_sequence_charge(peps) new_arr = [] prot_arr = [] index_accounted_for = [] (0...peps.size).each do |i| next if index_accounted_for.include?(i) new_arr << peps[i] - prot_arr.push( [peps[i].prot] ) + prot_arr.push( peps[i].prots ) ((i+1)...peps.size).each do |j| pep1, pep2 = peps[i], peps[j] if pep1.sequence == pep2.sequence && pep1.charge == pep2.charge - prot_arr.last.push pep2.prot + prot_arr.last.push( *(pep2.prots) ) index_accounted_for << j end end end return new_arr, prot_arr end def initialize(file=nil) @peps = nil if file + @filename = file parse_xml(file) #parse_xml_by_xmlparser(file) end end def parse_xml_by_xmlparser(file) - parser = SpecID::Bioworks::XMLParser.new + parser = Bioworks::XMLParser.new File.open(file) do |fh| #3.times do fh.gets end ## TEMPFIX parser.parse(fh) end #puts "ETETWSST" #p parser.prots @prots = parser.prots end - - # Returns the list of all peptide hits. A given sequence/charge or scan - # may be redundant! - def peps - if @peps - return @peps - else - @peps = [] - prots.each do |prot| - prot.peps.each do |pep| - @peps << pep - end - end - return @peps - end - end - # This is highly specific to Bioworks 3.2 xml export. In other words, # unless the newlines, etc. are duplicated, this parser will fail! Not # robust, but it is faster than xmlparser (which is based on the speedy # expat) def parse_xml(file) @@ -198,25 +186,27 @@ if @origfilename @global_filename = @origfilename.gsub(File.extname(@origfilename), "") end @version = get_regex_val(fh, @@bioworksinfo_re) @modifications = get_regex_val(fh, @@modifications_re) - @prots = get_prots(fh, self) + @prots, @peps = get_prots_from_xml_stream(fh) fh.close end - def get_prots(fh, bioworks) + ## returns proteins and peptides + def get_prots_from_xml_stream(fh) + uniq_pephit_hash = {} prots = [] while line = fh.gets if line =~ @@protein_re - prot = SpecID::Bioworks::Prot.new + prot = Bioworks::Prot.new prot.bioworks = self - prot.set_from_xml_stream(fh, bioworks) + prot.set_from_xml_stream(fh, uniq_pephit_hash) prots << prot end end - prots + [prots, uniq_pephit_hash.values] end # gets the regex and stops (and rewinds if it hits a protein) # if no regex is found, returns nil and rewinds the filehandle def get_regex_val(fh, regex) @@ -244,11 +234,11 @@ end # Implements fast parsing via XMLParser (wrapper around Expat) # It is actually slower (about %25 slower) than regular expression parsing -class SpecID::Bioworks::XMLParser < XMLParser +class Bioworks::XMLParser < XMLParser @@at = '@' attr_accessor :prots def initialize @current_obj = nil @@ -260,22 +250,22 @@ def startElement(name, attrs) case name when "peptide" curr_prot = @current_obj - if @current_obj.class == SpecID::Bioworks::Prot + if @current_obj.class == Bioworks::Prot @current_obj.set_from_xml_hash_xmlparser(@current_hash) else curr_prot = @current_obj.prot ## unless previous was a peptide end - peptide = SpecID::Bioworks::Pep.new + peptide = Bioworks::Pep.new peptide.prot = curr_prot curr_prot.peps << peptide @current_obj = peptide @current_hash = {} when "protein" - @current_obj = SpecID::Bioworks::Prot.new + @current_obj = Bioworks::Prot.new @current_hash = {} @prots << @current_obj else @current_name = name end @@ -295,17 +285,18 @@ @current_data = data end end -module SpecID::Bioworks::XML +module Bioworks::XML # The regular expression to grab attributes from the bioworks xml format @@att_re = /<([\w]+)>(.*)<\/[\w]+>/o end -class SpecID::Bioworks::Prot < SpecID::Prot - include SpecID::Bioworks::XML +class Bioworks::Prot + include SpecID::Prot + include Bioworks::XML @@end_prot_re = /<\/protein>/o @@pep_re = /<peptide>/o @@atts = %w(reference protein_probability consensus_score sf unified_score coverage pi weight accession peps) attr_accessor :reference, :protein_probability, :consensus_score, :sf, :unified_score, :coverage, :pi, :weight, :accession, :peps, :bioworks, :peptide_hit_counts @@ -321,19 +312,36 @@ args.collect do |arg| send(arg) end end - def set_from_xml_stream(fh, bioworks) + def set_from_xml_stream(fh, uniq_pephit_hash) hash = {} + @peps = [] while line = fh.gets if line =~ @@att_re hash[$1] = $2 elsif line =~ @@pep_re - pep = SpecID::Bioworks::Pep.new.set_from_xml_stream(fh, self) - pep.prot = self + ## Could do a look ahead to grab the file and sequence to check + ## uniqueness to increase speed here. + pep = Bioworks::Pep.new.set_from_xml_stream(fh) + # normal search results files have a global filename + # while multi-consensus do not + pep[12] ||= bioworks.global_filename + + ## figure out uniqueness + ky = [pep.base_name, pep.first_scan, pep.charge, pep.sequence] + if uniq_pephit_hash.key? ky + pep = uniq_pephit_hash[ky] + else + ## insert the new protein + pep.prots = [] + uniq_pephit_hash[ky] = pep + end + pep.prots << self @peps << pep + elsif line =~ @@end_prot_re set_from_xml_hash(hash) break else puts "Bad parsing on: #{line}" @@ -365,70 +373,38 @@ @weight = hash["weight"] @accession = hash["accession"] end end +Bioworks::Pep = ArrayClass.new( %w(sequence mass deltamass charge xcorr deltacn sp rsp ions count tic prots base_name first_scan last_scan peptide_probability file _num_prots _first_prot aaseq) ) +# 0=sequence 1=mass 2=deltamass 3=charge 4=xcorr 5=deltacn 6=sp 7=rsp 8=ions 9=count 10=tic 11=prots 12=base_name 13=first_scan 14=last_scan 15=peptide_probability 16=file 17=_num_prots 18=_first_prot 19=aaseq -class SpecID::Bioworks::Pep < Array - include SpecID::Bioworks::XML +class Bioworks::Pep + include SpecID::Pep + include Bioworks::XML include SpecIDXML @@file_split_first_re = /, /o @@file_split_second_re = / - /o #@@att_re = /<(.*)>(.*)<\/(.*)>/ @@end_pep_re = /<\/peptide>/o @@file_one_scan_re = /(.*), (\d+)/o @@file_mult_scan_re = /(.*), (\d+) - (\d+)/o ## NOTE! the mass is really the theoretical MH+!!!! ## NOTE! ALL values stored as strings, except peptide_probability! - ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {} - ind_keys = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prot => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18 } - - def sequence ; self[0] end ; def sequence=(oth) ; self[0] = oth end - def mass ; self[1] end ; def mass=(oth) ; self[1] = oth end - def deltamass ; self[2] end ; def deltamass=(oth) ; self[2] = oth end - def charge ; self[3] end ; def charge=(oth) ; self[3] = oth end - def xcorr ; self[4] end ; def xcorr=(oth) ; self[4] = oth end - def deltacn ; self[5] end ; def deltacn=(oth) ; self[5] = oth end - def sp ; self[6] end ; def sp=(oth) ; self[6] = oth end - def rsp ; self[7] end ; def rsp=(oth) ; self[7] = oth end - def ions ; self[8] end ; def ions=(oth) ; self[8] = oth end - def count ; self[9] end ; def count=(oth) ; self[9] = oth end - def tic ; self[10] end ; def tic=(oth) ; self[10] = oth end - def prot ; self[11] end ; def prot=(oth) ; self[11] = oth end - def base_name ; self[12] end ; def base_name=(oth) ; self[12] = oth end - def first_scan ; self[13] end ; def first_scan=(oth) ; self[13] = oth end - def last_scan ; self[14] end ; def last_scan=(oth) ; self[14] = oth end - def peptide_probability ; self[15] end ; def peptide_probability=(oth) ; self[15] = oth end - def file ; self[16] end # we define a writer below - def _num_prots ; self[17] end ; def _num_prots=(oth) ; self[17] = oth end - def _first_prot ; self[18] end ; def _first_prot=(oth) ; self[18] = oth end - ## other accessors: def probability ; self[15] end + def mh ; self[1] end - #ind_keys.keys do |k| - # self.module_eval( "def #{k} ; self[#{ind_keys[k]}] end ; def #{k}=(oth) ; self[#{ind_keys[k]} = oth end ", __FILE__, __LINE__ ) - #end - @@arr_size = ind_keys.size - ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v } - ind_keys.merge!(ind_keys_w_eq) - ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v} - - def initialize(args=nil) - super(@@arr_size.size) - if args - if args.is_a? Hash - args.each do |k,v| - self[@@ind[k]] = v - end - end - end + # This is not a true ppm since it should be divided by the actual mh instead + # of the theoretical (but it is as close as we can get for this object) + def ppm + 1.0e6 * (self[2].abs/self[1]) + #1.0e6 * (self.deltamass.abs/self.mh) end - # returns array of values of the attributes given (as symbols) def get(*args) args.collect do |arg| send(arg) end @@ -461,38 +437,42 @@ last_scan = first_scan end [base_name, first_scan, last_scan] end + tmp_verb = $VERBOSE + $VERBOSE = nil def file=(arg) ## Set these vals by index: #puts "AERRG: #{arg}" self[16] = arg self[12,3] = self.class.extract_file_info(arg) end + $VERBOSE = tmp_verb def inspect - "<SpecID::Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn} prot: #{prot} base_name: #{base_name} first_scan: #{first_scan} last_scan: #{last_scan} file: #{file} peptide_probability: #{peptide_probability}>" + "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>" + + end def set_from_hash(hash) self[0,11] = [hash["sequence"], hash["mass"], hash["deltamass"], hash["charge"], hash["xcorr"], hash["deltacn"], hash["sp"], hash["rsp"], hash["ions"], hash["count"], hash["tic"]] self.file = hash["file"] self[15] = hash["peptide_probability"].to_f + self[19] = SpecID::Pep.sequence_to_aaseq(self[0]) ## aaseq end - def set_from_xml_stream(fh, prot) - self[11] = prot + def set_from_xml_stream(fh) hash = {} while line = fh.gets if line =~ @@att_re #hash[$1] = $2.dup hash[$1] = $2 #puts "IN PEP: " + $1 + ": " + $2 elsif line =~ @@end_pep_re set_from_hash(hash) #puts "SELF[12]: #{self[12]}" - unless self[12] then self[12] = prot.bioworks.global_filename end #puts "SELF[12]: #{self[12]}" break else puts "Bad parsing on: #{line}" puts "EXITING!"