lib/spec_id/bioworks.rb in mspire-0.1.7 vs lib/spec_id/bioworks.rb in mspire-0.2.0
- old
+ new
@@ -4,29 +4,32 @@
require 'xmlparser'
require 'spec_id'
require 'zlib'
require 'hash_by'
require 'set_from_hash'
+require 'array_class'
## have to pre-declare some guys
-class SpecID; end
-class SpecID::Prot; end
+module SpecID; end
+module SpecID::Prot; end
+module SpecID::Pep; end
module SpecIDXML; end
-class SpecID::Bioworks
+class Bioworks
+ include SpecID
+
# Regular expressions
@@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
@@modifications_re = /<modifications>(.*)<\/modifications>/o
@@protein_re = /<protein>/o
@@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
@@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
- attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath
+ attr_accessor :peps, :prots, :version, :global_filename, :origfilename, :origfilepath
# a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
attr_accessor :modifications
- attr_writer :peps
def hi_prob_best ; false end
# -> prints to file filename1.sqt, filename2.sqt
# @TODO: sqt file output
@@ -125,65 +128,50 @@
# returns (peptides, proteins) where peptides is the unique list of peps
# and proteins is a parallel array of arrays of represented proteins
# note that each pep will contain its original prot it belongs to, even
# though the parallel protein actually represents the proteins it belongs
# to.
+ # assumes that each peptide points to all its proteins in pep.prots
def _uniq_peps_by_sequence_charge(peps)
new_arr = []
prot_arr = []
index_accounted_for = []
(0...peps.size).each do |i|
next if index_accounted_for.include?(i)
new_arr << peps[i]
- prot_arr.push( [peps[i].prot] )
+ prot_arr.push( peps[i].prots )
((i+1)...peps.size).each do |j|
pep1, pep2 = peps[i], peps[j]
if pep1.sequence == pep2.sequence && pep1.charge == pep2.charge
- prot_arr.last.push pep2.prot
+ prot_arr.last.push( *(pep2.prots) )
index_accounted_for << j
end
end
end
return new_arr, prot_arr
end
def initialize(file=nil)
@peps = nil
if file
+ @filename = file
parse_xml(file)
#parse_xml_by_xmlparser(file)
end
end
def parse_xml_by_xmlparser(file)
- parser = SpecID::Bioworks::XMLParser.new
+ parser = Bioworks::XMLParser.new
File.open(file) do |fh|
#3.times do fh.gets end ## TEMPFIX
parser.parse(fh)
end
#puts "ETETWSST"
#p parser.prots
@prots = parser.prots
end
-
- # Returns the list of all peptide hits. A given sequence/charge or scan
- # may be redundant!
- def peps
- if @peps
- return @peps
- else
- @peps = []
- prots.each do |prot|
- prot.peps.each do |pep|
- @peps << pep
- end
- end
- return @peps
- end
- end
-
# This is highly specific to Bioworks 3.2 xml export. In other words,
# unless the newlines, etc. are duplicated, this parser will fail! Not
# robust, but it is faster than xmlparser (which is based on the speedy
# expat)
def parse_xml(file)
@@ -198,25 +186,27 @@
if @origfilename
@global_filename = @origfilename.gsub(File.extname(@origfilename), "")
end
@version = get_regex_val(fh, @@bioworksinfo_re)
@modifications = get_regex_val(fh, @@modifications_re)
- @prots = get_prots(fh, self)
+ @prots, @peps = get_prots_from_xml_stream(fh)
fh.close
end
- def get_prots(fh, bioworks)
+ ## returns proteins and peptides
+ def get_prots_from_xml_stream(fh)
+ uniq_pephit_hash = {}
prots = []
while line = fh.gets
if line =~ @@protein_re
- prot = SpecID::Bioworks::Prot.new
+ prot = Bioworks::Prot.new
prot.bioworks = self
- prot.set_from_xml_stream(fh, bioworks)
+ prot.set_from_xml_stream(fh, uniq_pephit_hash)
prots << prot
end
end
- prots
+ [prots, uniq_pephit_hash.values]
end
# gets the regex and stops (and rewinds if it hits a protein)
# if no regex is found, returns nil and rewinds the filehandle
def get_regex_val(fh, regex)
@@ -244,11 +234,11 @@
end
# Implements fast parsing via XMLParser (wrapper around Expat)
# It is actually slower (about %25 slower) than regular expression parsing
-class SpecID::Bioworks::XMLParser < XMLParser
+class Bioworks::XMLParser < XMLParser
@@at = '@'
attr_accessor :prots
def initialize
@current_obj = nil
@@ -260,22 +250,22 @@
def startElement(name, attrs)
case name
when "peptide"
curr_prot = @current_obj
- if @current_obj.class == SpecID::Bioworks::Prot
+ if @current_obj.class == Bioworks::Prot
@current_obj.set_from_xml_hash_xmlparser(@current_hash)
else
curr_prot = @current_obj.prot ## unless previous was a peptide
end
- peptide = SpecID::Bioworks::Pep.new
+ peptide = Bioworks::Pep.new
peptide.prot = curr_prot
curr_prot.peps << peptide
@current_obj = peptide
@current_hash = {}
when "protein"
- @current_obj = SpecID::Bioworks::Prot.new
+ @current_obj = Bioworks::Prot.new
@current_hash = {}
@prots << @current_obj
else
@current_name = name
end
@@ -295,17 +285,18 @@
@current_data = data
end
end
-module SpecID::Bioworks::XML
+module Bioworks::XML
# The regular expression to grab attributes from the bioworks xml format
@@att_re = /<([\w]+)>(.*)<\/[\w]+>/o
end
-class SpecID::Bioworks::Prot < SpecID::Prot
- include SpecID::Bioworks::XML
+class Bioworks::Prot
+ include SpecID::Prot
+ include Bioworks::XML
@@end_prot_re = /<\/protein>/o
@@pep_re = /<peptide>/o
@@atts = %w(reference protein_probability consensus_score sf unified_score coverage pi weight accession peps)
attr_accessor :reference, :protein_probability, :consensus_score, :sf, :unified_score, :coverage, :pi, :weight, :accession, :peps, :bioworks, :peptide_hit_counts
@@ -321,19 +312,36 @@
args.collect do |arg|
send(arg)
end
end
- def set_from_xml_stream(fh, bioworks)
+ def set_from_xml_stream(fh, uniq_pephit_hash)
hash = {}
+ @peps = []
while line = fh.gets
if line =~ @@att_re
hash[$1] = $2
elsif line =~ @@pep_re
- pep = SpecID::Bioworks::Pep.new.set_from_xml_stream(fh, self)
- pep.prot = self
+ ## Could do a look ahead to grab the file and sequence to check
+ ## uniqueness to increase speed here.
+ pep = Bioworks::Pep.new.set_from_xml_stream(fh)
+ # normal search results files have a global filename
+ # while multi-consensus do not
+ pep[12] ||= bioworks.global_filename
+
+ ## figure out uniqueness
+ ky = [pep.base_name, pep.first_scan, pep.charge, pep.sequence]
+ if uniq_pephit_hash.key? ky
+ pep = uniq_pephit_hash[ky]
+ else
+ ## insert the new protein
+ pep.prots = []
+ uniq_pephit_hash[ky] = pep
+ end
+ pep.prots << self
@peps << pep
+
elsif line =~ @@end_prot_re
set_from_xml_hash(hash)
break
else
puts "Bad parsing on: #{line}"
@@ -365,70 +373,38 @@
@weight = hash["weight"]
@accession = hash["accession"]
end
end
+Bioworks::Pep = ArrayClass.new( %w(sequence mass deltamass charge xcorr deltacn sp rsp ions count tic prots base_name first_scan last_scan peptide_probability file _num_prots _first_prot aaseq) )
+# 0=sequence 1=mass 2=deltamass 3=charge 4=xcorr 5=deltacn 6=sp 7=rsp 8=ions 9=count 10=tic 11=prots 12=base_name 13=first_scan 14=last_scan 15=peptide_probability 16=file 17=_num_prots 18=_first_prot 19=aaseq
-class SpecID::Bioworks::Pep < Array
- include SpecID::Bioworks::XML
+class Bioworks::Pep
+ include SpecID::Pep
+ include Bioworks::XML
include SpecIDXML
@@file_split_first_re = /, /o
@@file_split_second_re = / - /o
#@@att_re = /<(.*)>(.*)<\/(.*)>/
@@end_pep_re = /<\/peptide>/o
@@file_one_scan_re = /(.*), (\d+)/o
@@file_mult_scan_re = /(.*), (\d+) - (\d+)/o
## NOTE! the mass is really the theoretical MH+!!!!
## NOTE! ALL values stored as strings, except peptide_probability!
- ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {}
- ind_keys = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prot => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18 }
-
- def sequence ; self[0] end ; def sequence=(oth) ; self[0] = oth end
- def mass ; self[1] end ; def mass=(oth) ; self[1] = oth end
- def deltamass ; self[2] end ; def deltamass=(oth) ; self[2] = oth end
- def charge ; self[3] end ; def charge=(oth) ; self[3] = oth end
- def xcorr ; self[4] end ; def xcorr=(oth) ; self[4] = oth end
- def deltacn ; self[5] end ; def deltacn=(oth) ; self[5] = oth end
- def sp ; self[6] end ; def sp=(oth) ; self[6] = oth end
- def rsp ; self[7] end ; def rsp=(oth) ; self[7] = oth end
- def ions ; self[8] end ; def ions=(oth) ; self[8] = oth end
- def count ; self[9] end ; def count=(oth) ; self[9] = oth end
- def tic ; self[10] end ; def tic=(oth) ; self[10] = oth end
- def prot ; self[11] end ; def prot=(oth) ; self[11] = oth end
- def base_name ; self[12] end ; def base_name=(oth) ; self[12] = oth end
- def first_scan ; self[13] end ; def first_scan=(oth) ; self[13] = oth end
- def last_scan ; self[14] end ; def last_scan=(oth) ; self[14] = oth end
- def peptide_probability ; self[15] end ; def peptide_probability=(oth) ; self[15] = oth end
- def file ; self[16] end # we define a writer below
- def _num_prots ; self[17] end ; def _num_prots=(oth) ; self[17] = oth end
- def _first_prot ; self[18] end ; def _first_prot=(oth) ; self[18] = oth end
-
## other accessors:
def probability ; self[15] end
+ def mh ; self[1] end
- #ind_keys.keys do |k|
- # self.module_eval( "def #{k} ; self[#{ind_keys[k]}] end ; def #{k}=(oth) ; self[#{ind_keys[k]} = oth end ", __FILE__, __LINE__ )
- #end
- @@arr_size = ind_keys.size
- ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v }
- ind_keys.merge!(ind_keys_w_eq)
- ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v}
-
- def initialize(args=nil)
- super(@@arr_size.size)
- if args
- if args.is_a? Hash
- args.each do |k,v|
- self[@@ind[k]] = v
- end
- end
- end
+ # This is not a true ppm since it should be divided by the actual mh instead
+ # of the theoretical (but it is as close as we can get for this object)
+ def ppm
+ 1.0e6 * (self[2].abs/self[1])
+ #1.0e6 * (self.deltamass.abs/self.mh)
end
-
# returns array of values of the attributes given (as symbols)
def get(*args)
args.collect do |arg|
send(arg)
end
@@ -461,38 +437,42 @@
last_scan = first_scan
end
[base_name, first_scan, last_scan]
end
+ tmp_verb = $VERBOSE
+ $VERBOSE = nil
def file=(arg)
## Set these vals by index:
#puts "AERRG: #{arg}"
self[16] = arg
self[12,3] = self.class.extract_file_info(arg)
end
+ $VERBOSE = tmp_verb
def inspect
- "<SpecID::Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn} prot: #{prot} base_name: #{base_name} first_scan: #{first_scan} last_scan: #{last_scan} file: #{file} peptide_probability: #{peptide_probability}>"
+ "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
+
+
end
def set_from_hash(hash)
self[0,11] = [hash["sequence"], hash["mass"], hash["deltamass"], hash["charge"], hash["xcorr"], hash["deltacn"], hash["sp"], hash["rsp"], hash["ions"], hash["count"], hash["tic"]]
self.file = hash["file"]
self[15] = hash["peptide_probability"].to_f
+ self[19] = SpecID::Pep.sequence_to_aaseq(self[0]) ## aaseq
end
- def set_from_xml_stream(fh, prot)
- self[11] = prot
+ def set_from_xml_stream(fh)
hash = {}
while line = fh.gets
if line =~ @@att_re
#hash[$1] = $2.dup
hash[$1] = $2
#puts "IN PEP: " + $1 + ": " + $2
elsif line =~ @@end_pep_re
set_from_hash(hash)
#puts "SELF[12]: #{self[12]}"
- unless self[12] then self[12] = prot.bioworks.global_filename end
#puts "SELF[12]: #{self[12]}"
break
else
puts "Bad parsing on: #{line}"
puts "EXITING!"