require 'sample_enzyme' require 'xmlparser' require 'spec_id' require 'zlib' require 'hash_by' require 'set_from_hash' ## have to pre-declare some guys class SpecID; end class SpecID::Prot; end module SpecIDXML; end class SpecID::Bioworks # Regular expressions @@bioworksinfo_re = /(.*)<\/bioworksinfo>/o @@protein_re = //o @@origfilename_re = /(.*)<\/origfilename>/o @@origfilepath_re = /(.*)<\/origfilepath>/o attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath attr_writer :peps def hi_prob_best ; false end # -> prints to file filename1.sqt, filename2.sqt # @TODO: sqt file output def to_sqt(params_file) ## hash peps by filename ## hash prots by peptide end # returns the number of prots. Raises an Exception if open and closing xml # tags don't agree def num_prots(file) re = /()|(<\/protein>)/mo begin_tags = 0 end_tags = 0 IO.read(file).scan(re) do |match| if match.first begin_tags += 1 else end_tags += 1 end end if begin_tags != end_tags puts "WARNING: #{file} doesn't have matching closing tags" puts "for the tag. Returning # of beginning tags." end begin_tags end # Outputs the bioworks browser excel format (tab delimited) to file. # Useful if you have more than ~65,000 lines (can export bioworks.xml # and then convert to excel format). # Currently, the only things not precisely identical are: # 1. The peptide hit counts (although the first number [total # peptides] is accurate) # 2. The precise ordering of peptides within each protein. When dealing with output from multiple runs, peptides with runs with exactly the same scan numbers are not guaranteed to be in the same order. def to_excel(file) update_peptide_hit_counts arr = [] arr << ['', 'Reference', '', '', '', 'Score', 'Coverage', 'MW', 'Accession', 'Peptide (Hits)', '', ' '] arr << ['', '"File, Scan(s)"', 'Peptide', 'MH+', 'z', 'XC', 'DeltaCn', 'Sp', 'RSp', 'Ions', 'Count', ' '] @prots.each_with_index do |prot,index| line_arr = prot.get(:consensus_score, :coverage, :weight, :accession) if line_arr[1] == "0.0" then line_arr[1] = "" end line_arr.unshift('', '', '') line_arr.unshift('"' + prot.reference.split('|')[-1] + '"') line_arr.unshift(index+1) pep_hit_counts = prot.peptide_hit_counts pep_hit_counts_string = pep_hit_counts[0].to_s + ' (' + pep_hit_counts[1..-1].join(" ") + ')' line_arr.push( pep_hit_counts_string ) line_arr.push("") line_arr.push(" ") arr.push( line_arr ) prot.peps.sort_by{|obj| [obj.first_scan.to_i, obj.last_scan.to_i] }.each do |pep| pep_arr = pep.get(:sequence, :mass, :charge, :xcorr, :deltacn, :sp, :rsp, :ions) count = pep.count if count == '0' then count = "" end pep_arr.push(count) pep_arr.push(' ') pep_arr.unshift('"' + pep.file + '"') pep_arr.unshift( '' ) arr.push( pep_arr ) end end File.open(file, "w") do |out| arr.each do |line| out.print(line.join("\t"), "\n") end end end # for output to excel format or other things, updates each protein # with a peptide hit count array based on ranking of xcorr per dta file # where each array is the total number of peptide hits, then rank 1,2,3,4,5 # @TODO: Can't get this to check out yet. Perhaps they use normalized # Xcorr? def update_peptide_hit_counts @prots.each do |prot| prot.peptide_hit_counts[0] = prot.peps.size end hash = peps.hash_by(:file) hash.sort.each do |k,v| sorted = v.sort_by {|obj| obj.xcorr.to_f } peps, prot_groups = _uniq_peps_by_sequence_charge(sorted) ## but not on prot!!!!!uniq_peps_by_sequence_charge! prot_groups.each_with_index do |prot_group, i| prot_group.each do |prot| prot.peptide_hit_counts[i+1] += 1 if prot.peptide_hit_counts[i+1] end end end end # returns (peptides, proteins) where peptides is the unique list of peps # and proteins is a parallel array of arrays of represented proteins # note that each pep will contain its original prot it belongs to, even # though the parallel protein actually represents the proteins it belongs # to. def _uniq_peps_by_sequence_charge(peps) new_arr = [] prot_arr = [] index_accounted_for = [] (0...peps.size).each do |i| next if index_accounted_for.include?(i) new_arr << peps[i] prot_arr.push( [peps[i].prot] ) ((i+1)...peps.size).each do |j| pep1, pep2 = peps[i], peps[j] if pep1.sequence == pep2.sequence && pep1.charge == pep2.charge prot_arr.last.push pep2.prot index_accounted_for << j end end end return new_arr, prot_arr end def initialize(file=nil) @peps = nil if file parse_xml(file) #parse_xml_by_xmlparser(file) end end def parse_xml_by_xmlparser(file) parser = SpecID::Bioworks::XMLParser.new File.open(file) do |fh| #3.times do fh.gets end ## TEMPFIX parser.parse(fh) end #puts "ETETWSST" #p parser.prots @prots = parser.prots end # Returns the list of all peptide hits. A given sequence/charge or scan # may be redundant! def peps if @peps return @peps else @peps = [] prots.each do |prot| prot.peps.each do |pep| @peps << pep end end return @peps end end # This is highly specific to Bioworks 3.2 xml export. In other words, # unless the newlines, etc. are duplicated, this parser will fail! Not # robust, but it is faster than xmlparser (which is based on the speedy # expat) def parse_xml(file) fh = nil if file =~ /\.gz$/ fh = Zlib::GzipReader.open(file) else fh = File.open(file) end @origfilename = get_regex_val(fh, @@origfilename_re) @origfilepath = get_regex_val(fh, @@origfilepath_re) if @origfilename @global_filename = @origfilename.gsub(File.extname(@origfilename), "") end @version = get_regex_val(fh, @@bioworksinfo_re) @prots = get_prots(fh, self) fh.close end def get_prots(fh, bioworks) prots = [] while line = fh.gets if line =~ @@protein_re prot = SpecID::Bioworks::Prot.new prot.bioworks = self prot.set_from_xml_stream(fh, bioworks) prots << prot end end prots end # gets the regex and stops (and rewinds if it hits a protein) # if no regex is found, returns nil and rewinds the filehandle def get_regex_val(fh, regex) ver = nil last_pos = fh.pos while line = fh.gets if line =~ regex ver = $1.dup break elsif line =~ @@protein_re fh.seek last_pos break end last_pos = fh.pos end unless ver then fh.rewind end ver end # Outputs sequest xml files (pepxml) for the trans-proteomics pipeline def to_pepxml string = xml_version string end end # Implements fast parsing via XMLParser (wrapper around Expat) # It is actually slower (about %25 slower) than regular expression parsing class SpecID::Bioworks::XMLParser < XMLParser @@at = '@' attr_accessor :prots def initialize @current_obj = nil @current_hash = {} @current_name = nil @current_data = nil @prots = [] end def startElement(name, attrs) case name when "peptide" curr_prot = @current_obj if @current_obj.class == SpecID::Bioworks::Prot @current_obj.set_from_xml_hash_xmlparser(@current_hash) else curr_prot = @current_obj.prot ## unless previous was a peptide end peptide = SpecID::Bioworks::Pep.new peptide.prot = curr_prot curr_prot.peps << peptide @current_obj = peptide @current_hash = {} when "protein" @current_obj = SpecID::Bioworks::Prot.new @current_hash = {} @prots << @current_obj else @current_name = name end end def endElement(name) case name when "peptide" @current_obj.set_from_hash(@current_hash) when "protein" else @current_hash[name] = @current_data end end def character(data) @current_data = data end end module SpecID::Bioworks::XML # The regular expression to grab attributes from the bioworks xml format @@att_re = /<([\w]+)>(.*)<\/[\w]+>/o end class SpecID::Bioworks::Prot < SpecID::Prot include SpecID::Bioworks::XML @@end_prot_re = /<\/protein>/o @@pep_re = //o @@atts = %w(reference protein_probability consensus_score sf unified_score coverage pi weight accession peps) attr_accessor :reference, :protein_probability, :consensus_score, :sf, :unified_score, :coverage, :pi, :weight, :accession, :peps, :bioworks, :peptide_hit_counts def initialize @peps = [] @peptide_hit_counts = [0,0,0,0,0,0] end # returns array of values of the attributes given (as symbols) def get(*args) args.collect do |arg| send(arg) end end def set_from_xml_stream(fh, bioworks) hash = {} while line = fh.gets if line =~ @@att_re hash[$1] = $2 elsif line =~ @@pep_re pep = SpecID::Bioworks::Pep.new.set_from_xml_stream(fh, self) pep.prot = self @peps << pep elsif line =~ @@end_prot_re set_from_xml_hash(hash) break else puts "Bad parsing on: #{line}" puts "EXITING!" exit end end self end def set_from_xml_hash_xmlparser(hash) hash.delete("sequestresults") hash.delete("bioworksinfo") hash["sf"] = hash.delete("Sf") hash["pi"] = hash.delete("pI") set_from_hash(hash) end # changes the sf to Sf and pI to pi def set_from_xml_hash(hash) @reference = hash["reference"] @protein_probability = hash["protein_probability"] @probability = @protein_probability.to_f @consensus_score = hash["consensus_score"] @sf = hash["Sf"] @unified_score = hash["unified_score"] @coverage = hash["coverage"] @pi = hash["pI"] @weight = hash["weight"] @accession = hash["accession"] end end class SpecID::Bioworks::Pep < Array include SpecID::Bioworks::XML include SpecIDXML @@file_split_first_re = /, /o @@file_split_second_re = / - /o #@@att_re = /<(.*)>(.*)<\/(.*)>/ @@end_pep_re = /<\/peptide>/o @@file_one_scan_re = /(.*), (\d+)/o @@file_mult_scan_re = /(.*), (\d+) - (\d+)/o ## NOTE! the mass is really the theoretical MH+!!!! ## NOTE! ALL values stored as strings, except peptide_probability! ind_keys = {} ; ind_keys_w_eq = {}; @@ind = {} ind_keys = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prot => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18 } def sequence ; self[0] end ; def sequence=(oth) ; self[0] = oth end def mass ; self[1] end ; def mass=(oth) ; self[1] = oth end def deltamass ; self[2] end ; def deltamass=(oth) ; self[2] = oth end def charge ; self[3] end ; def charge=(oth) ; self[3] = oth end def xcorr ; self[4] end ; def xcorr=(oth) ; self[4] = oth end def deltacn ; self[5] end ; def deltacn=(oth) ; self[5] = oth end def sp ; self[6] end ; def sp=(oth) ; self[6] = oth end def rsp ; self[7] end ; def rsp=(oth) ; self[7] = oth end def ions ; self[8] end ; def ions=(oth) ; self[8] = oth end def count ; self[9] end ; def count=(oth) ; self[9] = oth end def tic ; self[10] end ; def tic=(oth) ; self[10] = oth end def prot ; self[11] end ; def prot=(oth) ; self[11] = oth end def base_name ; self[12] end ; def base_name=(oth) ; self[12] = oth end def first_scan ; self[13] end ; def first_scan=(oth) ; self[13] = oth end def last_scan ; self[14] end ; def last_scan=(oth) ; self[14] = oth end def peptide_probability ; self[15] end ; def peptide_probability=(oth) ; self[15] = oth end def file ; self[16] end # we define a writer below def _num_prots ; self[17] end ; def _num_prots=(oth) ; self[17] = oth end def _first_prot ; self[18] end ; def _first_prot=(oth) ; self[18] = oth end ## other accessors: def probability ; self[15] end #ind_keys.keys do |k| # self.module_eval( "def #{k} ; self[#{ind_keys[k]}] end ; def #{k}=(oth) ; self[#{ind_keys[k]} = oth end ", __FILE__, __LINE__ ) #end @@arr_size = ind_keys.size ind_keys.each {|k,v| ind_keys_w_eq["#{k}=".to_sym] = v } ind_keys.merge!(ind_keys_w_eq) ind_keys.each {|k,v| @@ind[k] = v ; @@ind["#{k}"] = v} def initialize(args=nil) super(@@arr_size.size) if args if args.is_a? Hash args.each do |k,v| self[@@ind[k]] = v end end end end # returns array of values of the attributes given (as symbols) def get(*args) args.collect do |arg| send(arg) end end #def peptide_probability=(prob) # @peptide_probability = prob.to_f #end # takes arguments in one of two forms: # 1. file, first_scan[ - last_scan] # 2. scan[ - last_scan] # returns base_name, first_scan, last_scan # base_name will be set for #1, nil for #2 def self.extract_file_info(arg) last_scan = nil (base_name, first_scan) = arg.split(@@file_split_first_re) unless first_scan first_scan = base_name base_name = nil end first_scan = first_scan.split(@@file_split_second_re) if first_scan.size > 1 (first_scan, last_scan) = first_scan else first_scan = first_scan[0] last_scan = first_scan end return base_name, first_scan, last_scan end def file=(arg) ## Set these vals by index: #puts "AERRG: #{arg}" self[16] = arg self[12,3] = self.class.extract_file_info(arg) end def inspect "" end def set_from_hash(hash) self[0,11] = [hash["sequence"], hash["mass"], hash["deltamass"], hash["charge"], hash["xcorr"], hash["deltacn"], hash["sp"], hash["rsp"], hash["ions"], hash["count"], hash["tic"]] self.file = hash["file"] self[15] = hash["peptide_probability"].to_f end def set_from_xml_stream(fh, prot) self[11] = prot hash = {} while line = fh.gets if line =~ @@att_re #hash[$1] = $2.dup hash[$1] = $2 #puts "IN PEP: " + $1 + ": " + $2 elsif line =~ @@end_pep_re set_from_hash(hash) #puts "SELF[12]: #{self[12]}" unless self[12] then self[12] = prot.bioworks.global_filename end #puts "SELF[12]: #{self[12]}" break else puts "Bad parsing on: #{line}" puts "EXITING!" exit end end self end end