# standard lib require 'set' require 'fileutils' require 'scanf' # in library require 'mspire/ident/search' require 'mspire/ident/peptide' require 'mspire/ident/protein' require 'mspire/sequest/params' module Mspire ; end module Mspire::Sequest ; end class Mspire::Sequest::Srf < Mspire::Ident::Search class NoSequestParamsError < ArgumentError end # inherits peptide_hits from Search # a String: 3.5, 3.3 or 3.2 attr_accessor :version attr_accessor :header attr_accessor :dta_files attr_accessor :out_files attr_accessor :params # a parallel array to dta_files and out_files where each entry is: # [first_scan, last_scan, charge] attr_accessor :index # the base name of the file with no extension attr_accessor :base_name alias_method :base_name_noext, :base_name alias_method :base_name_noext=, :base_name= # the directory the srf file was residing in when the filename was passed # in. May not be available. attr_accessor :resident_dir # a boolean to indicate if the results have been filtered by the # sequest.params precursor mass tolerance attr_accessor :filtered_by_precursor_mass_tolerance def protein_class Mspire::Sequest::Srf::Out::Protein end # returns a Sequest::Params object or nil if none def self.get_sequest_params_and_finish_pos(filename) # split the file in half and only read the second half (since we can be # confident that the params file will be there!) params = nil finish_parsing_io_pos = nil File.open(filename, 'rb') do |handle| halfway = handle.stat.size / 2 handle.seek halfway last_half = handle.read if sequest_start_from_last_half = last_half.rindex('[SEQUEST]') params_start_index = sequest_start_from_last_half + halfway handle.seek(params_start_index) params = Mspire::Sequest::Params.new.parse_io(handle) finish_parsing_io_pos = handle.pos else nil # not found end end [params, finish_parsing_io_pos] end def dta_start_byte case @version when '3.2' ; 3260 when '3.3' ; 3644 when '3.5' ; 3644 end end # opts: # :filter_by_precursor_mass_tolerance => true | false (default true) # this will filter by the sequest params prec tolerance as is # typically done by the Bioworks software. # # :read_pephits => true | false (default true) # will attempt to read peptide hit information (equivalent to .out # files), otherwise, just reads the dta information. def initialize(filename=nil, opts={}) @peptide_hits = [] @dta_files = [] @out_files = [] if filename from_file(filename, opts) end end # 1. updates the out_file's list of hits based on passing peptide_hits (but not # the original hit id; rank is implicit in array ordering) # 2. recalculates deltacn values completely if number of hits changed (does # not touch deltacn orig) # # This can spoil proper protein -> peptide linkages. Mspire::Id::Search.merge! # should be run after this method to ensure correct protein -> peptide # linkages. def filter_by_precursor_mass_tolerance! pmt = params.peptide_mass_tolerance.to_f methd = nil # the method to case params.peptide_mass_units when '0' amu_based = true milli_amu = false when '1' amu_based = true milli_amu = true when '2' amu_based = false end self.filtered_by_precursor_mass_tolerance = true self.out_files.each do |out_file| hits = out_file.hits before = hits.size hits.reject! do |pep| if amu_based if milli_amu (pep.deltamass.abs > (pmt/1000)) else (pep.deltamass.abs > pmt) end else (pep.ppm.abs > pmt) end end if hits.size != before out_file.hits = hits # <- is this necessary Mspire::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr(hits) out_file.num_hits = hits.size end end self end def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0) dta_files = Array.new(num_files) out_files = Array.new(num_files) start = dta_start_byte fh.pos = start num_files.times do |i| dta_files[i] = Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35) #p dta_files[i] out_files[i] = Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0) #p out_files[i] end [dta_files, out_files] end # returns self # opts are the same as for 'new' def from_file(filename, opts) @resident_dir = File.dirname(File.expand_path(filename)) opts = { :filter_by_precursor_mass_tolerance => true, :read_pephits => true}.merge(opts) (@params, after_params_io_pos) = Mspire::Sequest::Srf.get_sequest_params_and_finish_pos(filename) return unless @params dup_references = 0 dup_refs_gt_0 = false dup_references = @params.print_duplicate_references.to_i if dup_references == 0 # warn %Q{ #***************************************************************************** #WARNING: This srf file lists only 1 protein per peptide! (based on the #print_duplicate_references parameter in the sequest.params file used in its #creation) So, downstream output will likewise only contain a single protein #for each peptide hit. In many instances this is OK since downstream programs #will recalculate protein-to-peptide linkages from the database file anyway. #For complete protein lists per peptide hit, .srf files must be created with #print_duplicate_references > 0. HINT: to capture all duplicate references, #set the sequest parameter 'print_duplicate_references' to 100 or greater. #***************************************************************************** # } else dup_refs_gt_0 = true end File.open(filename, 'rb') do |fh| @header = Mspire::Sequest::Srf::Header.from_io(fh) @version = @header.version unpack_35 = case @version when '3.2' false when '3.3' false when '3.5' true end if @header.combined @base_name = File.basename(filename, '.*') # I'm not sure why this is the case, but the reported number is too # big by one on the 2 files I've seen so far, so we will correct it here! @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1 if opts[:read_pephits] == false raise NotImplementedError, "on combined files must read everything right now!" end (@dta_files, @out_files) = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0) else @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35) if opts[:read_pephits] # need the params file to know if the duplicate_references is set > 0 raise NoSequestParamsError, "no sequest params info in srf file!\npass in path to sequest.params file" if @params.nil? @out_files = read_out_files(fh,@header.num_dta_files, unpack_35, dup_refs_gt_0) # FOR DISPLAY ONLY! #@out_files.each do |f| # if f.num_hits == 10 # p f.hits.last # end #end if fh.eof? #warn "FILE: '#{filename}' appears to be an abortive run (no params in srf file)\nstill continuing..." @params = nil @index = [] end end end fh.pos = after_params_io_pos # This is very sensitive to the grab_params method in sequest params fh.read(12) ## gap between last params entry and index @index = read_scan_index(fh,@header.num_dta_files) end ### UPDATE SOME THINGS: # give each hit a base_name, first_scan, last_scan if opts[:read_pephits] && !@header.combined @index.each_with_index do |ind,i| mass_measured = @dta_files[i][0] outfile = @out_files[i] outfile.first_scan = ind[0] outfile.last_scan = ind[1] outfile.charge = ind[2] pep_hits = @out_files[i].hits @peptide_hits.push( *pep_hits ) pep_hits.each do |pep_hit| pep_hit[15] = @base_name pep_hit[16] = ind[0] pep_hit[17] = ind[1] pep_hit[18] = ind[2] # add the deltamass pep_hit[12] = pep_hit[0] - mass_measured # real - measured (deltamass) pep_hit[13] = 1.0e6 * pep_hit[12].abs / mass_measured ## ppm pep_hit[19] = self ## link with the srf object end end filter_by_precursor_mass_tolerance! if params end self end # returns an index where each entry is [first_scan, last_scan, charge] def read_scan_index(fh, num) #string = fh.read(80) #puts "STRING: " #p string #puts string #File.open("tmp.tmp",'wb') {|out| out.print string } #abort 'her' ind_len = 24 index = Array.new(num) unpack_string = 'III' st = '' ind_len.times do st << '0' end ## create a 24 byte string to receive data num.times do |i| fh.read(ind_len, st) result = st.unpack(unpack_string) index[i] = st.unpack(unpack_string) end index end # returns an array of dta_files def read_dta_files(fh, num_files, unpack_35) dta_files = Array.new(num_files) start = dta_start_byte fh.pos = start header.num_dta_files.times do |i| dta_files[i] = Mspire::Sequest::Srf::Dta.from_io(fh, unpack_35) end dta_files end # filehandle (fh) must be at the start of the outfiles. 'read_dta_files' # will put the fh there. def read_out_files(fh,number_files, unpack_35, dup_refs_gt_0) out_files = Array.new(number_files) header.num_dta_files.times do |i| out_files[i] = Mspire::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0) end out_files end end class Mspire::Sequest::Srf::Header Start_byte = { :enzyme => 438, :ion_series => 694, :model => 950, :modifications => 982, :raw_filename => 1822, :db_filename => 2082, :dta_log_filename => 2602, :params_filename => 3122, :sequest_log_filename => 3382, } Byte_length = { :enzyme => 256, :ion_series => 256, :model => 32, :modifications => 840, :raw_filename => 260, :db_filename => 520, :dta_log_filename => 520, :params_filename => 260, :sequest_log_filename => 262, ## is this really 262?? or should be 260?? } Byte_length_v32 = { :modifications => 456, } attr_accessor :version # a Mspire::Sequest::Srf::DtaGen object attr_accessor :dta_gen attr_accessor :enzyme attr_accessor :ion_series attr_accessor :model attr_accessor :modifications attr_accessor :raw_filename attr_accessor :db_filename attr_accessor :dta_log_filename attr_accessor :params_filename attr_accessor :sequest_log_filename # true if this is a combined file, false if represents a single file # this is set by examining the DtaGen object for signs of a single file attr_reader :combined __chars_re = Regexp.escape( "\r\0" ) NEWLINE_OR_NULL_RE = /[#{__chars_re}]/o def num_dta_files @dta_gen.num_dta_files end def self.from_io(fh) self.new.from_io(fh) end # sets fh to 0 and grabs the information it wants def from_io(fh) st = fh.read(4) @version = '3.' + st.unpack('I').first.to_s @dta_gen = Mspire::Sequest::Srf::DtaGen.from_io(fh) # if the start_mass end_mass start_scan and end_scan are all zero, its a # combined srf file: @combined = [0.0, 0.0, 0, 0].zip(%w(start_mass end_mass start_scan end_scan)).all? do |one,two| one == @dta_gen.send(two.to_sym) end ## get the rest of the info byte_length = Byte_length.dup byte_length.merge! Byte_length_v32 if @version == '3.2' fh.pos = Start_byte[:enzyme] [:enzyme, :ion_series, :model, :modifications, :raw_filename, :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename].each do |param| send("#{param}=".to_sym, get_null_padded_string(fh, byte_length[param], @combined)) end self end private def get_null_padded_string(fh, bytes, combined=false) st = fh.read(bytes) # for empty declarations if st[0] == 0x000000 return '' end if combined st = st[ 0, st.index(NEWLINE_OR_NULL_RE) ] else st.rstrip! end st end end # the Dta Generation Params class Mspire::Sequest::Srf::DtaGen ## not sure if this is correct # Float attr_accessor :start_time # Float attr_accessor :start_mass # Float attr_accessor :end_mass # Integer attr_accessor :num_dta_files # Integer attr_accessor :group_scan ## not sure if this is correct # Integer attr_accessor :min_group_count # Integer attr_accessor :min_ion_threshold #attr_accessor :intensity_threshold # can't find yet #attr_accessor :precursor_tolerance # can't find yet # Integer attr_accessor :start_scan # Integer attr_accessor :end_scan def self.from_io(io) self.new.from_io(io) end # sets self based on the io object and returns self def from_io(io) io.pos = 0 if io.pos != 0 st = io.read(148) (@start_time, @start_mass, @end_mass, @num_dta_files, @group_scan, @min_group_count, @min_ion_threshold, @start_scan, @end_scan) = st.unpack('x36ex12ex4ex48Ix12IIIII') self end end # total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1) # unknown is, well unknown... Mspire::Sequest::Srf::Dta = Struct.new( *%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks).map(&:to_sym) ) class Mspire::Sequest::Srf::Dta # original # Unpack = "EeIvvvv" Unpack_32 = "EeIvvvv" Unpack_35 = "Ex8eVx2vvvv" # note on peaks (self[7]) # this is a byte array of floats, you can get the peaks out with # unpack("e*") undef_method :inspect def inspect peaks_st = 'nil' if self[7] ; peaks_st = "[#{self[7].size} bytes]" end "" end def self.from_io(fh, unpack_35) (unpack, read_header, read_spacer) = if unpack_35 [Unpack_35, 34, 22] else [Unpack_32, 24, 24] end # get the bulk of the data in single unpack # sets the first 7 attributes dta = self.new(*fh.read(read_header).unpack(unpack)) # Scan numbers are given at the end in an index! fh.read(read_spacer) # throwaway the spacer dta[7] = fh.read(dta.num_peaks * 8) # (num_peaks * 8) is the number of bytes to read dta end def to_dta_file_data string = "#{round(mh, 6)} #{charge}\r\n" peak_ar = peaks.unpack('e*') (0...(peak_ar.size)).step(2) do |i| # %d is equivalent to floor, so we round by adding 0.5! string << "#{round(peak_ar[i], 4)} #{(peak_ar[i+1] + 0.5).floor}\r\n" #string << peak_ar[i,2].join(' ') << "\r\n" end string end # write a class dta file to the io object def write_dta_file(io) io.print to_dta_file_data end # returns a string where the float has been rounded to the specified number # of decimal places def round(float, decimal_places) sprintf("%.#{decimal_places}f", float) end end #Mspire::Sequest::Srf::Out = Struct.new( *%w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count).map(&:to_sym) ) Mspire::Sequest::Srf::Out = Struct.new( *%w(num_hits computer date_time total_inten lowest_sp num_matched_peptides db_locus_count hits first_scan last_scan charge).map(&:to_sym) ) # 0=first_scan, 1=last_scan, 2=charge, 3=num_hits, 4=computer, 5=date_time, 6=hits, 7=total_inten, 8=lowest_sp, 9=num_matched_peptides, 10=db_locus_count class Mspire::Sequest::Srf::Out Unpack_32 = '@36vx2Z*@60Z*' Unpack_35 = '@36vx4Z*@62Z*' undef_method :inspect def inspect hits_s = if self.hits ", @hits(#)=#{hits.size}" else '' end "" end # returns an Mspire::Sequest::Srf::Out object def self.from_io(fh, unpack_35, dup_refs_gt_0) ## EMPTY out file is 96 bytes ## each hit is 320 bytes ## num_hits and charge: st = fh.read(96) # num_hits computer date_time initial_vals = st.unpack( (unpack_35 ? Unpack_35 : Unpack_32) ) # total_inten lowest_sp num_matched_peptides db_locus_count initial_vals.push( *st.unpack('@8eex4Ix4I') ) out_obj = self.new( *initial_vals ) _num_hits = out_obj.num_hits ar = Array.new(_num_hits) if ar.size > 0 num_extra_references = 0 _num_hits.times do |i| ar[i] = Mspire::Sequest::Srf::Out::Peptide.from_io(fh, unpack_35) num_extra_references += ar[i].num_other_loci end if dup_refs_gt_0 Mspire::Sequest::Srf::Out::Peptide.read_extra_references(fh, num_extra_references, ar) end ## The xcorrs are already ordered by best to worst hit ## ADJUST the deltacn's to be meaningful for the top hit: ## (the same as bioworks and prophet) Mspire::Sequest::Srf::Out::Peptide.set_deltacn_from_deltacn_orig(ar) end out_obj.hits = ar out_obj[1].chomp! # computer out_obj end end # deltacn_orig - the one that sequest originally reports (top hit gets 0.0) # deltacn - modified to be that of the next best hit (by xcorr) and the last # hit takes 1.1. This is what is called deltacn by bioworks and pepprophet # (at least for the first few years). If filtering occurs, it will be # updated. # deltacn_orig_updated - the latest updated value of deltacn. # Originally, this will be equal to deltacn_orig. After filtering, this will # be recalculated. To know if this will be different from deltacn_orig, query # match.srf.filtered_by_precursor_mass_tolerance. If this is changed, then # deltacn should also be changed to reflect it. # mh - the theoretical mass + h # proteins are created as SRF prot objects with a reference and linked to their # peptide_hits (from global hash by reference) # ppm = 10^6 * ∆m_accuracy / mass_measured [ where ∆m_accuracy = mass_real – mass_measured ] # This is calculated for the M+H mass! # num_other_loci is the number of other loci that the peptide matches beyond # the first one listed # srf = the srf object this scan came from Mspire::Sequest::Srf::Out::Peptide = Struct.new( *%w(mh deltacn_orig sf sp xcorr id num_other_loci rsp ions_matched ions_total sequence proteins deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated).map(&:to_sym) ) # 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=proteins 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated class Mspire::Sequest::Srf::Out::Peptide # creates the deltacn that is meaningful for the top hit (the deltacn_orig # or the second best hit and so on). # assumes sorted def self.set_deltacn_from_deltacn_orig(ar) (1...ar.size).each {|i| ar[i-1].deltacn = ar[i].deltacn_orig } ar[-1].deltacn = 1.1 end # (assumes sorted) # recalculates deltacn from xcorrs and sets deltacn_orig_updated and deltacn def self.update_deltacns_from_xcorr(ar) if ar.size > 0 top_score = ar.first[4] other_scores = (1...(ar.size)).to_a.map do |i| 1.0 - (ar[i][4]/top_score) end ar.first[21] = 0.0 (0...(ar.size-1)).each do |i| ar[i][20] = other_scores[i] # deltacn ar[i+1][21] = other_scores[i] # deltacn_orig_updated end ar.last[20] = 1.1 end end def self.read_extra_references(fh, num_extra_references, pep_hits) num_extra_references.times do # 80 bytes total (with index number) pep = pep_hits[fh.read(8).unpack('x4I').first - 1] ref = fh.read(80).unpack('A*').first pep[11] << Mspire::Sequest::Srf::Out::Protein.new(ref[0,38]) end # fh.read(6) if unpack_35 end Unpack_35 = '@64Ex8ex8eeeIx18Ivx2vvx8Z*@246Z*' # translation: @64=(64 bytes in to the record), E=mH, x8=8unknown bytes, e=deltacn, # x8=8unknown bytes, e=sf, e=sp, e=xcorr, I=ID#, x18=18 unknown bytes, v=rsp, # v=ions_matched, v=ions_total, x8=8unknown bytes, Z*=sequence, 240Z*=at # byte 240 grab the string (which is proteins). #Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*' Unpack_32 = '@64Ex8ex8eeeIx14Ivvvx8Z*@240Z*' Unpack_four_null_bytes = 'a*' Unpack_Zstar = 'Z*' Read_35 = 426 Read_32 = 320 FourNullBytes_as_string = "\0\0\0\0" #NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0" NewRecordStart = 0x01.chr + 0x00.chr Sequest_record_start = "[SEQUEST]" undef_method :inspect def inspect st = %w(aaseq sequence mh deltacn_orig sf sp xcorr id rsp ions_matched ions_total proteins deltamass ppm base_name first_scan last_scan charge deltacn).map do |v| if v == 'proteins' "#{v}(#)=#{send(v.to_sym).size}" elsif v.is_a? Array "##{v}=#{send(v.to_sym).size}" else "#{v}=#{send(v.to_sym).inspect}" end end st.unshift("<#{self.class}") if srf st.push("srf(base_name)=#{srf.base_name.inspect}") end st.push('>') st.join(' ') #"" end # extra_references_array is an array that grows with peptide_hits as extra # references are discovered. def self.from_io(fh, unpack_35) ## get the first part of the info st = fh.read( unpack_35 ? Read_35 : Read_32 ) ## read all the hit data # sets the the first 11 attributes peptide = self.new( *st.unpack( unpack_35 ? Unpack_35 : Unpack_32 ) ) # set deltacn_orig_updated peptide[21] = peptide[1] # we are slicing the reference to 38 chars to be the same length as # duplicate references peptide[11] = [Mspire::Sequest::Srf::Out::Protein.new(peptide[11][0,38])] peptide[14] = Mspire::Ident::Peptide.sequence_to_aaseq(peptide[10]) fh.read(6) if unpack_35 peptide end end class Mspire::Sequest::Srf::Out::Protein < Mspire::Ident::Protein alias_method :reference, :id # the first entry def first_entry reference.split(' ',2)[0] end end