require 'rexml/document' require 'hash_by' require 'instance_var_set_from_hash' require 'axml' require 'spec_id' require 'array_class' require 'spec_id/parser/proph' module SpecID ; end module SpecID::Prot ; end module SpecID::Pep ; end module Proph class ProtSummary include SpecID # if you get this match it's a protein prophet file and the version is the # first match! Filetype_and_version_re_old = /ProteinProphet_v([\.\d]+)\.dtd/ # gives 1.9 or what else? Filetype_and_version_re_new = /protXML_v([\.\d]+)\.xsd/ # gives 4 right now # inherits prots and peps # the protein groups attr_accessor :prot_groups attr_accessor :version def hi_prob_best ; true end def get_version(file) answer = nil File.open(file) do |fh| 5.times do line = fh.gets answer = if line =~ Filetype_and_version_re_new $1.dup elsif line =~ Filetype_and_version_re_old $1.dup end break if answer end end raise(ArgumentError, "couldn't detect version in #{file}") unless answer answer end def initialize(file=nil) @prots = nil if file @version = get_version(file) #@prot_groups = ProtSummary::Parser.new.parse_file(file) SpecID::Parser::ProtProph.new(:spec_id).parse(file, :spec_id => self) end end # returns a set of unique proteins def unique_prots(prot_groups) all_prots = [] prot_groups.each do |pg| pg.prots.each do |prt| all_prots << prt end end all_prots.hash_by(:protein_name).map{|name,prot_arr| prot_arr.first } end end class ProtSummary::Parser attr_accessor :prot_groups def initialize(file=nil, with_peps=false, tp='axml') if file @prot_groups = parse_file(file, with_peps, tp) end end # returns an array of protein_groups def parse_file(file, with_peps=false, tp='axml') File.open(file) do |fh| @prot_groups = _parse_for_prot_groups(fh, with_peps, tp) end @prot_groups end # returns an array of ProtGroup objects def _parse_for_prot_groups(stream, with_peps=false, tp='axml') prtgrps = [] case tp when 'axml' root = AXML.parse(stream) root.protein_group.each do |protein_group| pg = ProtGroup.new(protein_group.attrs) do protein_group.map do |protein| Prot.new(protein.attrs) end end prtgrps << pg end end prtgrps end end # ProtSummary::Parser class ProtGroup attr_accessor :group_number, :probability, :prots def initialize(args=nil) @prots = [] if args instance_var_set_from_hash(args) end if block_given? @prots = yield end end end end # Proph Proph::Prot = ArrayClass.new(%w(protein_name probability n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description peps)) # note that 'description' is found in the element 'annotation', attribute 'protein_description' # NOTE!: unique_stripped peptides is an array rather than + joined string class Proph::Prot include SpecID::Prot # returns protein_name def name ; self[0] end def reference ; self[0] end def first_entry ; self[0] end # the name is also the first_entry end #def to_s # '' #end # this is a pep from a -prot.xml file Proph::Prot::Pep = ArrayClass.new(%w(peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots)) class Proph::Prot::Pep include SpecID::Pep alias_method :mod_info, :modification_info alias_method :mod_info=, :modification_info= def aaseq ; self[0] end def probability ; self[3] end end # class Pep =begin #attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans #attr_writer :arithmetic_avg_scan_by_parent_time #def initialize(args=nil) # if args # @sequence = args[:sequence] # @probability = args[:probability] ## nsp prob # @filenames = args[:filenames] # @charge = args[:charge] # @nsp_cutoff = args[:nsp_cutoff] # if args.key?(:scans) # @scans = args[:scans] # else # @scans = [] ## this is set later if needed # end # else # @scans = [] # end #end # filter peptides based on the number of scans # if a peptide has more than max_dups scans, the peptide is tossed # note that multiple scans that were used as a single dtafile scan # will be counted as a single scan for these purposes! # (easy, since they are stored as a single item in the array of scans) def self.filter_by_max_dup_scans(max_dups=nil, peps=nil) if max_dups new_peps = [] peps.each do |pep| unless pep.scans.size > max_dups new_peps << pep end end new_peps else peps.dup end end ## from the list of scans, creates a scan object whose time is the ## arithmetic mean of the parent scans (based on prec_inten) and whose ## prec_mz is the avg of all prec_mz's. num is nil, charge is the first def arithmetic_avg_scan_by_parent_time unless @arithmetic_avg_scan_by_parent_time flat_scans = @scans.flatten # new_prec_mz prec_mz_sum = 0.0 prec_inten_sum = 0.0 times = [] intens = [] tot_inten = 0.0 flat_scans.each do |c| prec_inten = c.prec_inten prec_inten_sum += prec_inten prec_mz_sum += c.prec_mz tot_inten += prec_inten times << c.parent.time intens << prec_inten end new_prec_mz = prec_mz_sum / flat_scans.size new_prec_inten = prec_inten_sum / flat_scans.size fraction_inten = [] intens.each do |inten| fraction_inten.push( inten/tot_inten ) end new_time = 0.0 (0...times.size).each do |i| new_time += times[i] * fraction_inten[i] end @arithmetic_avg_scan_by_parent_time = MS::Scan.new( nil, @scans.first.ms_level, new_time, new_prec_mz, new_prec_inten ) end @arithmetic_avg_scan_by_parent_time end def to_s '' end def has_dta?(dta_filename) if @filenames @filenames.each do |fn| if dta_filename == fn return true end end end return false end # Given a list of peptides, returns only those unique based on # sequence/charge def self.uniq_by_seqcharge(peptides) # @TODO: this could be done with one fewer traversals, but it is beautiful peptides.hash_by(:sequence, :charge).collect do |k,v| v.first end end =end =begin # Class for parsing the peptide prophet output files in various ways class Proph::Pep::Parser < Parser # parse_type = "rexml" | "regex" # regex's are about 50 times faster but are not guaranteed to work # seq charge hash is keyed on an array -> [sequence,charge] # @TODO: implement parsing on this with xmlparser def dta_filenames_by_seq_charge(pep_xml_file, parse_type="rexml") seq_charge_hash = Hash.new {|hash,key| hash[key] = [] } case parse_type when "rexml" #puts "READING: " + pep_xml_file + " ..." doc = REXML::Document.new File.new(pep_xml_file) ## Create a hash of peptides based on sequence_charge (takes an array) doc.elements.each("msms_pipeline_analysis/msms_run_summary/search_result") do |result| pep_charge = result.attributes['assumed_charge'] filename = result.attributes['spectrum'] result.elements.to_a('search_hit').each do |hit| pep_seq = hit.attributes['peptide'] seq_charge = [pep_seq, pep_charge] seq_charge_hash[seq_charge] << filename end end seq_charge_hash when "regex" #puts "READING: " + pep_xml_file + " ..." ## Create a hash of peptides based on sequence_charge (takes an array) ## file from peptideAtlas: search_result_regex1 = /= prot_prob_cutoff prob = elem.attributes['probability'].to_f name= elem.attributes['protein_name'] curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff}) peptides = [] elem.elements.to_a('peptide').each do |pep| if pep.attributes['nsp_adjusted_probability'].to_f >= pep_nsp_prob_cutoff && pep.attributes['initial_probability'].to_f >= pep_init_prob_cutoff nsp_prob = pep.attributes['nsp_adjusted_probability'].to_f sequence = pep.attributes['peptide_sequence'] charge = pep.attributes['charge'] pnm = pep.attributes['precursor_neutral_mass'] peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :precursor_neutral_mass => pnm, :nsp_cutoff => pep_nsp_prob_cutoff)) end ## Only take proteins with peptides! if peptides.size > 0 curr_prot.peps = peptides @prots << curr_prot end end end end when "regex" prot_regex = /= prot_prob_cutoff if peptides.size > 0 curr_prot.peps = peptides @prots.push(curr_prot) end end end curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff}) peptides = [] end if line =~ pep_regex sequence = $1.dup rest = $2 if rest =~ pep_else_regex charge = $1 init_prob = $2 nsp_prob = $3 if nsp_prob.to_f >= pep_nsp_prob_cutoff && init_prob.to_f >= pep_init_prob_cutoff peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :nsp_cutoff => pep_nsp_prob_cutoff)) end end end # get the last one: if curr_prot && curr_prot.probability.to_f > prot_prob_cutoff && peptides.size > 0 curr_prot.peps = peptides @prots.push(curr_prot) end end end @prots end end # Prot::Parser ################ --END =end