require 'rexml/document' require 'hash_by' require 'instance_var_set_from_hash' require 'axml' require 'spec_id' class Proph ################ --BEGIN class Parser def root_el(file) AXML.parse_file(file) end end class ProtSummary include SpecID attr_writer :prots attr_accessor :prot_groups def hi_prob_best ; true end def initialize(file=nil) @prots = nil if file @prot_groups = ProtSummary::Parser.new.parse_file(file) end end def prots if @prots ; @prots else @prots = unique_prots(@prot_groups) @prots end end # returns a set of unique proteins def unique_prots(prot_groups) all_prots = [] prot_groups.each do |pg| pg.prots.each do |prt| all_prots << prt end end all_prots.hash_by(:protein_name).map{|name,prot_arr| prot_arr.first } end end class ProtSummary::Parser < Parser attr_accessor :prot_groups def initialize(file=nil, with_peps=false, tp='axml') if file @prot_groups = parse_file(file, with_peps, tp) end end # returns an array of protein_groups def parse_file(file, with_peps=false, tp='axml') File.open(file) do |fh| @prot_groups = _parse_for_prot_groups(fh, with_peps, tp) end @prot_groups end # returns an array of ProtGroup objects def _parse_for_prot_groups(stream, with_peps=false, tp='axml') prtgrps = [] case tp when 'axml' root = AXML.parse(stream) root.protein_group.each do |protein_group| pg = ProtGroup.new(protein_group.attrs) do protein_group.map do |protein| Prot.new(protein.attrs) end end prtgrps << pg end end prtgrps end end # ProtSummary::Parser class ProtGroup attr_accessor :group_number, :probability, :prots def initialize(args=nil) @prots = [] if args instance_var_set_from_hash(args) end if block_given? @prots = yield end end end class Prot include SpecID::Prot ## probability and reference accessors are inherited attr_accessor :peps, :protein_name, :cutoff, :group_sibling_id, :n_indistinguishable_proteins, :percent_coverage, :unique_stripped_peptides, :total_number_peptides, :pct_spectrum_ids, :description # returns protein_name def name ; @protein_name end def reference ; @protein_name end def initialize(args) self.instance_var_set_from_hash(args) if @probability ; @probability = @probability.to_f end end # def self.uniq_prots_with_prob_and_reference(file) # root = Parser.root_el(file) # prots = [] # root.protein_group.each do |group| # group.protein.each do |prt| # #prots << prt # prots << # end # end # # un_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first } # # end def to_s '' end end # class Prot class Pep include SpecID::Pep attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans attr_writer :arithmetic_avg_scan_by_parent_time def initialize(args=nil) if args @sequence = args[:sequence] @probability = args[:probability] ## nsp prob @filenames = args[:filenames] @charge = args[:charge] @nsp_cutoff = args[:nsp_cutoff] if args.key?(:scans) @scans = args[:scans] else @scans = [] ## this is set later if needed end else @scans = [] end end # filter peptides based on the number of scans # if a peptide has more than max_dups scans, the peptide is tossed # note that multiple scans that were used as a single dtafile scan # will be counted as a single scan for these purposes! # (easy, since they are stored as a single item in the array of scans) def self.filter_by_max_dup_scans(max_dups=nil, peps=nil) if max_dups new_peps = [] peps.each do |pep| unless pep.scans.size > max_dups new_peps << pep end end new_peps else peps.dup end end ## from the list of scans, creates a scan object whose time is the ## arithmetic mean of the parent scans (based on prec_inten) and whose ## prec_mz is the avg of all prec_mz's. num is nil, charge is the first def arithmetic_avg_scan_by_parent_time unless @arithmetic_avg_scan_by_parent_time flat_scans = @scans.flatten # new_prec_mz prec_mz_sum = 0.0 prec_inten_sum = 0.0 times = [] intens = [] tot_inten = 0.0 flat_scans.each do |c| prec_inten = c.prec_inten prec_inten_sum += prec_inten prec_mz_sum += c.prec_mz tot_inten += prec_inten times << c.parent.time intens << prec_inten end new_prec_mz = prec_mz_sum / flat_scans.size new_prec_inten = prec_inten_sum / flat_scans.size fraction_inten = [] intens.each do |inten| fraction_inten.push( inten/tot_inten ) end new_time = 0.0 (0...times.size).each do |i| new_time += times[i] * fraction_inten[i] end @arithmetic_avg_scan_by_parent_time = Spec::Scan.new( nil, @scans.first.ms_level, new_time, new_prec_mz, new_prec_inten ) end @arithmetic_avg_scan_by_parent_time end def to_s '' end def has_dta?(dta_filename) if @filenames @filenames.each do |fn| if dta_filename == fn return true end end end return false end # Given a list of peptides, returns only those unique based on # sequence/charge def self.uniq_by_seqcharge(peptides) # @TODO: this could be done with one fewer traversals, but it is beautiful peptides.hash_by(:sequence, :charge).collect do |k,v| v.first end end end # class Pep # Class for parsing the peptide prophet output files in various ways class Pep::Parser < Parser # parse_type = "rexml" | "regex" # regex's are about 50 times faster but are not guaranteed to work # seq charge hash is keyed on an array -> [sequence,charge] # @TODO: implement parsing on this with xmlparser def dta_filenames_by_seq_charge(pep_xml_file, parse_type="rexml") seq_charge_hash = Hash.new {|hash,key| hash[key] = [] } case parse_type when "rexml" #puts "READING: " + pep_xml_file + " ..." doc = REXML::Document.new File.new(pep_xml_file) ## Create a hash of peptides based on sequence_charge (takes an array) doc.elements.each("msms_pipeline_analysis/msms_run_summary/search_result") do |result| pep_charge = result.attributes['assumed_charge'] filename = result.attributes['spectrum'] result.elements.to_a('search_hit').each do |hit| pep_seq = hit.attributes['peptide'] seq_charge = [pep_seq, pep_charge] seq_charge_hash[seq_charge] << filename end end seq_charge_hash when "regex" #puts "READING: " + pep_xml_file + " ..." ## Create a hash of peptides based on sequence_charge (takes an array) ## file from peptideAtlas: search_result_regex1 = /= prot_prob_cutoff prob = elem.attributes['probability'].to_f name= elem.attributes['protein_name'] curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff}) peptides = [] elem.elements.to_a('peptide').each do |pep| if pep.attributes['nsp_adjusted_probability'].to_f >= pep_nsp_prob_cutoff && pep.attributes['initial_probability'].to_f >= pep_init_prob_cutoff nsp_prob = pep.attributes['nsp_adjusted_probability'].to_f sequence = pep.attributes['peptide_sequence'] charge = pep.attributes['charge'] pnm = pep.attributes['precursor_neutral_mass'] peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :precursor_neutral_mass => pnm, :nsp_cutoff => pep_nsp_prob_cutoff)) end ## Only take proteins with peptides! if peptides.size > 0 curr_prot.peps = peptides @prots << curr_prot end end end end when "regex" prot_regex = /= prot_prob_cutoff if peptides.size > 0 curr_prot.peps = peptides @prots.push(curr_prot) end end end curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff}) peptides = [] end if line =~ pep_regex sequence = $1.dup rest = $2 if rest =~ pep_else_regex charge = $1 init_prob = $2 nsp_prob = $3 if nsp_prob.to_f >= pep_nsp_prob_cutoff && init_prob.to_f >= pep_init_prob_cutoff peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :nsp_cutoff => pep_nsp_prob_cutoff)) end end end # get the last one: if curr_prot && curr_prot.probability.to_f > prot_prob_cutoff && peptides.size > 0 curr_prot.peps = peptides @prots.push(curr_prot) end end end @prots end end # Prot::Parser ################ --END end # Proph