require 'optparse'
require 'mspire/digester'
require 'mspire/fasta'
require 'mspire/ident/peptide/db'

class Mspire::Ident::Peptide::Db::Creator
  MAX_NUM_AA_EXPANSION = 3

  # the twenty standard amino acids
  STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)

  EXPAND_AA = {'X' => STANDARD_AA}

  DEFAULT_PEPTIDE_CENTRIC_DB = {
    missed_cleavages: 2,
    min_length: 5,
    enzyme: Mspire::Digester[:trypsin],
    remove_digestion_file: true,
    cleave_initiator_methionine: true,
    expand_aa: false,
    uniprot: true,
    data_type: :hash,
  }

  # sets defaults according to DEFAULT_PEPTIDE_CENTRIC_DB
  def self.cmdline(argv)
    opt = DEFAULT_PEPTIDE_CENTRIC_DB.dup
    opts = OptionParser.new do |op|
      op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
      op.separator "output:"
      op.separator "    <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
      op.separator "format:"
      op.separator "    PEPTIDE: ID1<tab>ID2<tab>ID3..."
      op.separator ""
      op.separator "  Initiator Methionines - by default, will generate two peptides"
      op.separator "  for any peptide found at the N-terminus starting with 'M'"
      op.separator "  (i.e., one with and one without the leading methionine)"
      op.separator ""
      op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
      op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
      op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
      op.on("--expand-x", "enumerate all aa possibilities for 'X'", "(default is to remove these peptides)") {|v| opt[:expand_aa] = v }
      op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false }
      #op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v }
      op.on("--data-type <#{opt[:data_type]}>", "hash|google_hash|trie", "need google_hash or fast_trie gem installed") {|v| opt[:data_type] = v.to_sym }
      op.on("-e", "--enzyme <#{opt[:enzyme].name}>", "enzyme for digestion (Mspire::Digester[name])") {|v| opt[:enzyme] = Mspire::Digester[v] }
      op.on("-v", "--verbose", "talk about it") { $VERBOSE = 5 }
      op.on("--list-enzymes", "lists approved enzymes and exits") do
        puts Mspire::Digester::ENZYMES.keys.join("\n")
        exit
      end
    end
    opts.parse!(argv)

    if argv.size == 0
      puts opts
      exit
    end

    argv.map do |file|
      creator = Mspire::Ident::Peptide::Db::Creator.new
      creator.create(file, opt)
    end
  end
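  # Illustrative example (not from the original source; "yeast.fasta" is a
  # hypothetical file name): with the defaults above, a call such as
  #
  #   Mspire::Ident::Peptide::Db::Creator.cmdline(%w(yeast.fasta))
  #
  # digests the file with trypsin allowing 2 missed cleavages and writes
  # "yeast.msd_clvg2.min_aaseq5.yml" next to the input.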
  # returns the name of the digestion file that was written
  def create_digestion_file(fasta_file, opts={})
    opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
    (missed_cleavages, enzyme, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :enzyme, :cleave_initiator_methionine, :expand_aa)
    start_time = Time.now
    print "Digesting #{fasta_file} ..." if $VERBOSE

    letters_to_expand_re = Regexp.new("[" << Regexp.escape(EXPAND_AA.keys.join) << "]")

    base = fasta_file.chomp(File.extname(fasta_file))
    digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
    File.open(digestion_file, "w") do |fh|
      Mspire::Fasta.open(fasta_file) do |fasta|
        fasta.each do |prot|
          peptides = enzyme.digest(prot.sequence, missed_cleavages)
          if cleave_initiator_methionine && (prot.sequence[0,1] == "M")
            m_peps = []
            peptides.each do |pep|
              # if the peptide is at the beginning of the protein sequence,
              # also emit it without the initiator methionine
              if prot.sequence[0, pep.size] == pep
                m_peps << pep[1..-1]
              end
            end
            peptides.push(*m_peps)
          end
          peptides =
            if expand_aa
              # expand_peptides returns nil past MAX_NUM_AA_EXPANSION, so drop those
              peptides.flat_map do |pep|
                (pep =~ letters_to_expand_re) ? expand_peptides(pep, EXPAND_AA) : pep
              end.compact
            else
              peptides.select {|pep| pep !~ letters_to_expand_re }
            end
          header = prot.header
          id = opts[:uniprot] ? Mspire::Fasta.uniprot_id(header) : header.split(/\s/, 2).first
          fh.puts( id + "\t" + peptides.join(" ") )
        end
      end
    end
    puts "#{Time.now - start_time} sec" if $VERBOSE
    digestion_file
  end

  # returns the full path of the created file
  def db_from_fasta_digestion_file(digestion_file, opts={})
    opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

    start_time = Time.now
    puts "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
    hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:data_type])
    puts "#{Time.now - start_time} sec" if $VERBOSE

    base = digestion_file.chomp(File.extname(digestion_file))
    final_outfile =
      if opts[:data_type] == :trie
        base + ".min_aaseq#{opts[:min_length]}"
      else
        base + ".min_aaseq#{opts[:min_length]}" + ".yml"
      end

    start_time = Time.now
    print "Writing #{hash_like.size} peptides to #{final_outfile} ..." if $VERBOSE
    if opts[:data_type] == :trie
      trie = hash_like
      trie.save(final_outfile)
    else
      File.open(final_outfile, 'w') do |out|
        hash_like.each do |k, v|
          out.puts( [k, v.join(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)].join(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER) )
          #out.puts "#{k}#{Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER}#{v}"
        end
      end
    end
    puts "#{Time.now - start_time} sec" if $VERBOSE

    if opts[:remove_digestion_file]
      File.unlink(digestion_file)
    end
    File.expand_path(final_outfile)
  end

  def get_a_trie
    begin
      require 'trie'
    rescue LoadError
      # a bare rescue would not catch LoadError, so be explicit
      raise LoadError, "must first install fast_trie"
    end
    Trie.new
  end
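  # Illustrative data-flow sketch (the peptide strings are made up; "P31946"
  # mirrors the comment below): create_digestion_file writes one line per
  # protein,
  #
  #   P31946\tMDKSELVQK AFDDAIAELDTLSEESYK ...
  #
  # hash_like_from_digestion_file inverts that into a peptide-centric map,
  #
  #   { "AFDDAIAELDTLSEESYK" => ["P31946", ...], ... }
  #
  # and db_from_fasta_digestion_file serializes the map one peptide per line
  # using KEY_VALUE_DELIMITER and PROTEIN_DELIMITER.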
  # data_type can be :hash (default), :google_hash (better memory and speed),
  # or :trie (experimental/broken)
  def hash_like_from_digestion_file(digestion_file, min_length, data_type=:hash)
    case data_type
    when :trie
      trie = get_a_trie
      ::IO.foreach(digestion_file) do |line|
        line.chomp!
        (prot, *peps) = line.split(/\s+/)
        # prot is something like this: "P31946"
        peps.uniq!
        peps.each do |pep|
          if pep.size >= min_length
            if trie.has_key?(pep)
              ar = trie.get(pep)
              ar << prot
            else
              trie.add( pep, [prot] )
            end
          end
        end
      end
      trie
    when :hash, :google_hash
      hash =
        case data_type
        when :hash
          Hash.new
        when :google_hash
          require 'google_hash'
          # this guy is very slow compared to ruby hash (10X slower?) but more
          # memory efficient (but only ~10% or so improvement with this
          # application)
          GoogleHashDenseRubyToRuby.new
          # still need to try the Sparse one out and compare performance
          #GoogleHashSparseRubyToRuby.new
        end
      ::IO.foreach(digestion_file) do |line|
        line.chomp!
        (prot, *peps) = line.split(/\s+/)
        # prot is something like this: "P31946"
        peps.uniq!
        peps.each do |pep|
          if pep.size >= min_length
            if hash.key?(pep)
              hash[pep] << prot
            else
              hash[pep] = [prot]
            end
          end
        end
      end
      hash
    end
  end

  # creates a temporary digestion file holding all peptides from digestion
  # with the given missed_cleavages (min_length is not applied to that file,
  # only to the final peptide-centric db), then writes the db with the added
  # 'min_aaseq<min_length>' extension.
  # returns the full path of the written file.
  def create(fasta_file, opts={})
    opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
    digestion_file = create_digestion_file(fasta_file, opts)
    puts "created file of size: #{File.size(digestion_file)}" if $VERBOSE
    db_from_fasta_digestion_file(digestion_file, opts)
  end

  # does combinatorial expansion of every letter that requests it.
  # expand_aa_hash is a hash like: {'X' => STANDARD_AA}
  # returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
  # be expanded
  # returns an array holding just the original peptide if nothing expands
  def expand_peptides(peptide, expand_aa_hash)
    letters_in_order = expand_aa_hash.keys.sort
    index_and_key = []
    peptide.split('').each_with_index do |char, i|
      if let_index = letters_in_order.index(char)
        index_and_key << [i, letters_in_order[let_index]]
      end
    end
    if index_and_key.size > MAX_NUM_AA_EXPANSION
      return nil
    end
    to_expand = [peptide]
    index_and_key.each do |i, letter|
      new_peps = []
      while current_pep = to_expand.shift
        new_peps << expand_aa_hash[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
      end
      to_expand = new_peps.flatten
    end
    to_expand
  end
end
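
# Minimal usage sketch (added for illustration; file names below are
# hypothetical). The guard keeps this from running when the file is merely
# required as a library.
if __FILE__ == $0
  creator = Mspire::Ident::Peptide::Db::Creator.new

  # combinatorial expansion of an ambiguous 'X' residue, shown against a small
  # stand-in for EXPAND_AA:
  p creator.expand_peptides('PEPXIDE', 'X' => %w(A C))  # => ["PEPAIDE", "PEPCIDE"]

  # build a peptide-centric db (e.g. "yeast.msd_clvg2.min_aaseq5.yml") for
  # each fasta file named on the command line:
  ARGV.each do |fasta_file|
    puts creator.create(fasta_file)
  end
end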