require 'tempfile'

# Top level module / namespace.
module NpSearch
  # Scores a sequence on features associated with neuropeptide precursors:
  # cleavage sites, C-terminal glycines, similarity between the candidate
  # peptides and acidic spacers.
  #
  # Every method takes a `sequence` object that must respond to `seq`,
  # `score`, `score=`, `potential_cleaved_nps` and `potential_cleaved_nps=`.
  # All scoring methods add to `sequence.score` in place.
  class ScoreSequence
    class << self
      DI_CLV        = 'KR|RR|KK'.freeze
      MONO_NP_CLV_2 = '[KR]..R'.freeze
      MONO_NP_CLV_4 = '[KR]....R'.freeze
      MONO_NP_CLV_6 = '[KR]......R'.freeze
      NP_CLV        = "(#{DI_CLV})|(#{MONO_NP_CLV_2})|(#{MONO_NP_CLV_4})|" \
                      "(#{MONO_NP_CLV_6})".freeze

      # Hash keys for the nine capture groups of the splitting regex, in
      # capture order: the four cleavage-site alternatives before the peptide,
      # the peptide itself, and the four alternatives after the peptide.
      CAPTURE_KEYS = %w(di_clv_st mono_2_clv_st mono_4_clv_st mono_6_clv_st np
                        di_clv_end mono_2_clv_end mono_4_clv_end
                        mono_6_clv_end).map(&:to_sym).freeze

      # Runs every scoring step on +sequence+.
      #
      # The sequence is split into candidate peptides first, because every
      # later step reads `sequence.potential_cleaved_nps`.
      #
      # @param sequence [Object] the sequence to score (mutated in place)
      # @param opt [Hash] options; `opt[:temp_dir]` is the directory used for
      #   the temporary files written for cd-hit
      def run(sequence, opt)
        split_into_potential_neuropeptides(sequence)
        count_np_cleavage_sites(sequence)
        count_c_terminal_glycines(sequence)
        np_similarity(sequence, opt[:temp_dir])
        acidic_spacers(sequence)
      end

      private

      # Splits `sequence.seq` at the cleavage sites described by NP_CLV and
      # stores the result in `sequence.potential_cleaved_nps` as an Array of
      # Hashes keyed by CAPTURE_KEYS (`:np` holds the peptide; the `*_st` and
      # `*_end` keys hold the flanking cleavage site that matched, or nil).
      def split_into_potential_neuropeptides(sequence)
        results = sequence.seq.scan(/(?<=^|#{NP_CLV})(\w+?)(?=#{NP_CLV}|$)/i)
        sequence.potential_cleaved_nps = results.map do |captures|
          Hash[CAPTURE_KEYS.zip(captures)]
        end
      end

      # Scores the cleavage site that follows each candidate peptide.
      def count_np_cleavage_sites(sequence)
        sequence.potential_cleaved_nps.each do |e|
          count_dibasic_np_clv(sequence, e[:di_clv_end])
          count_mono_basic_np_clv(sequence, e[:mono_2_clv_end],
                                  e[:mono_4_clv_end], e[:mono_6_clv_end])
        end
      end

      # Adds 0.09 for a 'KR' dibasic site and 0.05 for 'RR' or 'KK'.
      # Any other value (including nil) leaves the score untouched.
      def count_dibasic_np_clv(sequence, dibasic_clv)
        case dibasic_clv
        when 'KR'
          sequence.score += 0.09
        when 'RR', 'KK'
          sequence.score += 0.05
        end
      end

      # Adds 0.02 if any of the three monobasic cleavage patterns matched.
      def count_mono_basic_np_clv(sequence, mono_2_clv, mono_4_clv, mono_6_clv)
        return if mono_2_clv.nil? && mono_4_clv.nil? && mono_6_clv.nil?
        sequence.score += 0.02
      end

      # Scores C-terminal glycines of each candidate peptide. Only the first
      # matching rule applies per peptide:
      #   * ends in FG and is followed by 'KR' => 0.40
      #   * ends in G  and is followed by 'KR' => 0.25
      #   * ends in G, GK or GR                => 0.10
      def count_c_terminal_glycines(sequence)
        sequence.potential_cleaved_nps.each do |e|
          if e[:np] =~ /FG$/ && e[:di_clv_end] == 'KR'
            sequence.score += 0.40
          elsif e[:np] =~ /G$/ && e[:di_clv_end] == 'KR'
            sequence.score += 0.25
          elsif e[:np] =~ /G$|GK$|GR$/
            sequence.score += 0.10
          end
        end
      end

      # Adds 0.10 for every acidic spacer that is detected.
      # An acidic spacer is a candidate peptide that is at most 25% of the
      # length of `sequence.seq` and in which more than 50% of the residues
      # are D or E.
      # NOTE(review): the original comment says the precursor length excludes
      # the signal peptide; that depends on what the caller stores in
      # `sequence.seq` - confirm.
      def acidic_spacers(sequence)
        precursor_length = sequence.seq.length
        sequence.potential_cleaved_nps.each do |e|
          np = e[:np]
          # fdiv keeps the ratios fractional; Integer#/ would truncate them
          # to 0 or 1 and make both thresholds meaningless.
          next if np.length.fdiv(precursor_length) > 0.25
          sequence.score += 0.10 if np.count('DE').fdiv(np.length) > 0.5
        end
      end

      # Adds 0.15 per member for every cd-hit cluster that contains more than
      # one candidate peptide.
      #
      # @param results [String, nil] contents of a cd-hit `.clstr` file; when
      #   nil, cd-hit is run on the candidate peptides to produce it
      def np_similarity(sequence, temp_dir, results = nil)
        results = run_cdhit(sequence, temp_dir) if results.nil?
        # run_cdhit returns nil when there are no candidate peptides.
        return if results.nil?
        results.split(/^>Cluster \d+\n/).each do |cluster|
          members = cluster.split("\n").length
          sequence.score += 0.15 * members if members > 1
        end
      end

      # Clusters the candidate peptides with the external `cd-hit` program and
      # returns the text of the resulting `.clstr` file, or nil when there are
      # no candidate peptides. The temporary input/output files and the
      # `.clstr` file are removed before returning.
      def run_cdhit(sequence, temp_dir)
        f  = Tempfile.new('clust', temp_dir)
        fo = Tempfile.new('clust_out', temp_dir)
        return unless write_potential_peptides_to_tempfile(sequence, f)
        clstr_path = "#{fo.path}.clstr"
        `cd-hit -c 0.5 -n 3 -l 4 -i #{f.path} -o #{fo.path}`
        IO.read(clstr_path)
      ensure
        [f, fo].compact.each do |tmp|
          tmp.close unless tmp.closed?
          tmp.unlink
        end
        File.delete(clstr_path) if clstr_path && File.exist?(clstr_path)
      end

      # Writes the candidate peptides to +tempfile+ in FASTA format (headers
      # `>seq0`, `>seq1`, ...) and closes it.
      #
      # @return [Boolean] false (nothing written, file left open) when there
      #   are no candidate peptides, true otherwise
      def write_potential_peptides_to_tempfile(sequence, tempfile)
        return false if sequence.potential_cleaved_nps.empty?
        records = sequence.potential_cleaved_nps.each_with_index.map do |e, i|
          ">seq#{i}\n#{e[:np]}\n"
        end
        tempfile.write(records.join)
        tempfile.close
        true
      end
    end
  end
end