lib/npsearch/scoresequence.rb in npsearch-2.0.1 vs lib/npsearch/scoresequence.rb in npsearch-2.1.0
- old
+ new
@@ -1,109 +1,111 @@
-require 'csv'
-require 'tempfile'
-
+# Top level module / namespace.
module NpSearch
# A class to score the Sequences
class ScoreSequence
class << self
- DI_CLV = 'KR|RR|KK'
- MONO_NP_CLV_2 = '[KR]..R'
- MONO_NP_CLV_4 = '[KR]....R'
- MONO_NP_CLV_6 = '[KR]......R'
+ DI_CLV = 'KR|RR|KK'.freeze
+ MONO_NP_CLV_2 = '[KR]..R'.freeze
+ MONO_NP_CLV_4 = '[KR]....R'.freeze
+ MONO_NP_CLV_6 = '[KR]......R'.freeze
NP_CLV = "(#{DI_CLV})|(#{MONO_NP_CLV_2})|(#{MONO_NP_CLV_4})|" \
- "(#{MONO_NP_CLV_6})"
+ "(#{MONO_NP_CLV_6})".freeze
- def run(sequence)
- @sequence = sequence
- split_into_neuropeptides
- count_np_cleavage_sites
- count_c_terminal_glycines
- np_similarity
- acidic_spacers
+ def run(sequence, opt)
+ split_into_potential_neuropeptides(sequence)
+ count_np_cleavage_sites(sequence)
+ count_c_terminal_glycines(sequence)
+ np_similarity(sequence, opt[:temp_dir])
+ acidic_spacers(sequence)
end
private
- def split_into_neuropeptides
+ def split_into_potential_neuropeptides(sequence)
potential_nps = []
- results = @sequence.seq.scan(/(?<=^|#{NP_CLV})(\w+?)(?=#{NP_CLV}|$)/i)
+ results = sequence.seq.scan(/(?<=^|#{NP_CLV})(\w+?)(?=#{NP_CLV}|$)/i)
headers = %w(di_clv_st mono_2_clv_st mono_4_clv_st mono_6_clv_st np
di_clv_end mono_2_clv_end mono_4_clv_end mono_6_clv_end)
results.each { |e| potential_nps << Hash[headers.map(&:to_sym).zip(e)] }
- @sequence.potential_cleaved_nps = potential_nps
+ sequence.potential_cleaved_nps = potential_nps
end
- def count_np_cleavage_sites
- @sequence.potential_cleaved_nps.each do |e|
- count_dibasic_np_clv(e[:di_clv_end])
- count_mono_basic_np_clv(e[:mono_2_clv_end], e[:mono_4_clv_end],
- e[:mono_6_clv_end])
+ def count_np_cleavage_sites(sequence)
+ return if sequence.potential_cleaved_nps.empty?
+ sequence.potential_cleaved_nps.each do |e|
+ count_dibasic_np_clv(sequence, e[:di_clv_end])
+ count_mono_basic_np_clv(sequence, e[:mono_2_clv_end],
+ e[:mono_4_clv_end], e[:mono_6_clv_end])
end
end
- def count_dibasic_np_clv(dibasic_clv)
+ def count_dibasic_np_clv(sequence, dibasic_clv)
case dibasic_clv
when 'KR'
- @sequence.score += 0.09
+ sequence.score += 0.09
when 'RR', 'KK'
- @sequence.score += 0.05
+ sequence.score += 0.05
end
end
- def count_mono_basic_np_clv(mono_2_clv, mono_4_clv, mono_6_clv)
+ def count_mono_basic_np_clv(sequence, mono_2_clv, mono_4_clv, mono_6_clv)
return if mono_2_clv.nil? && mono_4_clv.nil? && mono_6_clv.nil?
- @sequence.score += 0.02
+ sequence.score += 0.02
end
# Counts the number of C-terminal glycines
- def count_c_terminal_glycines
- @sequence.potential_cleaved_nps.each do |e|
- if e[:np] =~ /G$/ && e[:di_clv_end] == 'KR'
- @sequence.score += 0.25
+ def count_c_terminal_glycines(sequence)
+ return if sequence.potential_cleaved_nps.empty?
+ sequence.potential_cleaved_nps.each do |e|
+ if e[:np] =~ /FG$/ && e[:di_clv_end] == 'KR'
+ sequence.score += 0.40
+ elsif e[:np] =~ /G$/ && e[:di_clv_end] == 'KR'
+ sequence.score += 0.25
elsif e[:np] =~ /G$|GK$|GR$/
- @sequence.score += 0.10
+ sequence.score += 0.10
end
end
end
- def acidic_spacers
- @sequence.potential_cleaved_nps.each do |e|
- acidic_residue = e[:np].count('DE')
- percentage_acidic = acidic_residue / e[:np].length
- @sequence.score += 0.10 if percentage_acidic > 0.5
+ # Adds 0.10 if the acidic spacer is detected.
+ # Acidic Spacer is defined as being less than 25% of the precursor length
+ # (not including the Signalp) && having more than 50% D and E amino acids.
+ def acidic_spacers(sequence)
+ sequence.potential_cleaved_nps.each do |e|
+ next if e[:np].length / sequence.seq.length > 0.25
+ sequence.score += 0.10 if e[:np].count('DE') / e[:np].length > 0.5
end
end
- def np_similarity
- results = run_uclust
- results.gsub!(/^[^C].*\n/, '')
- results.each_line do |c|
- cluster = c.split(/\t/)
- no_of_seq_in_cluster = cluster[3].to_i
- if no_of_seq_in_cluster > 1
- @sequence.score += (0.15 * no_of_seq_in_cluster)
+ def np_similarity(sequence, temp_dir, results = nil)
+ results = run_cdhit(sequence, temp_dir) if results.nil?
+ clusters = results.split(/^>Cluster \d+\n/)
+ clusters.each do |c|
+ next if c.nil?
+ no_of_seqs_in_cluster = c.split("\n").length
+ if no_of_seqs_in_cluster > 1
+ sequence.score += (0.15 * no_of_seqs_in_cluster)
end
end
end
- def run_uclust
- f = Tempfile.new('uclust')
- fo = Tempfile.new('uclust_out')
- write_sequence_content_to_tempfile(f)
- `usearch -cluster_fast #{f.path} -id 0.5 -uc #{fo.path} >/dev/null 2>&1`
- IO.read(fo.path)
- ensure
- f.unlink
- fo.unlink
+ def run_cdhit(sequence, temp_dir)
+ f = Tempfile.new('clust', temp_dir)
+ fo = Tempfile.new('clust_out', temp_dir)
+ return unless write_potential_peptides_to_tempfile(sequence, f)
+ `cd-hit -c 0.5 -n 3 -l 4 -i #{f.path} -o #{fo.path}`
+ IO.read("#{fo.path}.clstr")
end
- def write_sequence_content_to_tempfile(tempfile)
- content = ''
- @sequence.potential_cleaved_nps.each_with_index do |e, i|
- content += ">seq#{i}\n#{e[:np]}\n"
+ def write_potential_peptides_to_tempfile(sequence, tempfile)
+ return false if sequence.potential_cleaved_nps.empty?
+ sequences = ''
+ sequence.potential_cleaved_nps.each_with_index do |e, i|
+ sequences += ">seq#{i}\n#{e[:np]}\n"
end
- tempfile.write(content)
+ tempfile.write(sequences)
tempfile.close
+ true
end
end
end
end