require "getoptlong" require "rdoc/usage" require "logger" require "rubygems" require "narray" require "bio" require "set" require "facets" require "narray_extensions" require "nmatrix_extensions" require "enumerable_extensions" require "environment_feature" require "environment" module Rubst class CLI def self.print_version puts Rubst::VERSION end def self.print_usage puts <<-USAGE rubst [ options ] -f TEM-file or rubst [ options ] -l TEMLIST-file Available options: -h, --help show help -f, --tem-file FILE a tem file -l, --tem-list FILE a list for tem files -o, --outfile FILE output filename ("allmat.dat" if not specified) --weight INTEGER (PID) clustering level (PID) for the BLOSUM-like weighting (not supported yet) --noweight calculate substitution counts with no weights (default) -c, --classdef FILE a file for the defintion of environments (default: 'classdef.dat') -y, --cys INTEGER 0 for using C and J only for structure 1 for both structure and sequence (default) -output INTEGER 0 for raw counts (no-smoothing performed) 1 for probabilities 2 for log-odds (default) --scale INTEGER log-odds matrices in 1/n bit units (default 3) --sigma DOUBLE change the sigma value for smoothing (default 5) --add DOUBLE add this value to raw counts when deriving log-odds without smoothing (default 1/#classes) --penv use environment-dependent frequencies for log-odds calculation (default false) --pidmin DOUBLE count substitutions only for pairs with PID equal to or greater than this value (default none) --pidmax DOUBLE count substitutions only for pairs with PID smaller than this value (default none) --analysis analyze structural environments (also -D) -v, --verbose INTEGER 0 for WARN level (default) 1 for INFO level or more severe 2 for DEBUG level or more sever --version print version USAGE end def self.execute(arguments=[]) # # Abbreviations in the source codes # # * env: environment # * tem: (FUGUE) template # * classdef: (envlironment) class definition # * aa: amino acid # * tot: total # * rel: relative # * obs: observation (frequency) # * mut: mutation # * mutb: mutability # * freq: frequency # * prob: probability # * opts: options # # Global variables & Abbreviations # $logger = Logger.new(STDOUT) $amino_acids = "ACDEFGHIKLMNPQRSTVWYJ".split("") $tem_list = nil $tem_file = nil $classdef = nil $outfile = nil $format = nil $aa_tot_obs = {} $aa_mut_obs = {} $aa_mutb = {} $aa_rel_mutb = {} $aa_rel_freq = {} $tot_aa = 0 $sigma = 5.0 $smooth_prob = {} # # Options parsing # opts = GetoptLong.new( [ '--help', '-h', GetoptLong::NO_ARGUMENT ], [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ], [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ], [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ], [ '--output', GetoptLong::REQUIRED_ARGUMENT ], [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ], [ '-o', GetoptLong::REQUIRED_ARGUMENT ], [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ], [ '--version', GetoptLong::NO_ARGUMENT ] ) opts.each do |opt, arg| case opt when '--help' print_usage when '--tem-list' $tem_list = arg when '--tem-file' $tem_file = arg when '--classdef' $classdef = arg || 'classdef.dat' when '--output' $format = arg.to_i || 2 when '-o' $outfile = arg || 'allmat.dat' when '--cyc' $cysteine = (arg.to_i == 1 ? false : true) when '--verbose' $logger.level = case arg.to_i when 0 then Logger::WARN when 1 then Logger::INFO when 2 then Logger::DEBUG else Logger::INFO end when '--version' print_version end end if ARGV.length != 0 or !$tem_list && !$tem_file or $tem_list && $tem_file print_usage exit 0 end # # Reading Environment Class Definition File # $env_features = [] $env_features << EnvironmentFeature.new("sequence", $amino_acids, $amino_acids, "F", "F") IO.foreach($classdef) do |l| next if l =~ /^#/ if (env_ftr = l.chomp.split(/;/)).length == 5 $logger.info ">>> An environment feature, #{l.chomp} detected" if env_ftr[-1] == "T" $logger.warn "!!! The environment feature, #{l.chomp} silent" next end if env_ftr[-2] == "T" $logger.warn "!!! The environment feature, #{l.chomp} constrained" end $env_features << EnvironmentFeature.new(env_ftr[0], env_ftr[1].split(""), env_ftr[2].split(""), env_ftr[3], env_ftr[4]) end end $envs = {} $env_features.inject([]) { |sum, ec| sum << ec.labels }.inject { |pro, lb| pro.product(lb) }.each_with_index { |e, i| $envs[e.flatten.join] = Environment.new(i, e.flatten.join) } # # Reading TEM list file # if $tem_list IO.foreach($tem_list) do |tem_file| tem_file.chomp! $logger.debug ">>> Processing #{tem_file} ..." ali = Bio::Alignment::OriginalAlignment.new ff = Bio::FlatFile.auto(tem_file) ff.each_entry do |pir| if pir.definition == "sequence" ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id) end end env_labels = {} disulphide = {} ali.each_pair do |key, seq| # check disulphide bond environment first! ff.rewind ff.each_entry do |pir| if (pir.entry_id == key) && (pir.definition == "disulphide") disulphide[key] = pir.data.gsub("\n", "").split("") end end $env_features.each_with_index do |ec, ei| env_labels[key] = [] unless env_labels.has_key?(key) ff.rewind ff.each_entry do |pir| if (pir.entry_id == key) && (pir.definition == ec.name) labels = pir.data.gsub("\n", "").split("").map_with_index { |sym, pos| if sym == "-" "-" elsif sym == "X" || sym == "x" "X" else if ei == 0 # Amino Acid Environment Feature ((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym else ec.labels[ec.symbols.index(sym)] end end } if env_labels[key].empty? env_labels[key] = labels else env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] } end end end end end ali.each_pair do |key1, seq1| ali.each_pair do |key2, seq2| if key1 != key2 s1 = seq1.split("") s2 = seq2.split("") s1.each_with_index do |source, pos| if env_labels[key1][pos].include?("X") $logger.info ">>> Substitutions from #{key1}-#{pos}-#{source} were masked" next end source.upcase! target = s2[pos].upcase if !$amino_acids.include?(source) $logger.info "!!! #{key1}-#{pos}-#{source} is not standard amino acid" next end if !$amino_acids.include?(target) $logger.info "!!! #{key1}-#{pos}-#{target} is not standard amino acid" next end source = (((disulphide[key1][pos] == "F") && (source == "C")) ? "J" : source) target = (((disulphide[key2][pos] == "F") && (target == "C")) ? "J" : target) $envs[env_labels[key1][pos]].add_residue_count(target) if $aa_tot_obs.has_key?(source) $aa_tot_obs[source] += 1 else $aa_tot_obs[source] = 1 end if source != target if $aa_mut_obs.has_key?(source) $aa_mut_obs[source] += 1 else $aa_mut_obs[source] = 1 end end $logger.debug ">>> Add #{key1}-#{pos}-#{source} -> #{key2}-#{pos}-#{target} substituion for #{env_labels[key1][pos]}" end end end end end # IO.foreach($tem_list) # # # # Frequency matrix # # # $envs.values.sort_by { |v| v.number }.each do |env| # puts ">#{env.label} #{env.number}" # puts env.freq_array.pretty_string(:col_header => $amino_acids, # :row_header => "Prb") # end $tot_freq_matrix = NMatrix.int(21,21) # for each combination of environment features env_groups = $envs.values.group_by { |env| env.label[1..-1] } env_groups.each_pair do |label, group| $grp_freq_matrix = NMatrix.int(21,21) $amino_acids.each_with_index do |aa, ai| freq_array = group.find { |e| e.label.start_with?(aa) }.freq_array 0.upto($grp_freq_matrix.shape[1] - 1) do |j| $grp_freq_matrix[ai, j] = freq_array[j] end end $tot_freq_matrix += $grp_freq_matrix # puts ">#{label}" # puts $grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) end # # for total # puts ">Total" # puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) # # Amino Acid Frequencies and Mutabilities # ala_factor = 100 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f $tot_aa = $aa_tot_obs.values.sum # puts ">Total amino acid frequencies" # puts "%-3s %8s %8s %8s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUT REL_MUT REL_FRQ] $aa_tot_obs.each_pair do |res, freq| $aa_mutb[res] = $aa_mut_obs[res] / freq.to_f $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor $aa_rel_freq[res] = freq / $tot_aa.to_f # puts "%-3s %8d %8d %8.2f %8d %8.4f" % [res, $aa_mut_obs[res], freq, $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]] end # # Probability matrices # tot_prob_matrix = NMatrix.float(21, 21) 0.upto($tot_freq_matrix.shape[0] - 1) do |i| col_sum = 0 0.upto($tot_freq_matrix.shape[1] - 1) do |j| col_sum += $tot_freq_matrix[i, j] end 0.upto($tot_freq_matrix.shape[1] - 1) do |k| # normalized substitutions probabilities with mutability #tot_prob_matrix[i, k] = $aa_rel_mutb[$amino_acids[k]] * $tot_freq_matrix[i,k] / col_sum.to_f # raw substitution probabilities with just frequencies tot_prob_matrix[i, k] = 100 * $tot_freq_matrix[i,k] / col_sum.to_f end end # puts ">Total probability" # puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) # # a new way of getting probability matrix # new_tot_prob_matrix = NMatrix.float(21, 21) # # 0.upto($tot_freq_matrix.shape[0] - 1) do |i| # col_sum = 0 # 0.upto($tot_freq_matrix.shape[1] - 1) do |j| # col_sum += ((1 - $aa_mutb[$amino_acids[j]]) / $aa_rel_freq[$amino_acids[j]]) * $tot_freq_matrix[i, j] # end # 0.upto($tot_freq_matrix.shape[1] - 1) do |k| # new_tot_prob_matrix[i, k] = 100 * ((1 - $aa_mutb[$amino_acids[k]]) / $aa_rel_freq[$amino_acids[k]]) * $tot_freq_matrix[i, k] / col_sum # end # end # # p1 probability # p1 = NArray.float(21) a0 = NArray.float(21).fill(1 / 21.0) big_N = $tot_aa small_n = 21 omega1 = 1.0 / (1 + big_N.to_f / ($sigma * small_n.to_f)) omega2 = 1.0 - omega1 0.upto(p1.shape[0] - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) } $smooth_prob[1] = p1 # puts "P1 probability matrix" # puts p1.pretty_string(:col_header => $amino_acids, :row_header => "Prb") # puts p1.sum # # p2 and above # env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } } 1.upto($env_features.size) do |ci| env_labels.combination(ci) do |c1| Enumerable.cart_prod(*c1).each do |labels| pattern = "." * $env_features.size labels.each do |label| j = label[0].chr.to_i l = label[1].chr pattern[j] = l end # get environmetns, frequencies, and probabilities envs = $envs.values.select { |env| env.label.match(pattern.to_re) } freq_arr = envs.inject(NArray.int(21)) { |sum, env| sum + env.freq_array } # if freq_arr.sum == 0 # $logger.warn "!!! Environment combination, #{labels.to_set} has no frequency" # # # store smoothed probabilties in a hash using a set of envrionment labels as a key # smooth_prob_arr = NArray.float(21).fill(0.0) # if !$smooth_prob.has_key?(ci + 1) # $smooth_prob[ci + 1] = {} # $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr # else # $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr # end # # # print smoothed probabilties # puts "P#{ci + 1} probability for the combination of environments, #{labels.join}" # puts smooth_prob_arr.pretty_string(:col_header => $amino_acids, :row_header => "Prb") # puts smooth_prob_arr.sum # next # end prob_arr = NArray.float(21) 0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f } # collect priors priors = [] if ci > 1 labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] } else priors << $smooth_prob[1] end # averaging priors... have a look at Entropy based normalization! #pri_avg = priors.sum / priors.size # entropy based weighting priors entropy_max = Math::log(21) entropies = priors.map do |prior| (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max end pri_avg = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum smooth_prob_arr = NArray.float(21) big_N = freq_arr.sum.to_f small_n = 21.0 omega1 = 1.0 / (1 + big_N / ($sigma * small_n)) omega2 = 1.0 - omega1 # smoothing step 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * pri_avg[i] + omega2 * prob_arr[i]) } # normalization step smooth_prob_arr_sum = smooth_prob_arr.sum 0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) } # store smoothed probabilties in a hash using a set of envrionment labels as a key if !$smooth_prob.has_key?(ci + 1) $smooth_prob[ci + 1] = {} $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr else $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr end # # print smoothed probabilties # puts "P#{ci + 1} probability for the combination of environments, #{labels.join}" # puts smooth_prob_arr.pretty_string(:col_header => $amino_acids, :row_header => "Prb") # puts smooth_prob_arr.sum end end end # summarizing ... $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] } # $envs.values.each do |e| # puts ">" + e.label # puts e.smooth_prob_array.pretty_string(:col_header => $amino_acids, :row_header => "Prb") # end tot_smooth_prob_matrix = NMatrix.float(21,21) # for each combination of environment features env_groups = $envs.values.group_by { |env| env.label[1..-1] } env_groups.sort.each do |label, group| grp_prob_matrix = NMatrix.float(21,21) $amino_acids.each_with_index do |aa, ai| smooth_prob_array = group.find { |e| e.label.start_with?(aa) }.smooth_prob_array 0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] } end tot_smooth_prob_matrix += grp_prob_matrix puts ">#{label}" puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) end tot_smooth_prob_matrix /= env_groups.size # for total puts ">Total Probability" puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) end end end end