require 'rubygems' require 'getoptlong' require 'logger' require 'narray' require 'bio' require 'set' require 'facets' require 'math_extensions' require 'string_extensions' require 'narray_extensions' require 'nmatrix_extensions' require 'egor/environment' require 'egor/environment_class_hash' require 'egor/environment_feature' require 'egor/environment_feature_array' require 'egor/heatmap_array' # This is a module for an actual command line interpreter for Egor # --- # Copyright (C) 2008-9 Semin Lee module Egor class CLI class << self # :nodoc: def print_version puts VERSION end # Print Egor's Usage on the screen # # :call-seq: # Egor::CLI::print_usage # def print_usage puts <<-USAGE egor: Esst GeneratOR, a program to calculate environment-specific amino acid substitution tables. Usage: egor [ options ] -l TEMLIST-file -c CLASSDEF-file or egor [ options ] -f TEM-file -c CLASSDEF-file Options: --tem-file (-f) FILE: a tem file --tem-list (-l) FILE: a list for tem files --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat') --outfile (-o) FILE: output filename (default 'allmat.dat') --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60) --noweight: calculate substitution counts with no weights --smooth (-s) INTEGER: 0 for partial smoothing (default) 1 for full smoothing --p1smooth: perform smoothing for p1 probability calculation when partial smoothing --nosmooth: perform no smoothing operation --cys (-y) INTEGER: 0 for using C and J only for structure (default) 1 for both structure and sequence 2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates) --output INTEGER: 0 for raw counts (no smoothing performed) 1 for probabilities 2 for log-odds (default) --noroundoff: do not round off log odds ratio --scale INTEGER: log-odds matrices in 1/n bit units (default 3) --sigma DOUBLE: change the sigma value for smoothing (default 5.0) --autosigma: automatically adjust the sigma value for smoothing --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes) --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none) --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none) --heatmap INTEGER: 0 create a heat map file for each substitution table 1 create one big file containing all substitution tables 2 do both 0 and 1 --heatmap-format INTEGER: 0 for Portable Network Graphics (PNG) Format (default) 1 for Graphics Interchange Format (GIF) 2 for Joint Photographic Experts Group (JPEG) Format 3 for Microsoft Windows bitmap (BMP) Format 4 for Portable Document Format (PDF) --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables)) --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmap') --heatmap-value: print values in the cells when generating heat maps --verbose (-v) INTEGER 0 for ERROR level 1 for WARN or above level (default) 2 for INFO or above level 3 for DEBUG or above level --version: print version --help (-h): show help USAGE end # Calculate PID between two sequences # # :call-seq: # Egor::CLI::calculate_pid(seq1, seq2) -> Float # def calculate_pid(seq1, seq2) aas1 = seq1.split('') aas2 = seq2.split('') cols = aas1.zip(aas2) align = 0 # no. of aligned columns ident = 0 # no. of identical columns intgp = 0 # no. of internal gaps cols.each do |col| if (col[0] != '-') && (col[1] != '-') align += 1 if col[0] == col[1] ident += 1 end elsif (((col[0] == '-') && (col[1] != '-')) || ((col[0] != '-') && (col[1] == '-'))) intgp += 1 end end pid = 100.0 * ident.to_f / (align + intgp) end # :nodoc: def execute(arguments=[]) # # * Abbreviations in the codes # # env: environment # tem: (FUGUE) template # classdef: (envlironment) class definition # aa: amino acid # aa: weighted amino acid # tot: total # rel: relative # jnt: joint # cnt: count # mut: mutation # mutb: mutability # freq: frequency # prob: probability # logo: log odds ratio # opts: options # fh: file handle # ff: flat file # ali: alignment # mat: matrix # arr: array # Part 1. # # Global variables and their default values # $logger = Logger.new(STDOUT) $logger.level = Logger::WARN # default set of 21 amino acids including J (Cysteine, the free thiol form) $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('') $tem_list = nil $tem_file = nil $classdef = 'classdef.dat' $outfile = 'allmat.dat' $outfh = nil # file hanfle for outfile $output = 2 # default: log odds matrix $ali_size = 0 $tot_aa = 0 $sigma = 5.0 $autosigma = false $weight = 60 $noweight = false $smooth = :partial $nosmooth = false $noroundoff = false $p1smooth = false $scale = 3 $pidmin = nil $pidmax = nil $scale = 3 $add = nil $cys = 0 $targetenv = false $penv = false $heatmap = nil $heatmapcol = nil $heatmapformat = 'png' $heatmapstem = 'heatmaps' $heatmapvalue = false $rvg_width = 550 $rvg_height = 650 $canvas_width = 550 $canvas_height = 650 $cell_width = 20 $cell_height = 20 $aa_tot_cnt = Hash.new(0) $aa_mut_cnt = Hash.new(0) $aa_mutb = {} $aa_rel_mutb = {} $aa_tot_freq = {} $aa_env_cnt = Hash.new(0) $smooth_prob = {} $tot_cnt_mat = nil $tot_prob_mat = nil $tot_logo_mat = nil $tot_smooth_prob = {} # minimum ratio of amino acid count to sigma value $min_cnt_sigma_ratio = 500.0 # # Part 1 END # # Part 2. # # Parsing options # opts = GetoptLong.new( [ '--help', '-h', GetoptLong::NO_ARGUMENT ], [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ], [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ], [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ], [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ], [ '--nosmooth', GetoptLong::NO_ARGUMENT ], [ '--p1smooth', GetoptLong::NO_ARGUMENT ], [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ], [ '--noweight', GetoptLong::NO_ARGUMENT ], [ '--noroundoff', GetoptLong::NO_ARGUMENT ], [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ], [ '--autosigma', GetoptLong::NO_ARGUMENT ], [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ], [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ], [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ], [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ], [ '--heatmap-value', GetoptLong::NO_ARGUMENT ], [ '--output', GetoptLong::REQUIRED_ARGUMENT ], [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ], [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ], [ '--penv', GetoptLong::NO_ARGUMENT ], [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ], [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ], [ '--version', GetoptLong::NO_ARGUMENT ] ) begin opts.each do |opt, arg| case opt when '--help' print_usage exit 0 when '--tem-list' $tem_list = arg when '--tem-file' $tem_file = arg when '--classdef' $classdef = arg when '--output' $output = arg.to_i when '--outfile' $outfile = arg when '--cys' $cys = arg.to_i when '--targetenv' $targetenv = (arg.to_i == 1) ? true : false when '--weight' $weight = arg.to_i when '--sigma' $sigma = arg.to_f when '--autosigma' $autosigma = true when '--pidmin' $pidmin = arg.to_f when '--pidmax' $pidmax = arg.to_f when '--noweight' $noweight = true when '--noroundoff' $noroundoff = true when '--smooth' $smooth = (arg.to_i == 1) ? :full : :partial when '--nosmooth' $nosmooth = true when '--p1smooth' $p1smooth = true when '--scale' $scale = arg.to_f when '--add' $add = arg.to_f when '--penv' warn "--penv option is not supported." exit 1 $penv = true when '--heatmap' $heatmap = case arg.to_i when (0..2) then arg.to_i else warn "--heatmap #{arg.to_i} is not allowed." exit1 end when '--heatmap-columns' $heatmapcol = arg.to_i when '--heatmap-stem' $heatmapstem = arg.to_s when '--heatmap-format' $heatmapformat = case arg.to_i when 0 then 'png' when 1 then 'gif' when 2 then 'jpg' when 3 then 'bmp' when 4 then 'pdf' else warn "--heatmap-format #{arg.to_i} is not supported." exit 1 end when '--heatmap-value' $heatmapvalue = true when '--verbose' $logger.level = case arg.to_i when 0 then Logger::ERROR when 1 then Logger::WARN when 2 then Logger::INFO when 3 then Logger::DEBUG else warn "--verbose (-v) #{arg.to_i} is not supported." exit 1 end when '--version' print_version exit 0 end end rescue # invalid option exit 1 end # when arguments are nonsense, print usage if ((ARGV.length != 0) || (!$tem_list && !$tem_file) || ($tem_list && $tem_file)) print_usage exit 1 end # warn if any input file is missing if $tem_list && !File.exist?($tem_list) warn "Cannot find template list file, #{$tem_list}" exit 1 end if $tem_file && !File.exist?($tem_file) warn "Cannot find template file, #{$tem_file}" exit 1 end if $classdef && !File.exist?($classdef) warn "Cannot find environment class definition file, #{$classdef}" exit 1 end # # Part 2 END # # Part 3. # # Reading Environment Class Definition File # # check --cys option and modify amino_acids set if necessary if $cys == 2 $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('') end # create an EnvironmentFeatureList object for storing all environment # features $env_features = EnvironmentFeatureArray.new # an array for storing indexes of constrained environment features $cst_features = [] # add substituted amino acid (aa1) in a substitution to the environment # feature list $env_features << EnvironmentFeature.new('sequence', $amino_acids, $amino_acids, 'F', 'F') # read environment class definiton file and store them into # the hash prepared above env_index = 1 IO.foreach($classdef) do |line| line.chomp! if line.start_with?('#') next elsif (env_ftr = line.chomp.split(/;/)).length == 5 $logger.info "An environment feature, #{line} detected." if env_ftr[-1] == 'T' # skip silenced environment feature $logger.warn "The environment feature, #{line} silent." next end if env_ftr[-2] == 'T' $cst_features << env_index $logger.warn "The environment feature, #{line} constrained." end $env_features << EnvironmentFeature.new(env_ftr[0], env_ftr[1].split(''), env_ftr[2].split(''), env_ftr[3], env_ftr[4]) env_index += 1 else $logger.error "\"#{line}\" doesn't seem to be a proper format for" + "a environment class definition." exit 1 end end # a hash for storing all environment classes $env_classes = EnvironmentClassHash.new # generate all possible combinations of environment labels, and store # every environment class into the hash prepared above with the label # as a key $env_features.label_combinations.each_with_index { |e, i| $env_classes[e.flatten.join] = Environment.new(i, e.flatten.join, $amino_acids) } # # Part 3 END # # Part 4. # # Reading TEM file or TEMLIST list file and couting substitutions # # a global file handle for output $outfh = File.open($outfile, 'w') if $tem_file $tem_list_io = StringIO.new($tem_file) end if $tem_list $tem_list_io = File.open($tem_list) end $tem_list_io.each_line do |tem_file| tem_file.chomp! ali = Bio::Alignment::OriginalAlignment.new ff = Bio::FlatFile.auto(tem_file) ff.each_entry do |pir| if (pir.definition == 'sequence') || (pir.definition == 'structure') ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id) end end if ali.size < 2 $logger.warn "Skipped #{tem_file} which has only one unique entry." next end $ali_size += 1 env_labels = {} disulphide = {} ali.each_pair do |key, seq| # check disulphide bond environment first! ff.rewind ff.each_entry do |pir| if ((pir.entry_id == key) && ((pir.definition == "disulphide") || (pir.definition == "disulfide"))) disulphide[key] = pir.data.remove_internal_spaces.split('') end end $env_features.each_with_index do |ec, ei| env_labels[key] = [] unless env_labels.has_key?(key) ff.rewind ff.each_entry do |pir| if (pir.entry_id == key) && (pir.definition == ec.name) labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos| if sym == '-' '-' elsif sym == 'X' || sym == 'x' 'X' else if ei == 0 # Amino Acid Environment Feature (disulphide.has_key?(key) && (disulphide[key][pos] == 'F') && (sym == 'C')) ? 'J' : sym else ec.labels[ec.symbols.index(sym)] end end end if env_labels[key].empty? env_labels[key] = labels else env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] } end end end end end if $noweight ali.each_pair do |id1, seq1| ali.each_pair do |id2, seq2| if id1 != id2 pid = calculate_pid(seq1, seq2) s1 = seq1.split('') s2 = seq2.split('') # check PID_MIN if $pidmin && (pid < $pidmin) $logger.info "Skip alignment between #{id1} and #{id2} " + "having PID, #{pid}% less than PID_MIN, #{$pidmin}." next end # check PID_MAX if $pidmax && (pid > $pidmax) $logger.info "Skip alignment between #{id1} and #{id2} " + "having PID, #{pid}% greater than PID_MAX, #{$pidmax}." next end s1.each_with_index do |aa1, pos| aa1.upcase! aa2 = s2[pos].upcase if env_labels[id1][pos].include?('X') $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked." next end if env_labels[id2][pos].include?('X') $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked." next end unless $amino_acids.include?(aa1) $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-" next end unless $amino_acids.include?(aa2) $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-" next end aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1 aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2 if $cst_features.empty? $env_classes[env_labels[id1][pos]].increase_residue_count(aa2) elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features)) $env_classes[env_labels[id1][pos]].increase_residue_count(aa2) else $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other." next end grp_label = env_labels[id1][pos][1..-1] if $aa_env_cnt.has_key? grp_label if $aa_env_cnt[grp_label].has_key? aa1 $aa_env_cnt[grp_label][aa1] += 1 else $aa_env_cnt[grp_label][aa1] = 1 end else $aa_env_cnt[grp_label] = Hash.new(0) $aa_env_cnt[grp_label][aa1] = 1 end if $aa_tot_cnt.has_key? aa1 $aa_tot_cnt[aa1] += 1 else $aa_tot_cnt[aa1] = 1 end if aa1 != aa2 if $aa_mut_cnt.has_key? aa1 $aa_mut_cnt[aa1] += 1 else $aa_mut_cnt[aa1] = 1 end end $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}." end end end end else # BLOSUM-like weighting clusters = [] ali.each_pair { |i, s| clusters << [i] } # a loop for single linkage clustering begin continue = false 0.upto(clusters.size - 2) do |i| indexes = [] (i + 1).upto(clusters.size - 1) do |j| found = false clusters[i].each do |c1| clusters[j].each do |c2| if calculate_pid(ali[c1], ali[c2]) >= $weight indexes << j found = true break end end break if found end end unless indexes.empty? continue = true group = clusters[i] indexes.each do |k| group = group.concat(clusters[k]) clusters[k] = nil end clusters[i] = group clusters.compact! end end end while(continue) if clusters.size < 2 $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level." next end clusters.combination(2).each do |cluster1, cluster2| cluster1.each do |id1| cluster2.each do |id2| seq1 = ali[id1].split('') seq2 = ali[id2].split('') seq1.each_with_index do |aa1, pos| aa1.upcase! aa2 = seq2[pos].upcase rescue next # should fix this in a sane way! if env_labels[id1][pos].include?('X') $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked." next end if env_labels[id2][pos].include?('X') $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked." next end unless $amino_acids.include?(aa1) $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-" next end unless $amino_acids.include?(aa2) $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-" next end aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C')) ? 'J' : aa1 aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C')) ? 'J' : aa2 cnt1 = 1.0 / cluster1.size cnt2 = 1.0 / cluster2.size jnt_cnt = cnt1 * cnt2 if $cst_features.empty? $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt) $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt) elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features)) $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt) $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt) else $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other." next end grp_label1 = env_labels[id1][pos][1..-1] grp_label2 = env_labels[id2][pos][1..-1] if $aa_env_cnt.has_key? grp_label1 if $aa_env_cnt[grp_label1].has_key? aa1 $aa_env_cnt[grp_label1][aa1] += cnt1 else $aa_env_cnt[grp_label1][aa1] = cnt1 end else $aa_env_cnt[grp_label1] = Hash.new(0.0) $aa_env_cnt[grp_label1][aa1] = cnt1 end if $aa_env_cnt.has_key? grp_label2 if $aa_env_cnt[grp_label2].has_key? aa2 $aa_env_cnt[grp_label2][aa2] += cnt2 else $aa_env_cnt[grp_label2][aa2] = cnt2 end else $aa_env_cnt[grp_label2] = Hash.new(0.0) $aa_env_cnt[grp_label2][aa2] = cnt2 end if $aa_tot_cnt.has_key? aa1 $aa_tot_cnt[aa1] += cnt1 else $aa_tot_cnt[aa1] = cnt1 end if $aa_tot_cnt.has_key? aa2 $aa_tot_cnt[aa2] += cnt2 else $aa_tot_cnt[aa2] = cnt2 end if aa1 != aa2 if $aa_mut_cnt.has_key? aa1 $aa_mut_cnt[aa1] += cnt1 else $aa_mut_cnt[aa1] = cnt1 end if $aa_mut_cnt.has_key? aa2 $aa_mut_cnt[aa2] += cnt2 else $aa_mut_cnt[aa2] = cnt2 end end $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}." $logger.debug "#{id2}-#{pos}-#{aa2} -> #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}." end end end end end $logger.info "Analysing #{tem_file} done." end # print out default header $outfh.puts <
0) && (min_cnt > $aa_tot_cnt[res]) min_cnt = $aa_tot_cnt[res] min_sigma = min_cnt / $min_cnt_sigma_ratio end $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}." end $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f) $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f) end $amino_acids.each do |res| if $noweight $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' % [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]] else $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' % [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]] end end if min_cnt > -1 $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}." if $autosigma $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}." $sigma = min_sigma end end $outfh.puts '#' $outfh.puts '# RES: Amino acid one letter code' $outfh.puts '# TOT_OBS: Total count of incidence' $outfh.puts '# MUT_OBS: Total count of mutation' $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)' $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)' $outfh.puts '# REL_FREQ: Relative frequency' $outfh.puts '#' # # Part 4. END # # Part 5. # # Generating substitution frequency matrices # # calculating probabilities for each environment $env_classes.values.each do |e| if e.freq_array.sum != 0 e.prob_array = 100.0 * e.freq_array / e.freq_array.sum end end # count raw frequencies $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size) group_matrices = [] # for each combination of environment features $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no| grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size) $amino_acids.each_with_index do |aa, aj| freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] } end $tot_cnt_mat += grp_cnt_mat group_matrices << [group[0], grp_cnt_mat] end $logger.info "Counting substitutions done." if $output == 0 heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2 grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max $heatmapcol ||= Math::sqrt(group_matrices.size).round group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no| # for a matrix file stem = "#{grp_no}. #{grp_label}" $outfh.puts ">#{grp_label} #{grp_no}" $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) # for a heat map if $heatmap == 0 or $heatmap == 2 grp_cnt_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height, :canvas_width => $canvas_width, :canvas_height => $canvas_height, :max_val => grp_max_val.ceil, :min_val => 0, :print_value => $heatmapvalue, :title => stem).write("#{stem}.#{$heatmapformat}") $logger.info "Generating a heat map for #{stem} table done." end if $heatmap == 1 or $heatmap == 2 heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height - 50, :canvas_width => $canvas_width, :canvas_height => $canvas_height - 50, :max_val => grp_max_val.ceil, :min_val => 0, :print_value => $heatmapvalue, :print_gradient => false, :title => stem, :title_font_size => $rvg_width * $heatmapcol / 100.0) end end if $heatmap == 1 or $heatmap == 2 file = "#{$heatmapstem}.#{$heatmapformat}" heatmaps.heatmap(:columns => $heatmapcol, :rvg_width => $rvg_width, :max_val => grp_max_val.ceil, :min_val => 0).write(file) $logger.info "Generating heat maps in a file, #{file} done." end # total $outfh.puts '>Total' $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) if $heatmap == 0 or $heatmap == 2 stem = "#{group_matrices.size}. TOTAL" heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height, :canvas_width => $canvas_width, :canvas_height => $canvas_height, :max_val => $tot_cnt_mat.max.ceil, :min_val => 0, :print_value => $heatmapvalue, :title => stem).write("#{stem}.#{$heatmapformat}") $logger.info "Generating a heat map for #{stem} table done." end exit 0 end # # Part 5. END # # Part 6. # # Calculating substitution probability tables # if $output == 1 $outfh.puts <
) leading to # any other residue type (i) and sums up to 100. # HEADER end # when nosmoothing !!! if ($output > 0) && $nosmooth # reinitialize $tot_cnt_mat for pseudocounts $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size) # for each combination of environment features pseudo_cnt = $add || (1.0 / $env_classes.group_size) # add pseudo counts for each frequency vector $env_classes.values.each { |e| e.freq_array += pseudo_cnt } # re-calculate probability vector for each environment class $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum } group_matrices = [] $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no| grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size) grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size) $amino_acids.each_with_index do |aa, aj| env_class = group[1].find { |e| e.label.start_with?(aa) } 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] } 0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] } end $tot_cnt_mat += grp_cnt_mat group_matrices << [group[0], grp_prob_mat] end if $output == 1 heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2 grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100 $heatmapcol ||= Math::sqrt(group_matrices.size).round group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no| # for a matrix file stem = "#{grp_no}. #{grp_label}" $outfh.puts ">#{grp_label} #{grp_no}" $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) # for a heat map if $heatmap == 0 or $heatmap == 2 grp_prob_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height, :canvas_width => $canvas_width, :canvas_height => $canvas_height, :max_val => grp_max_val.ceil, :min_val => 0, :print_value => $heatmapvalue, :title => stem).write("#{stem}.#{$heatmapformat}") $logger.info "Generating a heat map for #{stem} table done." end if $heatmap == 1 or $heatmap == 2 heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height - 50, :canvas_width => $canvas_width, :canvas_height => $canvas_height - 50, :max_val => grp_max_val.ceil, :min_val => 0, :print_value => $heatmapvalue, :print_gradient => false, :title => stem, :title_font_size => $rvg_width * $heatmapcol / 100.0) end end # for heat maps in a single file if $heatmap == 1 or $heatmap == 2 file = "#{$heatmapstem}.#{$heatmapformat}" heatmaps.heatmap(:columns => $heatmapcol, :rvg_width => $rvg_width, :max_val => grp_max_val.ceil, :min_val => 0).write(file) $logger.info "Generating heat maps in a file, #{file} done." end end $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size) 0.upto($amino_acids.size - 1) do |aj| col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] } 0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum } end if $output == 1 $outfh.puts '>Total' $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) $outfh.close # for a heat map if $heatmap == 0 or $heatmap == 2 stem = "#{group_matrices.size}. TOTAL" $tot_prob_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height, :canvas_width => $canvas_width, :canvas_height => $canvas_height, :max_val => $tot_prob_mat.max.ceil, :min_val => 0, :print_value => $heatmapvalue, :title => stem).write("#{stem}.#{$heatmapformat}") $logger.info "Generating a heat map for #{stem} table done." end exit 0 end $logger.info 'Calculating substitution probabilities (no smoothing) done.' end # when smoothing!!! if ($output > 0) && !$nosmooth # # p1 probabilities # p1 = NArray.float($amino_acids.size) a0 = NArray.float($amino_acids.size).fill(1.0 / $amino_acids.size) big_N = $tot_aa.to_f small_n = $amino_acids.size.to_f omega1 = 1.0 / (1 + big_N / ($sigma * small_n)) omega2 = 1.0 - omega1 if ($smooth == :full) || $p1smooth # smoothing p1 probabilities for the partial smoothing procedure if --p1smooth on or, if it is full smoothing 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_tot_freq[$amino_acids[i]]) } $smooth_prob[1] = p1 elsif ($smooth == :partial) # no smoothing for p1 probabilities just as Kenji's subst # in this case, p1 probabilities were taken from the amino acid frequencies of your data set 0.upto($amino_acids.size - 1) { |i| p1[i] = 100.0 * $aa_tot_freq[$amino_acids[i]] } $smooth_prob[1] = p1 end # # p2 and above # env_labels = $env_features.map_with_index { |ef, ei| ef.labels.map { |l| "#{ei}#{l}" } } if $smooth == :partial $outfh.puts <
2) && (ci < $env_features.size) $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing." next end env_labels.combination(ci) do |c1| c1[0].product(*c1[1..-1]).each do |labels| pattern = '.' * $env_features.size labels.each do |label| i = label[0].chr.to_i l = label[1].chr pattern[i] = l end if pattern =~ /^\./ $logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing." next end # get environments matching the pattern created above # and calculate amino acid frequencies and their probabilities for all the environments envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) } freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array } prob_arr = NArray.float($amino_acids.size) 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = ((freq_arr[i] == 0) ? 0 : (freq_arr[i] / freq_arr.sum.to_f)) } # # assess whether a residue type j is compatible with a particular combination of structural features # # corrections for non-zero colum vector phenomenon by switching the smoothing procedure off as below # if ci == $env_features.size # aa_label = labels.find { |l| l.match(/^0/) }[1].chr # sub_pattern = '.' * $env_features.size # sub_pattern[0] = aa_label # sub_freq_sum = 0 # # labels[1..-1].each do |label| # next if label.start_with?('0') # i = label[0].chr.to_i # l = label[1].chr # sub_pattern[i] = l # sub_envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) } # sub_freq_arr = sub_envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array } # sub_freq_sum += sub_freq_arr.sum # end # # if sub_freq_sum == 0 # if $smooth_prob.has_key?(ci + 1) # $smooth_prob[ci + 1][labels.to_set] = prob_arr # else # $smooth_prob[ci + 1] = {} # $smooth_prob[ci + 1][labels.to_set] = prob_arr # end # $logger.warn "Smoothing procedure is off for the environment feature combination, #{pattern}" # next # end # end # collect priors priors = [] if ci == 1 priors << $smooth_prob[1] elsif ci == 2 labels.combination(1).select { |c2| c2[0].start_with?('0') }.each { |c3| priors << $smooth_prob[2][c3.to_set] } elsif ci == $env_features.size labels.combination(2).select { |c2| c2[0].start_with?('0') || c2[1].start_with?('0') }.each { |c3| priors << $smooth_prob[3][c3.to_set] } end # entropy based prior weighting step entropy_max = Math::log($amino_acids.size) entropies = priors.map { |prior| -1.0 * prior.to_a.inject(0.0) { |s, p| begin p == 0.0 ? s - 1 : s + p * Math::log(p) rescue #puts "P: #{p}" end } } mod_entropies = entropies.map_with_index { |entropy, i| (entropy_max - entropies[i]) / entropy_max } weights = mod_entropies.map { |mod_entropy| mod_entropy / mod_entropies.sum } weighted_priors = priors.map_with_index { |prior, i| prior * weights[i] }.sum # smoothing step smooth_prob_arr = NArray.float($amino_acids.size) big_N = freq_arr.sum.to_f small_n = $amino_acids.size.to_f omega1 = 1.0 / (1 + big_N / ($sigma * small_n)) omega2 = 1.0 - omega1 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) } # normalization step smooth_prob_arr_sum = smooth_prob_arr.sum 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) } # store smoothed probabilties in a hash using a set of envrionment labels as a key if $smooth_prob.has_key?(ci + 1) $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr else $smooth_prob[ci + 1] = {} $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr end end end end $logger.info 'Calculating substitution probabilities (partial smoothing) done.' else $outfh.puts <
1 labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] } else priors << $smooth_prob[1] end # entropy based weighting priors entropy_max = Math::log($amino_acids.size) entropies = priors.map do |prior| (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max end weighted_priors = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum # smoothing step smooth_prob_arr = NArray.float($amino_acids.size) big_N = freq_arr.sum.to_f small_n = $amino_acids.size.to_f omega1 = 1.0 / (1 + big_N / ($sigma * small_n)) omega2 = 1.0 - omega1 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * weighted_priors[i] + omega2 * prob_arr[i]) } # normalization step smooth_prob_arr_sum = smooth_prob_arr.sum 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) } # store smoothed probabilties in a hash using a set of envrionment labels as a key if $smooth_prob.has_key?(ci + 1) $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr else $smooth_prob[ci + 1] = {} $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr end end end end $logger.info 'Calculating substitution probabilities (full smoothing) done.' end # updating smoothed probability array for each envrionment $env_classes.values.each do |env| env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set] end # sorting environments and build 21X21 substitution matrices group_matrices = [] $env_classes.groups_sorted_by_residue_labels.each do |group| # calculating 21X21 substitution probability matrix for each envrionment grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size) $amino_acids.each_with_index do |aa, ai| smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] } end group_matrices << [group[0], grp_prob_mat] end if $output == 1 heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2 grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100 $heatmapcol ||= Math::sqrt(group_matrices.size).round group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no| # for a matrix file stem = "#{grp_no}. #{grp_label}" $outfh.puts ">#{grp_label} #{grp_no}" $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) # for heat map generation if $heatmap == 0 or $heatmap == 2 grp_prob_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height, :canvas_width => $canvas_width, :canvas_height => $canvas_height, :max_val => grp_max_val.ceil, :min_val => 0, :print_value => $heatmapvalue, :title => stem).write("#{stem}.#{$heatmapformat}") $logger.info "Generating a heat map for #{stem} table done." end if $heatmap == 1 or $heatmap == 2 heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height - 50, :canvas_width => $canvas_width, :canvas_height => $canvas_height - 50, :max_val => grp_max_val.ceil, :min_val => 0, :print_value => $heatmapvalue, :print_gradient => false, :title => stem, :title_font_size => $rvg_width * $heatmapcol / 100.0) end end # for heat maps in a single file if $heatmap == 1 or $heatmap == 2 file = "#{$heatmapstem}.#{$heatmapformat}" heatmaps.heatmap(:columns => $heatmapcol, :rvg_width => $rvg_width, :max_val => grp_max_val.ceil, :min_val => 0).write(file) $logger.info "Generating heat maps in a file, #{file} done." end end # for a total substitution probability matrix $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size) $amino_acids.each_with_index do |aa, aj| 0.upto($amino_acids.size - 1) do |ai| $tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai] end end if $output == 1 $outfh.puts '>Total' $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids) $outfh.close # for a heat map if $heatmap == 0 or $heatmap == 2 stem = "#{group_matrices.size}. TOTAL" $tot_prob_mat.heatmap(:col_header => $amino_acids, :row_header => $amino_acids, :rvg_width => $rvg_width, :rvg_height => $rvg_height, :canvas_width => $canvas_width, :canvas_height => $canvas_height, :max_val => $tot_prob_mat.max.ceil, :min_val => 0, :print_value => $heatmapvalue, :title => stem).write("#{stem}.#{$heatmapformat}") $logger.info "Generating a heat map for #{stem} table done." end exit 0 end end # # Part 6. END # # Part 7. # # Calculating log odds ratio scoring matrices # if $output == 2 $outfh.puts <
#{grp_label} #{grp_no}" $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => row_header) # for a heat map if $heatmap == 0 or $heatmap == 2 grp_logo_mat.heatmap(:col_header => $amino_acids, :row_header => row_header, :rvg_width => $rvg_width, :rvg_height => $rvg_height, :canvas_width => $canvas_width, :canvas_height => $canvas_height, :gradient_beg_color => '#0000FF', :gradient_mid_color => '#FFFFFF', :gradient_end_color => '#FF0000', :max_val => abs_max_val.ceil, :mid_val => 0, :min_val => -1 * abs_max_val.ceil, :print_value => $heatmapvalue, :title => stem).write("#{stem}.#{$heatmapformat}") $logger.info "Generating a heat map for #{stem} table done." end if $heatmap == 1 or $heatmap == 2 heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids, :row_header => row_header, :rvg_width => $rvg_width, :rvg_height => $rvg_height - 50, :canvas_width => $canvas_width, :canvas_height => $canvas_height - 50, :gradient_beg_color => '#0000FF', :gradient_mid_color => '#FFFFFF', :gradient_end_color => '#FF0000', :max_val => abs_max_val.ceil, :mid_val => 0, :min_val => -1 * abs_max_val.ceil, :print_value => $heatmapvalue, :print_gradient => false, :title => stem, :title_font_size => $rvg_width * $heatmapcol / 100.0) end end # for heat maps in a single file if $heatmap == 1 or $heatmap == 2 file = "#{$heatmapstem}.#{$heatmapformat}" heatmaps.heatmap(:columns => $heatmapcol, :rvg_width => $rvg_width, :gradient_beg_color => '#0000FF', :gradient_mid_color => '#FFFFFF', :gradient_end_color => '#FF0000', :max_val => abs_max_val.ceil, :mid_val => 0, :min_val => -1 * abs_max_val.ceil).write(file) $logger.info "Generating heat maps in a file, #{file} done." end # for a matrix file unless $noroundoff $tot_logo_mat = $tot_logo_mat.round end $outfh.puts ">Total #{grp_logo_mats.size}" $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, :row_header => row_header) # for a heat map if $heatmap == 0 or $heatmap == 2 stem = "#{group_matrices.size}. TOTAL" tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max $tot_logo_mat.heatmap(:col_header => $amino_acids, :row_header => row_header, :rvg_width => $rvg_width, :rvg_height => $rvg_height, :canvas_width => $canvas_width, :canvas_height => $canvas_height, :gradient_beg_color => '#0000FF', :gradient_mid_color => '#FFFFFF', :gradient_end_color => '#FF0000', :max_val => tot_abs_max_val.ceil, :mid_val => 0, :min_val => -1 * tot_abs_max_val.ceil, :print_value => $heatmapvalue, :title => stem).write("#{stem}.#{$heatmapformat}") $logger.info "Generating a heat map for #{stem} table done." end $logger.info "Calculating log odds ratios done." end # # Part 7. END # $outfh.close exit 0 end end end # class CLI end # module Egor