b0VIM 7.2IѴ@?seminbrunor/BiO/Develop/ulla/lib/ulla/cli.rbutf-8 3210#"! UtpW~cY>:5dnUUe h ]" H V LZiFU H^[S@T9ad[WE?zgF>( w E D ;  J i L t J o N  Ml`F(|HAtQ4 a;  # # Calculate # # Calculate PI # # Calculate PI # # Calculate PID # # Calculate P # # Calculate PID between # # Calculate PID betwe # # Calculate PID between two # # Calculate PID # # Calculate PID between two sequences end USAGE --help (-h): show help --version: print version 3 for DEBUG or above level 2 for INFO or above level 1 for WARN or above level (default) 0 for ERROR level --verbose (-v) INTEGER --heatmap-values: print values in the cells when generating heat maps --heatmap-stem STRING: stem for a file name when --heatmap 1 or 2 set (default: 'heatmap') --heatmap-columns INTEGER: number of tables to print in a row when --heatmap 1 or 2 set (default: sqrt(no. of tables)) 4 for Portable Document Format (PDF) 3 for Microsoft Windows bitmap (BMP) Format 2 for Joint Photographic Experts Group (JPEG) Format 1 for Graphics Interchange Format (GIF) 0 for Portable Network Graphics (PNG) Format (default) --heatmap-format INTEGER: 2 do both 0 and 1 1 create one big file containing all heat maps from substitution tables 0 create a heat map file for each substitution table --heatmap INTEGER: --pidmax DOUBLE: count substitutions only for pairs with PID smaller than this value (default none) --pidmin DOUBLE: count substitutions only for pairs with PID equal to or greater than this value (default none) --add DOUBLE: add this value to raw counts when deriving log-odds without smoothing (default 1/#classes) --autosigma: automatically adjust the sigma value for smoothing --sigma DOUBLE: change the sigma value for smoothing (default 5.0) --scale INTEGER: log-odds matrices in 1/n bit units (default 3) --noroundoff: do not round off log odds ratio 2 for log-odds (default) 1 for probabilities 0 for raw counts (no smoothing performed) --output INTEGER: 2 for using only C for both (must be set when you have no 'disulphide' or 'disulfide' annotation in templates) 1 for both structure and sequence 0 for using C and J only for structure (default) --cys (-y) INTEGER: --nosmooth: perform no smoothing operation --p1smooth: perform smoothing for p1 probability calculation when partial smoothing 1 for full smoothing 0 for partial smoothing (default) --smooth (-s) INTEGER: --noweight: calculate substitution counts with no weights --weight (-w) INTEGER: clustering level (PID) for the BLOSUM-like weighting (default: 60) --outfile (-o) FILE: output filename (default 'allmat.dat') --classdef (-c) FILE: a file for the defintion of environments (default: 'classdef.dat') --tem-list (-l) FILE: a list for tem files --tem-file (-f) FILE: a tem fileOptions: ulla [ options ] -f TEM-file -c CLASSDEF-file or ulla [ options ] -l TEMLIST-file -c CLASSDEF-fileUsage:ulla: a program to calculate environment-specific amino acid substitution tables. puts <<-USAGE def print_usage # # Ulla::CLI::print_usage # :call-seq: # # Print Ulla's Usage on the screen end puts VERSION def print_version # :nodoc: class << self class CLImodule Ulla# Copyright (C) 2008-9 Semin Lee# ---# This is a module for an actual command line interpreter for Ullarequire 'facets'require 'set'require 'bio'require 'narray'require 'logger'require 'getoptlong'require 'rubygems'ad@89}=v> o R "   B A & k # P  x 3 a _^~oe]\J87end # module Ulla end # class CLI end end exit 0 $outfh.close # # Part 7. END # end $logger.info "Calculating log odds ratios done." end $logger.info "Generating a heat map for #{stem} table done." :title => stem).write("#{stem}.#{$heatmapformat}") :print_value => $heatmapvalues, :min_val => -1 * tot_abs_max_val.ceil, :mid_val => 0, :max_val => tot_abs_max_val.ceil, :gradient_end_color => '#FF0000', :gradient_mid_color => '#FFFFFF', :gradient_beg_color => '#0000FF', :canvas_height => $canvas_height, :canvas_width => $canvas_width, :rvg_height => $rvg_height, :rvg_width => $rvg_width, :row_header => row_header, $tot_logo_mat.heatmap(:col_header => $amino_acids, tot_abs_max_val = [$tot_logo_mat.max.abs, $tot_logo_mat.min.abs].max stem = "#{group_matrices.size}. TOTAL" if $heatmap == 0 or $heatmap == 2 # for a heat map :row_header => row_header) $outfh.puts $tot_logo_mat.pretty_string(:col_header => $amino_acids, $outfh.puts ">Total #{grp_logo_mats.size}" end $tot_logo_mat = $tot_logo_mat.round unless $noroundoff # for a matrix file end $logger.info "Generating heat maps in a file, #{file} done." :min_val => -1 * abs_max_val.ceil).write(file) :mid_val => 0, :max_val => abs_max_val.ceil, :gradient_end_color => '#FF0000', :gradient_mid_color => '#FFFFFF', :gradient_beg_color => '#0000FF', :rvg_width => $rvg_width, heatmaps.heatmap(:columns => $heatmapcol,adi5{z5 xDC u 9 S R  t c % F 3 #  } = i env_ftr[3], env_ftr[2].split(''), env_ftr[1].split(''), $env_features << EnvironmentFeature.new(env_ftr[0], end $logger.warn "The environment feature, #{line} constrained." $cst_features << env_index if env_ftr[-2] == 'T' end next $logger.warn "The environment feature, #{line} silent." # skip silenced environment feature if env_ftr[-1] == 'T' $logger.info "An environment feature, #{line} detected." elsif (env_ftr = line.chomp.split(/;/)).length == 5 next if line.start_with?('#') line.chomp! IO.foreach($classdef) do |line| env_index = 1 # the hash prepared above # read environment class definiton file and store them into 'F') 'F', $amino_acids, $amino_acids, $env_features << EnvironmentFeature.new('sequence', # feature list # add substituted amino acid (aa1) in a substitution to the environment $cst_features = [] # an array for storing indexes of constrained environment features $env_features = EnvironmentFeatureArray.new # features # create an EnvironmentFeatureList object for storing all environment end $amino_acids = 'ACDEFGHIKLMNPQRSTVWY'.split('') if $cys == 2 # check --cys option and modify amino_acids set if necessary # # Reading Environment Class Definition File # # Part 3. # # Part 2 END #adFR>W0{C  z &  q b G 6 * ) n ] Q P $ h /    w v [ > !   qSR|B65T98WjH21 <]4O env_ftr[3], env_ftr[2].split(''), env_ftr[1].split(''), $env_features << EnvironmentFeature.new(env_ftr[0], end $logger.warn "The environment feature, #{line} constrained." $cst_features << env_index if env_ftr[-2] == 'T' end next $logger.warn "The environment feature, #{line} silent." # skip silenced environment feature if env_ftr[-1] == 'T' $logger.info "An environment feature, #{line} detected." elsif (env_ftr = line.chomp.split(/;/)).length == 5 next if line.start_with?('#') line.chomp! IO.foreach($classdef) do |line| env_index = 1 # the hash prepared above # read environment class definiton file and store them into 'F') 'F', $amino_acids, $amino_acids, $env_features << EnvironmentFeature.new('sequence', # feature list # add substituted amino acid (aa1) in a substitution to the environment $cst_features = [] # an array for storing indexes of constrained environment features $env_features = EnvironmentFeatureArray.new # features # create an EnvironmentFeatureList object for storing all environment end $amino require 'ulla/heatmap_array' require 'ulla/environment_feature_array' require 'ulla/environment_feature' require 'ulla/environment_class_hash' require 'ulla/environment' require 'nmatrix_extensions' require 'narray_extensions' require 'string_extensions' require 'math_extensions' end exit 1 warn "Cannot find environment class definition file, #{$classdef}" if $classdef && !File.exist?($classdef) end exit 1 warn "Cannot find template file, #{$tem_file}" if $tem_file && !File.exist?($tem_file) end exit 1 warn "Cannot find template list file, #{$tem_list}" if $tem_list && !File.exist?($tem_list) # warn if any input file is missing end exit 1 print_usage ($tem_list && $tem_file)) (!$tem_list && !$tem_file) || if ((ARGV.length != 0) || # when arguments are nonsense, print usage end exit 1 # invalid option rescue end end exit 0 print_version when '--version' end exit 1 warn "--verbose (-v) #{arg.to_i} is not supported." else when 3 then Logger::DEBUG when 2 then Logger::INFO when 1 then Logger::WARN when 0 then Logger::ERROR $logger.level = case arg.to_i when '--verbose' $heatmapvalues = true when '--heatmap-values' end exit 1 warn "--heatmap-format #{arg.to_i} is not supported." else when 4 then 'pdf'adnE vu' H p f    v j i T &   n m L y )  ZD!>,d~K1W<aJ rdcL# if id1 != id2 ali.each_pair do |id2, seq2| ali.each_pair do |id1, seq1| if $noweight end end end end end } env_labels[key][i] = e + labels[i] env_labels[key].each_with_index { |e, i| else env_labels[key] = labels if env_labels[key].empty? end end end ec.labels[ec.symbols.index(sym)] else (disulphide.has_key?(key) && (disulphide[key][pos] == 'F') && (sym == 'C') && ($cys != 2)) ? 'J' : sym if ei == 0 # Amino Acid Environment Feature else 'X' elsif sym == 'X' || sym == 'x' '-' if sym == '-' labels = pir.data.remove_internal_spaces.split('').map_with_index do |sym, pos| if (pir.entry_id == key) && (pir.definition == ec.name) ff.each_entry do |pir| ff.rewind env_labels[key] = [] unless env_labels.has_key?(key) $env_features.each_with_index do |ec, ei| end end disulphide[key] = pir.data.remove_internal_spaces.split('') (pir.definition == "disulfide"))) ((pir.definition == "disulphide") || if ((pir.entry_id == key) && ff.each_entry do |pir| ff.rewind # check disulphide bond environment first! ali.each_pair do |key, seq| disulphide = {} env_labels = {} $ali_size += 1 end next $logger.warn "Skipped #{tem_file} which has only one unique entry." if ali.size < 2 end end ali.add_seq(pir.data.remove_internal_spaces, pir.entry_id) if (pir.definition == 'sequence') || (pir.definition == 'structure') ff.each_entry do |pir| ff = Bio::FlatFile.auto(tem_file) ali = Bio::Alignment::OriginalAlignment.new tem_file.chomp! $tem_list_io.each_line do |tem_file| end $tem_list_io = File.open($tem_list) if $tem_list end $tem_list_io = StringIO.new($tem_file) if $tem_file $outfh = File.open($outfile, 'w') # a global file handle for output # # Reading TEM file or TEMLIST list file and couting substitutions # # Part 4. # # Part 3 END # } $amino_acids) e.flatten.join, $env_classes[e.flatten.join] = Environment.new(i, $env_features.label_combinations.each_with_index { |e, i| # as a key # every environment class into the hash prepared above with the label # generate all possible combinations of environment labels, and store $env_classes = EnvironmentClassHash.new # a hash for storing all environment classes end end exit 1 "a environment class definition." $logger.error "\"#{line}\" doesn't seem to be a proper format for" + else env_index += 1 env_ftr[4])adwU^.nXW5 [ B , + t  T 9 ! e M L  y x vuIcK0g.m7~feCiwv end end end end $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (1) was added to the environments class, #{env_labels[id1][pos]}." end end $aa_mut_cnt[aa1] = 1 else $aa_mut_cnt[aa1] += 1 if $aa_mut_cnt.has_key? aa1 if aa1 != aa2 end $aa_tot_cnt[aa1] = 1 else $aa_tot_cnt[aa1] += 1 if $aa_tot_cnt.has_key? aa1 end $aa_env_cnt[grp_label][aa1] = 1 $aa_env_cnt[grp_label] = Hash.new(0) else end $aa_env_cnt[grp_label][aa1] = 1 else $aa_env_cnt[grp_label][aa1] += 1 if $aa_env_cnt[grp_label].has_key? aa1 if $aa_env_cnt.has_key? grp_label grp_label = env_labels[id1][pos][1..-1] end next $logger.debug "Skipped #{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2}, they have different symbols for constrained environment features each other." else $env_classes[env_labels[id1][pos]].increase_residue_count(aa2) elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features)) $env_classes[env_labels[id1][pos]].increase_residue_count(aa2) if $cst_features.empty? aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C') && ($cys != 2)) ? 'J' : aa2 aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C') && ($cys != 2)) ? 'J' : aa1 end next $logger.warn "#{id1}-#{pos}-#{aa2} is not a standard amino acid." unless aa2 == "-" unless $amino_acids.include?(aa2) end next $logger.warn "#{id1}-#{pos}-#{aa1} is not a standard amino acid." unless aa1 == "-" unless $amino_acids.include?(aa1) end next $logger.info "Substitutions to #{id2}-#{pos}-#{aa2} were masked." if env_labels[id2][pos].include?('X') end next $logger.info "Substitutions from #{id1}-#{pos}-#{aa1} were masked." if env_labels[id1][pos].include?('X') aa2 = s2[pos].upcase aa1.upcase! s1.each_with_index do |aa1, pos| end next "having PID, #{pid}% greater than PID_MAX, #{$pidmax}." $logger.info "Skip alignment between #{id1} and #{id2} " + if $pidmax && (pid > $pidmax) # check PID_MAX end next "having PID, #{pid}% less than PID_MIN, #{$pidmin}." $logger.info "Skip alignment between #{id1} and #{id2} " + if $pidmin && (pid < $pidmin) # check PID_MIN s2 = seq2.split('') s1 = seq1.split('') pid = calculate_pid(seq1, seq2)adU~}J8vK r X @    Y  q Q P / ` ;  f  uZBAxwAoPmCBZk  else $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt) $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt) elsif (env_labels[id1][pos].split('').values_at(*$cst_features) == env_labels[id2][pos].split('').values_at(*$cst_features)) $env_classes[env_labels[id2][pos]].increase_residue_count(aa1, jnt_cnt) $env_classes[env_labels[id1][pos]].increase_residue_count(aa2, jnt_cnt) if $cst_features.empty? jnt_cnt = cnt1 * cnt2 cnt2 = 1.0 / cluster2.size cnt1 = 1.0 / cluster1.size aa2 = (disulphide.has_key?(id2) && (disulphide[id2][pos] == 'F') && (aa2 == 'C') && ($cys != 2)) ? 'J' : aa2 aa1 = (disulphide.has_key?(id1) && (disulphide[id1][pos] == 'F') && (aa1 == 'C') && ($cys != 2)) ? 'J' : aa1 end next $logger.warn "#{id2}-#{pos}-#{aa2} is not standard amino acid." unless aa2 == "-" unless $amino_acids.include?(aa2) end next $logger.warn "#{id1}-#{pos}-#{aa1} is not standard amino acid." unless aa1 == "-" unless $amino_acids.include?(aa1) end next $logger.debug "All substitutions to #{id2}-#{pos}-#{aa2} are masked." if env_labels[id2][pos].include?('X') end next $logger.debug "All substitutions from #{id1}-#{pos}-#{aa1} are masked." if env_labels[id1][pos].include?('X') aa2 = seq2[pos].upcase rescue next # should fix this in a sane way! aa1.upcase! seq1.each_with_index do |aa1, pos| seq2 = ali[id2].split('') seq1 = ali[id1].split('') cluster2.each do |id2| cluster1.each do |id1| clusters.combination(2).each do |cluster1, cluster2| end next $logger.debug "Skipped #{tem_file} which has only one cluster at the #{$weight} PID level." if clusters.size < 2 end while(continue) end end clusters.compact! clusters[i] = group end clusters[k] = nil group = group.concat(clusters[k]) indexes.each do |k| group = clusters[i] continue = true unless indexes.empty? end end break if found end end break found = true indexes << j if calculate_pid(ali[c1], ali[c2]) >= $weight clusters[j].each do |c2| clusters[i].each do |c1| found = false (i + 1).upto(clusters.size - 1) do |j| indexes = [] 0.upto(clusters.size - 2) do |i| continue = false begin # a loop for single linkage clustering ali.each_pair { |i, s| clusters << [i] } clusters = [] # BLOSUM-like weighting elseadhdI10~@ y ;  s 6  n 4   u ] \ , { I  P5bL8&N+SR5ki0}{thgR! end $outfh.puts "# Weighting scheme: clustering at PID #{$weight} level" else $outfh.puts '# Weighting scheme: none' if $noweight endHEADER## J: Cysteine (the free thiol form)# C: Cystine (the disulfide-bonded form) $outfh.puts < #{id1}-#{pos}-#{aa1} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id2][pos]}." $logger.debug "#{id1}-#{pos}-#{aa1} -> #{id2}-#{pos}-#{aa2} substitution count (#{"%.2f" % jnt_cnt}) was added to the environments class, #{env_labels[id1][pos]}." end end $aa_mut_cnt[aa2] = cnt2 else $aa_mut_cnt[aa2] += cnt2 if $aa_mut_cnt.has_key? aa2 end $aa_mut_cnt[aa1] = cnt1 else $aa_mut_cnt[aa1] += cnt1 if $aa_mut_cnt.has_key? aa1 if aa1 != aa2 end $aa_tot_cnt[aa2] = cnt2 else $aa_tot_cnt[aa2] += cnt2 if $aa_tot_cnt.has_key? aa2 end $aa_tot_cnt[aa1] = cnt1 else $aa_tot_cnt[aa1] += cnt1 if $aa_tot_cnt.has_key? aa1 end $aa_env_cnt[grp_label2][aa2] = cnt2 $aa_env_cnt[grp_label2] = Hash.new(0.0) else end $aa_env_cnt[grp_label2][aa2] = cnt2 else $aa_env_cnt[grp_label2][aa2] += cnt2 if $aa_env_cnt[grp_label2].has_key? aa2 if $aa_env_cnt.has_key? grp_label2 end $aa_env_cnt[grp_label1][aa1] = cnt1 $aa_env_cnt[grp_label1] = Hash.new(0.0) else end $aa_env_cnt[grp_label1][aa1] = cnt1 else $aa_env_cnt[grp_label1][aa1] += cnt1 if $aa_env_cnt[grp_label1].has_key? aa1 if $aa_env_cnt.has_key? grp_label1 grp_label2 = env_labels[id2][pos][1..-1] grp_label1 = env_labels[id1][pos][1..-1] end next $logger.debug "#{id1}-#{pos}-#{aa1} and #{id2}-#{pos}-#{aa2} have different symbols for constrained environment features each other."ad]R6j=<$ t \ [ 8 { ;  ,   {    ,  qcWV>dE7+*g*zpon\RK=10Q[  end 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = freq_array[i] } freq_array = group[1].find { |e| e.label.start_with?(aa) }.freq_array $amino_acids.each_with_index do |aa, aj| grp_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size) $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no| # for each combination of environment features group_matrices = [] $tot_cnt_mat = NMatrix.send($noweight ? 'int' : 'float', $amino_acids.size, $amino_acids.size) # count raw frequencies end end e.prob_array = 100.0 * e.freq_array / e.freq_array.sum if e.freq_array.sum != 0 $env_classes.values.each do |e| # calculating probabilities for each environment # # Generating substitution frequency matrices # # Part 5. # # Part 4. END # $outfh.puts '#' $outfh.puts '# REL_FREQ: Relative frequency' $outfh.puts '# REL_MUTB: Relative mutability (ALA = 100)' $outfh.puts '# MUTB: Mutability (MUT_OBS / TOT_OBS)' $outfh.puts '# MUT_OBS: Total count of mutation' $outfh.puts '# TOT_OBS: Total count of incidence' $outfh.puts '# RES: Amino acid one letter code' $outfh.puts '#' end end $sigma = min_sigma $logger.warn "The sigma value has been changed from #{$sigma} to #{min_sigma}." if $autosigma $logger.warn "We recommend you to use a sigma value equal to or smaller than #{min_sigma}." if min_cnt > -1 end end [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]] $outfh.puts '# %-3s %9.2f %9.2f %5.2f %8d %8.4f' % else [res, $aa_tot_cnt[res], $aa_mut_cnt[res], $aa_mutb[res], $aa_rel_mutb[res], $aa_tot_freq[res]] $outfh.puts '# %-3s %9d %9d %5.2f %8d %8.4f' % if $noweight $amino_acids.each do |res| end $aa_tot_freq[res] = ($aa_tot_cnt[res] == 0) ? 0.0 : ($aa_tot_cnt[res] / $tot_aa.to_f) $aa_rel_mutb[res] = $aa_mutb[res] * ala_factor $aa_mutb[res] = ($aa_tot_cnt[res] == 0) ? 1.0 : ($aa_mut_cnt[res] / $aa_tot_cnt[res].to_f) end $logger.warn "The current sigma value, #{$sigma} seems to be too big for the total count (#{"%.2f" % $aa_tot_cnt[res]}) of amino acid, #{res}." end min_sigma = min_cnt / $min_cnt_sigma_ratio min_cnt = $aa_tot_cnt[res] elsif (min_cnt > 0) && (min_cnt > $aa_tot_cnt[res]) min_sigma = min_cnt / $min_cnt_sigma_ratio min_cnt = $aa_tot_cnt[res] if min_cnt < 0 if ($aa_tot_cnt[res] / $sigma) < $min_cnt_sigma_ratio $amino_acids.each do |res| min_sigma = nil min_cnt = -1 $outfh.puts "# %-3s %9s %9s %5s %8s %8s" % %w[RES TOT_OBS MUT_OBS MUTB REL_MUTB REL_FREQ] $outfh.puts "# Total amino acid frequencies:\n" $outfh.puts '#' $tot_aa = $aa_tot_cnt.values.sum end 100.0 * $aa_tot_cnt['A'] / $aa_mut_cnt['A'].to_f else 0.0 elsif $aa_mut_cnt['A'] == 0 0.0 ala_factor = if $aa_tot_cnt['A'] == 0 # print them as default statistics in the header part # calculate amino acid frequencies and mutabilities, andadDHedLlk N p - f ! ] t 4 CDV ^N@?m0lm"DC :rvg_height => $rvg_height, :rvg_width => $rvg_width, :row_header => $amino_acids, heatmap = $tot_cnt_mat.heatmap(:col_header => $amino_acids, stem = "#{group_matrices.size}. TOTAL" if $heatmap == 0 or $heatmap == 2 :row_header => $amino_acids) $outfh.puts $tot_cnt_mat.pretty_string(:col_header => $amino_acids, $outfh.puts '>Total' # total end $logger.info "Generating heat maps in a file, #{file} done." :min_val => 0).write(file) :max_val => grp_max_val.ceil, :rvg_width => $rvg_width, heatmaps.heatmap(:columns => $heatmapcol, file = "#{$heatmapstem}.#{$heatmapformat}" if $heatmap == 1 or $heatmap == 2 end end :title_font_size => $rvg_width * $heatmapcol / 100.0) :title => stem, :print_gradient => false, :print_value => $heatmapvalues, :min_val => 0, :max_val => grp_max_val.ceil, :canvas_height => $canvas_height - 50, :canvas_width => $canvas_width, :rvg_height => $rvg_height - 50, :rvg_width => $rvg_width, :row_header => $amino_acids, heatmaps << grp_cnt_mat.heatmap(:col_header => $amino_acids, title_font_size = $rvg_width * $heatmapcol / 80.0 if $heatmap == 1 or $heatmap == 2 end $logger.info "Generating a heat map for #{stem} table done." :title => stem).write("#{stem}.#{$heatmapformat}") :print_value => $heatmapvalues, :min_val => 0, :max_val => grp_max_val.ceil, :canvas_height => $canvas_height, :canvas_width => $canvas_width, :rvg_height => $rvg_height, :rvg_width => $rvg_width, :row_header => $amino_acids, grp_cnt_mat.heatmap(:col_header => $amino_acids, if $heatmap == 0 or $heatmap == 2 # for a heat map :row_header => $amino_acids) $outfh.puts grp_cnt_mat.pretty_string(:col_header => $amino_acids, $outfh.puts ">#{grp_label} #{grp_no}" stem = "#{grp_no}. #{grp_label}" # for a matrix file group_matrices.each_with_index do |(grp_label, grp_cnt_mat), grp_no| $heatmapcol ||= Math::sqrt(group_matrices.size).round grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2 if $output == 0 $logger.info "Counting substitutions done." end group_matrices << [group[0], grp_cnt_mat] $tot_cnt_mat += grp_cnt_matad*Vg   ~ l b , " ! \  S   T  d c E D KJl q"1Zr-` :min_val => 0, :max_val => grp_max_val.ceil, :canvas_height => $canvas_height, :canvas_width => $canvas_width, :rvg_height => $rvg_height, :rvg_width => $rvg_width, :row_header => $amino_acids, grp_prob_mat.heatmap(:col_header => $amino_acids, if $heatmap == 0 or $heatmap == 2 # for a heat map :row_header => $amino_acids) $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, $outfh.puts ">#{grp_label} #{grp_no}" stem = "#{grp_no}. #{grp_label}" # for a matrix file group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no| $heatmapcol ||= Math::sqrt(group_matrices.size).round grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100 heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2 if $output == 1 end group_matrices << [group[0], grp_prob_mat] $tot_cnt_mat += grp_cnt_mat end 0.upto($amino_acids.size - 1) { |i| grp_prob_mat[aj, i] = env_class.prob_array[i] } 0.upto($amino_acids.size - 1) { |i| grp_cnt_mat[aj, i] = env_class.freq_array[i] } env_class = group[1].find { |e| e.label.start_with?(aa) } $amino_acids.each_with_index do |aa, aj| grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size) grp_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size) $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no| group_matrices = [] $env_classes.values.each { |e| e.prob_array = 100.0 * e.freq_array / e.freq_array.sum } # re-calculate probability vector for each environment class $env_classes.values.each { |e| e.freq_array += pseudo_cnt } # add pseudo counts for each frequency vector pseudo_cnt = $add || (1.0 / $env_classes.group_size) # for each combination of environment features $tot_cnt_mat = NMatrix.float($amino_acids.size, $amino_acids.size) # reinitialize $tot_cnt_mat for pseudocounts if ($output > 0) && $nosmooth # when nosmoothing !!! endHEADER## any other residue type (i) and sums up to 100.# a particular structural environment (specified after >) leading to # likelihood of acceptance of a mutational event by a residue type j in # Each column (j) represents the probability distribution for the # $outfh.puts < stem).write("#{stem}.#{$heatmapformat}") :print_value => $heatmapvalues, :min_val => 0, :max_val => $tot_cnt_mat.max.ceil, :canvas_height => $canvas_height, :canvas_width => $canvas_width,ad)mLYX 3 8 3 J Y +  @   KJM?>$aHG*=p)Y|{&ml p1 = NArray.float($amino_acids.size) # # p1 probabilities # if ($output > 0) && !$nosmooth # when smoothing!!! end $logger.info 'Calculating substitution probabilities (no smoothing) done.' end exit 0 end $logger.info "Generating a heat map for #{stem} table done." :title => stem).write("#{stem}.#{$heatmapformat}") :print_value => $heatmapvalues, :min_val => 0, :max_val => $tot_prob_mat.max.ceil, :canvas_height => $canvas_height, :canvas_width => $canvas_width, :rvg_height => $rvg_height, :rvg_width => $rvg_width, :row_header => $amino_acids, $tot_prob_mat.heatmap(:col_header => $amino_acids, stem = "#{group_matrices.size}. TOTAL" if $heatmap == 0 or $heatmap == 2 # for a heat map $outfh.close :row_header => $amino_acids) $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, $outfh.puts '>Total' if $output == 1 end 0.upto($amino_acids.size - 1) { |i| $tot_prob_mat[aj, i] = 100.0 * $tot_cnt_mat[aj, i] / col_sum } col_sum = (0..$amino_acids.size - 1).inject(0) { |s, i| s + $tot_cnt_mat[aj, i] } 0.upto($amino_acids.size - 1) do |aj| $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size) end end $logger.info "Generating heat maps in a file, #{file} done." :min_val => 0).write(file) :max_val => grp_max_val.ceil, :rvg_width => $rvg_width, heatmaps.heatmap(:columns => $heatmapcol, file = "#{$heatmapstem}.#{$heatmapformat}" if $heatmap == 1 or $heatmap == 2 # for heat maps in a single file end end :title_font_size => title_font_size) :title => stem, :print_gradient => false, :print_value => $heatmapvalues, :min_val => 0, :max_val => grp_max_val.ceil, :canvas_height => $canvas_height - 50, :canvas_width => $canvas_width, :rvg_height => $rvg_height - 50, :rvg_width => $rvg_width, :row_header => $amino_acids, heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids, title_font_size = $rvg_width * $heatmapcol / 80.0 if $heatmap == 1 or $heatmap == 2 end $logger.info "Generating a heat map for #{stem} table done." :title => stem).write("#{stem}.#{$heatmapformat}") :print_value => $heatmapvalues,ad[Zb'^ U k ] \ P 7 + n l e L ) l e T 1 { z Y <   ZWWTo8z?  nXW1P prob_arr = NArray.float($amino_acids.size) freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array } envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) } # and calculate amino acid frequencies and their probabilities for all the environments # get environments matching the pattern created above end next $logger.debug "Skipped the environment class, #{pattern}, due to partial smoothing." if pattern =~ /^\./ end pattern[i] = l l = label[1].chr i = label[0].chr.to_i labels.each do |label| pattern = '.' * $env_features.size c1[0].product(*c1[1..-1]).each do |labels| env_labels.combination(ci) do |c1| end next $logger.debug "Skipped the level #{ci + 1} probabilities, due to partial smoothing." if (ci > 2) && (ci < $env_features.size) # for partial smoothing, only P1 ~ P3, and Pn are considered 1.upto($env_features.size) do |ci|HEADER## sigma value used is: #{$sigma}# # Weights (omegas) are calculated as in Topham et al. (1993)# # A3(ri|Rj,fq) = sum over fq omega_c * pc3(Rj,fq)# where# p5(ri|Rj,...) = omega1 * A3(ri|Rj,fq) + omega2 * W5(ri|Rj...)# p5(ri|Rj,...) is estimated as:# ^^^^^^^^^# The smoothing procedure is curtailed here and finally# # A2(ri|fq) = p2(ri|fq) (fixed fq to be Rj; partial smoothing)# where# p3(ri|Rj,fq) = omega1 * A2(ri|fq) + omega2 * W3(ri|Rj,fq)# p3(ri|Rj,fq) is estimated as:# # p2(ri|Rj) = omega1 * p1(ri) + omega2 * W2(ri|Rj)# p2(ri|Rj) is estimated as: $outfh.puts < 1 priors = [] # collect priors 0.upto($amino_acids.size - 1) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f } prob_arr = NArray.float($amino_acids.size) freq_arr = envs.inject(NArray.float($amino_acids.size)) { |sum, env| sum + env.freq_array } envs = $env_classes.values.select { |env| env.label.match(pattern.to_re) } # get environmetns, frequencies, and probabilities end pattern[j] = l l = label[1].chr j = label[0].chr.to_i labels.each do |label| pattern = '.' * $env_features.size c1[0].product(*c1[1..-1]).each do |labels| env_labels.combination(ci) do |c1| 1.upto($env_features.size) do |ci| # full smootingHEADER## sigma value used is: #{$sigma}# # Weights (omegas) are calculated as in Topham et al. (1993)# # An-1(ri|f1q,f2q,...,fn-2q) = sum over fq omega_c * pcn-1(f1q,f2q,...,fn-2q)# where# pn(ri|f1q,f2q,...,fn-1q) = omega1 * An-1(ri|f1q, f2q,...,fn-2q) + omega2 * Wn(ri|f1q,f2q,...,fn-1q)# pn(ri|f1q,f2q,...,fn-1q) is estimated as:## ^^^^^^^^^^^^^# The smoothing procedure is NOT curtailed here and it goes upto# # A2(ri|fq) = p2(ri|fq) (not fixed fq; full smoothing)# where# p3(ri|f1q,f2q) = omega1 * A2(ri|f1q) + omega2 * W3(ri|f1q,f2q)# p3(ri|f1q,f2q) is estimated as:# # (NOTE: f1q is not fixed to be Rj in the full smoothing procedure)## p2(ri|f1q) = omega1 * p1(ri) + omega2 * W2(ri|fq)# p2(ri|f1q) is estimated as:## p1(ri) = omega1 * A0 + omega2 * W1(ri)# p1(ri) is estimated as:## Full Smoothing:# $outfh.puts < $canvas_height - 50, :canvas_width => $canvas_width, :rvg_height => $rvg_height - 50, :rvg_width => $rvg_width, :row_header => $amino_acids, heatmaps << grp_prob_mat.heatmap(:col_header => $amino_acids, title_font_size = $rvg_width * $heatmapcol / 80.0 if $heatmap == 1 or $heatmap == 2 end $logger.info "Generating a heat map for #{stem} table done." :title => stem).write("#{stem}.#{$heatmapformat}") :print_value => $heatmapvalues, :min_val => 0, :max_val => grp_max_val.ceil, :canvas_height => $canvas_height, :canvas_width => $canvas_width, :rvg_height => $rvg_height, :rvg_width => $rvg_width, :row_header => $amino_acids, grp_prob_mat.heatmap(:col_header => $amino_acids, if $heatmap == 0 or $heatmap == 2 # for heat map generation :row_header => $amino_acids) $outfh.puts grp_prob_mat.pretty_string(:col_header => $amino_acids, $outfh.puts ">#{grp_label} #{grp_no}" stem = "#{grp_no}. #{grp_label}" # for a matrix file group_matrices.each_with_index do |(grp_label, grp_prob_mat), grp_no| $heatmapcol ||= Math::sqrt(group_matrices.size).round grp_max_val = group_matrices.map { |l, m, n| m }.map { |m| m.max }.max || 100 heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2 if $output == 1 end group_matrices << [group[0], grp_prob_mat] end 0.upto($amino_acids.size - 1) { |j| grp_prob_mat[ai, j] = smooth_prob_arr[j] } smooth_prob_arr = group[1].find { |e| e.label.start_with?(aa) }.smooth_prob_array $amino_acids.each_with_index do |aa, ai| grp_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size) # calculating 21X21 substitution probability matrix for each envrionment $env_classes.groups_sorted_by_residue_labels.each do |group| group_matrices = [] # sorting environments and build 21X21 substitution matrices end env.smooth_prob_array = $smooth_prob[$env_features.size + 1][env.label_set] $env_classes.values.each do |env| # updating smoothed probability array for each envrionment end $logger.info 'Calculating substitution probabilities (full smoothing) done.' end end end end $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr $smooth_prob[ci + 1] = {} else $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr if $smooth_prob.has_key?(ci + 1) # store smoothed probabilties in a hash using a set of envrionment labels as a key 0.upto($amino_acids.size - 1) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) } smooth_prob_arr_sum = smooth_prob_arr.sumad[a u  h . z y .    U # | + u @ s/Srq&j`H)&Y%jcUT7[6 NMatrix.float($amino_acids.size, $amino_acids.size + 1) : grp_logo_mat = $cys == 0 ? grp_envs = group[1] grp_label = group[0] # calculating substitution probability matrix for each envrionment $env_classes.groups_sorted_by_residue_labels.each_with_index do |group, group_no| factor = $scale / Math::log(2) grp_logo_mats = [] endHEADER# ^^^^^^^^^^^^^^^^^^^^^^^# which were derived from the environment-independent amino acid frequencies. $outfh.puts < stem).write("#{stem}.#{$heatmapformat}") :print_value => $heatmapvalues, :min_val => 0, :max_val => $tot_prob_mat.max.ceil, :canvas_height => $canvas_height, :canvas_width => $canvas_width, :rvg_height => $rvg_height, :rvg_width => $rvg_width, :row_header => $amino_acids, $tot_prob_mat.heatmap(:col_header => $amino_acids, stem = "#{group_matrices.size}. TOTAL" if $heatmap == 0 or $heatmap == 2 # for a heat map $outfh.close :row_header => $amino_acids) $outfh.puts $tot_prob_mat.pretty_string(:col_header => $amino_acids, $outfh.puts '>Total' if $output == 1 end end $tot_prob_mat[aj, ai] = $smooth_prob[2][["0#{aa}"].to_set][ai] 0.upto($amino_acids.size - 1) do |ai| $amino_acids.each_with_index do |aa, aj| $tot_prob_mat = NMatrix.float($amino_acids.size, $amino_acids.size) # for a total substitution probability matrix end end $logger.info "Generating heat maps in a file, #{file} done." :min_val => 0).write(file) :max_val => grp_max_val.ceil, :rvg_width => $rvg_width, heatmaps.heatmap(:columns => $heatmapcol, file = "#{$heatmapstem}.#{$heatmapformat}" if $heatmap == 1 or $heatmap == 2 # for heat maps in a single file end end :title_font_size => title_font_size) :title => stem, :print_gradient => false, :print_value => $heatmapvalues, :min_val => 0, :max_val => grp_max_val.ceil,adtSz*xw  J  4 " : (   W  r5f C3%$#LkK9)g~|utsHEADER## For total (composite) matrix, Entropy = #{"%5.4f" % tot_H} bits, Expected score = #{"%5.4f" % tot_E} $outfh.puts < title_font_size) :title => stem, :print_gradient => false, :print_value => $heatmapvalues, :min_val => -1 * abs_max_val.ceil, :mid_val => 0, :max_val => abs_max_val.ceil, :gradient_end_color => '#FF0000', :gradient_mid_color => '#FFFFFF', :gradient_beg_color => '#0000FF', :canvas_height => $canvas_height - 50, :canvas_width => $canvas_width, :rvg_height => $rvg_height - 50, :rvg_width => $rvg_width, :row_header => row_header, heatmaps << grp_logo_mat.heatmap(:col_header => $amino_acids, title_font_size = $rvg_width * $heatmapcol / 80.0 if $heatmap == 1 or $heatmap == 2 end $logger.info "Generating a heat map for #{stem} table done." :title => stem).write("#{stem}.#{$heatmapformat}") :print_value => $heatmapvalues, :min_val => -1 * abs_max_val.ceil, :mid_val => 0, :max_val => abs_max_val.ceil, :gradient_end_color => '#FF0000', :gradient_mid_color => '#FFFFFF', :gradient_beg_color => '#0000FF', :canvas_height => $canvas_height, :canvas_width => $canvas_width, :rvg_height => $rvg_height, :rvg_width => $rvg_width, :row_header => row_header, grp_logo_mat.heatmap(:col_header => $amino_acids, if $heatmap == 0 or $heatmap == 2 # for a heat map :row_header => row_header) $outfh.puts grp_logo_mat.pretty_string(:col_header => $amino_acids, $outfh.puts ">#{grp_label} #{grp_no}" # for a matrix file end grp_logo_mat = grp_logo_mat.round unless $noroundoff stem = "#{grp_no}. #{grp_label}" grp_logo_mat = arr[1] grp_label = arr[0] grp_logo_mats.each_with_index do |arr, grp_no| $heatmapcol ||= Math::sqrt(grp_logo_mats.size).round heatmaps = HeatmapArray.new if $heatmap == 1 or $heatmap == 2 row_header = $cys ? $amino_acids + %w[U] : $amino_acids abs_max_val = [grp_max_val.abs, grp_min_val.abs].max grp_min_val = grp_logo_mats.map { |l, m| m }.map { |m| m.min }.min grp_max_val = grp_logo_mats.map { |l, m| m }.map { |m| m.max }.maxad-9~jK,_H( x j ^ ] *   y E , f J +   K A @  _ A # dH,nN.`@"bD& mR.e@?5 z98 [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ], [ '--help', '-h', GetoptLong::NO_ARGUMENT ], opts = GetoptLong.new( # # Parsing options # # Part 2. # # Part 1 END # $min_cnt_sigma_ratio = 500.0 # minimum ratio of amino acid count to sigma value $tot_smooth_prob = {} $tot_logo_mat = nil $tot_prob_mat = nil $tot_cnt_mat = nil $smooth_prob = {} $aa_env_cnt = Hash.new(0) $aa_tot_freq = {} $aa_rel_mutb = {} $aa_mutb = {} $aa_mut_cnt = Hash.new(0) $aa_tot_cnt = Hash.new(0) $cell_height = 20 $cell_width = 20 $canvas_height = 650 $canvas_width = 550 $rvg_height = 650 $rvg_width = 550 $heatmapvalues = false $heatmapstem = 'heatmaps' $heatmapformat = 'png' $heatmapcol = nil $heatmap = nil $penv = false $targetenv = false $cys = 0 $add = nil $scale = 3 $pidmax = nil $pidmin = nil $scale = 3 $p1smooth = false $noroundoff = false $nosmooth = false $smooth = :partial $noweight = false $weight = 60 $autosigma = false $sigma = 5.0 $tot_aa = 0 $ali_size = 0 $output = 2 # default: log odds matrix $outfh = nil # file hanfle for outfile $outfile = 'allmat.dat' $classdef = 'classdef.dat' $tem_file = nil $tem_list = nil $amino_acids = 'ACDEFGHIKLMNPQRSTVWYJ'.split('') # default set of 21 amino acids including J (Cysteine, the free thiol form) $logger.level = Logger::WARN $logger = Logger.new(STDOUT) # # Global variables and their default values # # Part 1. # arr: array # mat: matrix # ali: alignment # ff: flat file # fh: file handle # opts: options # logo: log odds ratio # prob: probability # freq: frequency # mutb: mutability # mut: mutation # cnt: count # jnt: joint # rel: relative # tot: total # aa: weighted amino acid # aa: amino acid # classdef: (envlironment) class definition # tem: (FUGUE) template # env: environment # # * Abbreviations in the codes # def execute(arguments=[]) # :nodoc: end pid = 100.0 * ident.to_f / (align + intgp) end end intgp += 1 ((col[0] != '-') && (col[1] == '-'))) elsif (((col[0] == '-') && (col[1] != '-')) || end ident += 1 if col[0] == col[1] align += 1 if (col[0] != '-') && (col[1] != '-') cols.each do |col| intgp = 0 # no. of internal gaps ident = 0 # no. of identical columns align = 0 # no. of aligned columns cols = aas1.zip(aas2) aas2 = seq2.split('') aas1 = seq1.split('') def calculate_pid(seq1, seq2) # # Ulla::CLI::calculate_pid(seq1, seq2) -> Float # :call-seq:adY~=K S  U  W  ` K 1   d B & a$]AvS7tY2kN"zT2 yJ when 3 then 'bmp' when 2 then 'jpg' when 1 then 'gif' when 0 then 'png' $heatmapformat = case arg.to_i when '--heatmap-format' $heatmapstem = arg.to_s when '--heatmap-stem' $heatmapcol = arg.to_i when '--heatmap-columns' end exit1 warn "--heatmap #{arg.to_i} is not allowed." else when (0..2) then arg.to_i $heatmap = case arg.to_i when '--heatmap' $penv = true exit 1 warn "--penv option is not supported." when '--penv' $add = arg.to_f when '--add' $scale = arg.to_f when '--scale' $p1smooth = true when '--p1smooth' $nosmooth = true when '--nosmooth' $smooth = (arg.to_i == 1) ? :full : :partial when '--smooth' $noroundoff = true when '--noroundoff' $noweight = true when '--noweight' $pidmax = arg.to_f when '--pidmax' $pidmin = arg.to_f when '--pidmin' $autosigma = true when '--autosigma' $sigma = arg.to_f when '--sigma' $weight = arg.to_i when '--weight' $targetenv = (arg.to_i == 1) ? true : false when '--targetenv' $cys = arg.to_i when '--cys' $outfile = arg when '--outfile' $output = arg.to_i when '--output' $classdef = arg when '--classdef' $tem_file = arg when '--tem-file' $tem_list = arg when '--tem-list' exit 0 print_usage when '--help' case opt opts.each do |opt, arg| begin ) [ '--version', GetoptLong::NO_ARGUMENT ] [ '--verbose', '-v', GetoptLong::REQUIRED_ARGUMENT ], [ '--outfile', '-o', GetoptLong::REQUIRED_ARGUMENT ], [ '--penv', GetoptLong::NO_ARGUMENT ], [ '--cys', '-y', GetoptLong::REQUIRED_ARGUMENT ], [ '--targetenv','-t', GetoptLong::REQUIRED_ARGUMENT ], [ '--output', GetoptLong::REQUIRED_ARGUMENT ], [ '--heatmap-values', GetoptLong::NO_ARGUMENT ], [ '--heatmap-columns',GetoptLong::REQUIRED_ARGUMENT ], [ '--heatmap-format', GetoptLong::REQUIRED_ARGUMENT ], [ '--heatmap-stem', GetoptLong::REQUIRED_ARGUMENT ], [ '--heatmap', GetoptLong::REQUIRED_ARGUMENT ], [ '--autosigma', GetoptLong::NO_ARGUMENT ], [ '--sigma', GetoptLong::REQUIRED_ARGUMENT ], [ '--noroundoff', GetoptLong::NO_ARGUMENT ], [ '--noweight', GetoptLong::NO_ARGUMENT ], [ '--weight', '-w', GetoptLong::REQUIRED_ARGUMENT ], [ '--p1smooth', GetoptLong::NO_ARGUMENT ], [ '--nosmooth', GetoptLong::NO_ARGUMENT ], [ '--smooth', '-s', GetoptLong::REQUIRED_ARGUMENT ], [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ], [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],