require "getoptlong"
require "rdoc/usage"
require "logger"
require "rubygems"
require "narray"
require "bio"
require "set"
require "facets"

require "narray_extensions"
require "nmatrix_extensions"
require "enumerable_extensions"
require "environment_feature"
require "environment"

module Rubst
  class CLI
    
    def self.print_version
      puts Rubst::VERSION
    end
    
    def self.print_usage
      puts <<-USAGE

  rubst [ options ] -f TEM-file
      or
  rubst [ options ] -l TEMLIST-file

  Available options:

  -h, --help
      show help
  -f, --tem-file FILE
      a tem file
  -l, --tem-list FILE
      a list for tem files
  -o, --outfile FILE
      output filename ("allmat.dat" if not specified)
  --weight INTEGER (PID)
      clustering level (PID) for the BLOSUM-like weighting (not supported yet)
  --noweight
      calculate substitution counts with no weights (default)
  -c, --classdef FILE
      a file for the defintion of environments (default: 'classdef.dat')
  -y, --cys INTEGER
      0 for using C and J only for structure
      1 for both structure and sequence (default)
  -output INTEGER
      0 for raw counts (no-smoothing performed)
      1 for probabilities
      2 for log-odds (default)
  --scale INTEGER
      log-odds matrices in 1/n bit units (default 3)
  --sigma DOUBLE
      change the sigma value for smoothing (default 5)
  --add DOUBLE
      add this value to raw counts when deriving log-odds without smoothing
      (default 1/#classes)
  --penv
      use environment-dependent frequencies for log-odds calculation (default false)
  --pidmin DOUBLE
      count substitutions only for pairs with PID equal to or greater than this value (default none)
  --pidmax DOUBLE
      count substitutions only for pairs with PID smaller than this value (default none)
  --analysis
      analyze structural environments (also -D)
  -v, --verbose INTEGER
      0 for WARN level (default)
      1 for INFO level or more severe
      2 for DEBUG level or more sever 
  --version
      print version

      USAGE
    end
    
    def self.execute(arguments=[])
      #
      # Abbreviations in the source codes
      #
      # * env: environment
      # * tem: (FUGUE) template
      # * classdef: (envlironment) class definition
      # * aa: amino acid
      # * tot: total
      # * rel: relative
      # * obs: observation (frequency)
      # * mut: mutation
      # * mutb: mutability
      # * freq: frequency
      # * prob: probability
      # * opts: options

      #
      # Global variables & Abbreviations
      #
      $logger       = Logger.new(STDOUT)
      $amino_acids  = "ACDEFGHIKLMNPQRSTVWYJ".split("")
      $tem_list     = nil
      $tem_file     = nil
      $classdef     = nil
      $outfile      = nil
      $format       = nil
      $aa_tot_obs   = {}
      $aa_mut_obs   = {}
      $aa_mutb      = {}
      $aa_rel_mutb  = {}
      $aa_rel_freq  = {}
      $tot_aa       = 0
      $sigma        = 5.0
      $smooth_prob  = {}

      #
      # Options parsing
      #
      opts = GetoptLong.new(
        [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
        [ '--tem-list', '-l', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--tem-file', '-f', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--classdef', '-c', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--output',         GetoptLong::REQUIRED_ARGUMENT ],
        [ '--cys',      '-y', GetoptLong::REQUIRED_ARGUMENT ],
        [               '-o', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--verbose',  '-v', GetoptLong::REQUIRED_ARGUMENT ],
        [ '--version',        GetoptLong::NO_ARGUMENT ]
      )

      opts.each do |opt, arg|
        case opt
        when '--help'
          print_usage
        when '--tem-list'
          $tem_list = arg
        when '--tem-file'
          $tem_file = arg
        when '--classdef'
          $classdef = arg       || 'classdef.dat'
        when '--output'
          $format   = arg.to_i  || 2
        when '-o'
          $outfile  = arg       || 'allmat.dat'
        when '--cyc'
          $cysteine = (arg.to_i == 1 ? false : true)
        when '--verbose'
          $logger.level = case arg.to_i
                          when 0 then Logger::WARN
                          when 1 then Logger::INFO
                          when 2 then Logger::DEBUG
                          else Logger::INFO
                          end
        when '--version'
          print_version
        end
      end

      if ARGV.length != 0 or
        !$tem_list && !$tem_file or
        $tem_list && $tem_file
        print_usage
        exit 0
      end

      #
      # Reading Environment Class Definition File
      #
      $env_features = []
      $env_features << EnvironmentFeature.new("sequence",
                                              $amino_acids,
                                              $amino_acids,
                                        "F",
                                        "F")

      IO.foreach($classdef) do |l|
        next if l =~ /^#/
          if (env_ftr = l.chomp.split(/;/)).length == 5
            $logger.info ">>> An environment feature, #{l.chomp} detected"
            if env_ftr[-1] == "T"
              $logger.warn "!!! The environment feature, #{l.chomp} silent"
              next
            end
            if env_ftr[-2] == "T"
              $logger.warn "!!! The environment feature, #{l.chomp} constrained"
            end
            $env_features << EnvironmentFeature.new(env_ftr[0],
                                                    env_ftr[1].split(""),
                                                    env_ftr[2].split(""),
                                                    env_ftr[3],
                                                    env_ftr[4])
          end
      end

      $envs = {}
      $env_features.inject([]) { |sum, ec|
        sum << ec.labels
      }.inject { |pro, lb|
        pro.product(lb)
      }.each_with_index { |e, i|
        $envs[e.flatten.join] = Environment.new(i, e.flatten.join)
      }

      #
      # Reading TEM list file
      #
      if $tem_list
        IO.foreach($tem_list) do |tem_file|
          tem_file.chomp!

          $logger.debug ">>> Processing #{tem_file} ..."

          ali = Bio::Alignment::OriginalAlignment.new
          ff  = Bio::FlatFile.auto(tem_file)
          ff.each_entry do |pir|
            if pir.definition == "sequence"
              ali.add_seq(pir.data.gsub("\n", ""), pir.entry_id)
            end
          end

          env_labels = {}
          disulphide = {}

          ali.each_pair do |key, seq|
            # check disulphide bond environment first!
            ff.rewind
            ff.each_entry do |pir|
              if (pir.entry_id == key) && (pir.definition == "disulphide")
                disulphide[key] = pir.data.gsub("\n", "").split("")
              end
            end

            $env_features.each_with_index do |ec, ei|
              env_labels[key] = [] unless env_labels.has_key?(key)

              ff.rewind
              ff.each_entry do |pir|
                if (pir.entry_id == key) && (pir.definition == ec.name)
                  labels = pir.data.gsub("\n", "").split("").map_with_index { |sym, pos|
                    if sym == "-"
                "-"
                    elsif sym == "X" || sym == "x"
                "X"
                    else
                      if ei == 0 # Amino Acid Environment Feature
                        ((disulphide[key][pos] == "F") && (sym == "C")) ? "J" : sym
                      else
                        ec.labels[ec.symbols.index(sym)]
                      end
                    end
                  }

                  if env_labels[key].empty?
                    env_labels[key] = labels
                  else
                    env_labels[key].each_with_index { |e, i| env_labels[key][i] = e + labels[i] }
                  end
                end
              end
            end
          end

          ali.each_pair do |key1, seq1|
            ali.each_pair do |key2, seq2|
              if key1 != key2
                s1 = seq1.split("")
                s2 = seq2.split("")
                s1.each_with_index do |source, pos|
                  if env_labels[key1][pos].include?("X")
                    $logger.info ">>> Substitutions from #{key1}-#{pos}-#{source} were masked"
                    next
                  end

                  source.upcase!
                  target = s2[pos].upcase

                  if !$amino_acids.include?(source)
                    $logger.info "!!! #{key1}-#{pos}-#{source} is not standard amino acid"
                    next
                  end

                  if !$amino_acids.include?(target)
                    $logger.info "!!! #{key1}-#{pos}-#{target} is not standard amino acid"
                    next
                  end

                  source = (((disulphide[key1][pos] == "F") && (source == "C")) ? "J" : source)
                  target = (((disulphide[key2][pos] == "F") && (target == "C")) ? "J" : target)

                  $envs[env_labels[key1][pos]].add_residue_count(target)

                  if $aa_tot_obs.has_key?(source)
                    $aa_tot_obs[source] += 1
                  else
                    $aa_tot_obs[source] = 1
                  end

                  if source != target
                    if $aa_mut_obs.has_key?(source)
                      $aa_mut_obs[source] += 1
                    else
                      $aa_mut_obs[source] = 1
                    end
                  end
                  $logger.debug ">>> Add #{key1}-#{pos}-#{source} -> #{key2}-#{pos}-#{target} substituion for #{env_labels[key1][pos]}"
                end
              end
            end
          end
        end # IO.foreach($tem_list)

        #  #
        #  # Frequency matrix
        #  #
        #  $envs.values.sort_by { |v| v.number }.each do |env|
        #    puts ">#{env.label} #{env.number}"
        #    puts env.freq_array.pretty_string(:col_header => $amino_acids,
        #                                      :row_header => "Prb")
        #  end

        $tot_freq_matrix = NMatrix.int(21,21)

        # for each combination of environment features
        env_groups = $envs.values.group_by { |env| env.label[1..-1] }
        env_groups.each_pair do |label, group|
          $grp_freq_matrix = NMatrix.int(21,21)

          $amino_acids.each_with_index do |aa, ai|
            freq_array = group.find { |e| e.label.start_with?(aa) }.freq_array
            0.upto($grp_freq_matrix.shape[1] - 1) do |j|
              $grp_freq_matrix[ai, j] = freq_array[j]
            end
          end

          $tot_freq_matrix += $grp_freq_matrix

          #    puts ">#{label}"
          #    puts $grp_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
        end

        #  # for total
        #  puts ">Total"
        #  puts $tot_freq_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)

        #
        # Amino Acid Frequencies and Mutabilities
        #
        ala_factor  = 100 * $aa_tot_obs["A"] / $aa_mut_obs["A"].to_f
        $tot_aa     = $aa_tot_obs.values.sum

        #  puts ">Total amino acid frequencies"
        #  puts "%-3s %8s %8s %8s %8s %8s" % %w[RES MUT_OBS TOT_OBS MUT REL_MUT REL_FRQ]

        $aa_tot_obs.each_pair do |res, freq|
          $aa_mutb[res]      = $aa_mut_obs[res] / freq.to_f
          $aa_rel_mutb[res]  = $aa_mutb[res] * ala_factor
          $aa_rel_freq[res]  = freq / $tot_aa.to_f

          #    puts "%-3s %8d %8d %8.2f %8d %8.4f" % [res, $aa_mut_obs[res], freq, $aa_mutb[res], $aa_rel_mutb[res], $aa_rel_freq[res]]
        end

        #
        # Probability matrices
        #
        tot_prob_matrix = NMatrix.float(21, 21)

        0.upto($tot_freq_matrix.shape[0] - 1) do |i|
          col_sum = 0
          0.upto($tot_freq_matrix.shape[1] - 1) do |j|
            col_sum += $tot_freq_matrix[i, j]
          end
          0.upto($tot_freq_matrix.shape[1] - 1) do |k|
            # normalized substitutions probabilities with mutability
            #tot_prob_matrix[i, k] = $aa_rel_mutb[$amino_acids[k]] * $tot_freq_matrix[i,k] / col_sum.to_f

            # raw substitution probabilities with just frequencies
            tot_prob_matrix[i, k] = 100 * $tot_freq_matrix[i,k] / col_sum.to_f
          end
        end

        #  puts ">Total probability"
        #  puts tot_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)

        #  # a new way of getting probability matrix
        #  new_tot_prob_matrix = NMatrix.float(21, 21)
        #
        #  0.upto($tot_freq_matrix.shape[0] - 1) do |i|
        #    col_sum = 0
        #    0.upto($tot_freq_matrix.shape[1] - 1) do |j|
        #      col_sum += ((1 - $aa_mutb[$amino_acids[j]]) / $aa_rel_freq[$amino_acids[j]]) * $tot_freq_matrix[i, j]
        #    end
        #    0.upto($tot_freq_matrix.shape[1] - 1) do |k|
        #      new_tot_prob_matrix[i, k] = 100 * ((1 - $aa_mutb[$amino_acids[k]]) / $aa_rel_freq[$amino_acids[k]]) * $tot_freq_matrix[i, k] / col_sum
        #    end
        #  end

        #
        # p1 probability
        #
        p1      = NArray.float(21)
        a0      = NArray.float(21).fill(1 / 21.0)
        big_N   = $tot_aa
        small_n = 21
        omega1  = 1.0 / (1 + big_N.to_f / ($sigma * small_n.to_f))
        omega2  = 1.0 - omega1

        0.upto(p1.shape[0] - 1) { |i| p1[i] = 100.0 * (omega1 * a0[i] + omega2 * $aa_rel_freq[$amino_acids[i]]) }
        $smooth_prob[1] = p1

        #  puts "P1 probability matrix"
        #  puts p1.pretty_string(:col_header => $amino_acids, :row_header => "Prb")
        #  puts p1.sum

        #
        # p2 and above
        #
        env_labels = $env_features.map_with_index {|ef, ei| ef.labels.map { |l| "#{ei}#{l}" } }

        1.upto($env_features.size) do |ci|
          env_labels.combination(ci) do |c1|
            Enumerable.cart_prod(*c1).each do |labels|
              pattern = "." * $env_features.size
              labels.each do |label|
                j = label[0].chr.to_i
                l = label[1].chr
                pattern[j] = l
              end

              # get environmetns, frequencies, and probabilities
              envs      = $envs.values.select { |env| env.label.match(pattern.to_re) }
              freq_arr  = envs.inject(NArray.int(21)) { |sum, env| sum + env.freq_array }

              #        if freq_arr.sum == 0
              #          $logger.warn "!!! Environment combination, #{labels.to_set} has no frequency"
              #
              #          # store smoothed probabilties in a hash using a set of envrionment labels as a key
              #          smooth_prob_arr = NArray.float(21).fill(0.0)
              #          if !$smooth_prob.has_key?(ci + 1)
              #            $smooth_prob[ci + 1] = {}
              #            $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
              #          else
              #            $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
              #          end
              #
              #          # print smoothed probabilties
              #          puts "P#{ci + 1} probability for the combination of environments, #{labels.join}"
              #          puts smooth_prob_arr.pretty_string(:col_header => $amino_acids, :row_header => "Prb")
              #          puts smooth_prob_arr.sum
              #          next
              #        end

              prob_arr = NArray.float(21)
              0.upto(20) { |i| prob_arr[i] = freq_arr[i] == 0 ? 0 : freq_arr[i] / freq_arr.sum.to_f }

              # collect priors
              priors  = []
              if ci > 1
                labels.combination(ci - 1).each { |c2| priors << $smooth_prob[ci][c2.to_set] }
              else
                priors << $smooth_prob[1]
              end

              # averaging priors... have a look at Entropy based normalization!
              #pri_avg         = priors.sum / priors.size

              # entropy based weighting priors
              entropy_max = Math::log(21)
              entropies = priors.map do |prior|
                (entropy_max + prior.to_a.inject(0.0) { |s, p| s + p * Math::log(p) }) / entropy_max
              end
              pri_avg = priors.map_with_index { |p, i| p * entropies[i] / entropies.sum }.sum


              smooth_prob_arr = NArray.float(21)
              big_N           = freq_arr.sum.to_f
              small_n         = 21.0
              omega1          = 1.0 / (1 + big_N / ($sigma * small_n))
              omega2          = 1.0 - omega1

              # smoothing step
              0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (omega1 * pri_avg[i] + omega2 * prob_arr[i]) }

              # normalization step
              smooth_prob_arr_sum = smooth_prob_arr.sum
              0.upto(20) { |i| smooth_prob_arr[i] = 100.0 * (smooth_prob_arr[i] / smooth_prob_arr_sum) }

              # store smoothed probabilties in a hash using a set of envrionment labels as a key
              if !$smooth_prob.has_key?(ci + 1)
                $smooth_prob[ci + 1] = {}
                $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
              else
                $smooth_prob[ci + 1][labels.to_set] = smooth_prob_arr
              end

              #        # print smoothed probabilties
              #        puts "P#{ci + 1} probability for the combination of environments, #{labels.join}"
              #        puts smooth_prob_arr.pretty_string(:col_header => $amino_acids, :row_header => "Prb")
              #        puts smooth_prob_arr.sum
            end
          end
        end

        # summarizing ...
        $envs.values.each { |e| e.smooth_prob_array = $smooth_prob[$env_features.size + 1][e.label_set] }
        #  $envs.values.each do |e|
        #    puts ">" + e.label
        #    puts e.smooth_prob_array.pretty_string(:col_header => $amino_acids, :row_header => "Prb")
        #  end

        tot_smooth_prob_matrix = NMatrix.float(21,21)

        # for each combination of environment features
        env_groups = $envs.values.group_by { |env| env.label[1..-1] }
        env_groups.sort.each do |label, group|
          grp_prob_matrix = NMatrix.float(21,21)

          $amino_acids.each_with_index do |aa, ai|
            smooth_prob_array = group.find { |e| e.label.start_with?(aa) }.smooth_prob_array
            0.upto(20) { |j| grp_prob_matrix[ai, j] = smooth_prob_array[j] }
          end

          tot_smooth_prob_matrix += grp_prob_matrix

          puts ">#{label}"
          puts grp_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)
        end

        tot_smooth_prob_matrix /= env_groups.size

        # for total
        puts ">Total Probability"
        puts tot_smooth_prob_matrix.pretty_string(:col_header => $amino_acids, :row_header => $amino_acids)

      end

    end
  end
end