# encoding: utf-8
module Cheripic

  # Custom error handling for Pileup class
  class PileupError < CheripicError; end

  require 'bio-samtools'
  require 'bio/db/pileup'

  # An extension of Bio::DB::Pileup object to process pileup information at a given position
  class Pileup < Bio::DB::Pileup

    # creates a Pileup object using a pileup information as string
    # @param string [String] pileup information line for a given position
    def initialize(string)
      super(string)
      adj_read_bases
      @indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN'
    end

    # removes mapping quality information
    def adj_read_bases
      # mapping quality after '^' symbol is substituted
      # to avoid splitting at non indel + or - characters
      # read ends marking by '$' symbol is substituted
      # insertion and deletion marking by '*' symbol is substituted
      self.read_bases.gsub!(/\^./, '')
      self.read_bases.delete! '$'
      self.read_bases.delete! '*'
      # warn about reads with ambiguous codes
      # if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/)
      #   warn "Ambiguous nucleotide\t#{self.read_bases}"
      # end
    end

    # count bases matching reference and non-reference
    # from snp variant and make a hash of bases with counts
    # for indels return the read bases information instead
    def bases_hash
      if self.read_bases =~ /\+/
        bases_hash = indels_to_hash('+')
      elsif self.read_bases =~ /-/
        bases_hash = indels_to_hash('-')
      else
        bases_hash = snp_base_hash(self.read_bases)
      end
      # some indels will have ref base in the read and using
      # sum of hash values is going to give wrong additional coverage
      # from indels so including actual coverage from pileup
      # bases_hash keys are :A, :C, :G, :T, :N, :ref and :indel
      bases_hash
    end

    # count bases from indels
    # array of pileup bases is split at + / -
    # and number after each + / - is counted
    def count_indel_bases(delimiter)
      array = self.read_bases.split(delimiter)
      number = 0
      array.shift
      array.each do |element|
        # deletions in reference could contain ambiguous codes,
        number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
      end
      number
    end

    # count bases matching reference and non-reference
    # and calculate ratio of non_ref allele to total bases
    def non_ref_count
      read_bases = self.read_bases
      if read_bases =~ /\+/
        non_ref_count = indel_non_ref_count('+')
      elsif read_bases =~ /-/
        non_ref_count = indel_non_ref_count('-')
      else
        non_ref_count = read_bases.count('atgcATGC')
      end
      non_ref_count
    end

    # check if the pileup has the parameters we are looking for
    def is_var
      ignore_reference_n = Options.ignore_reference_n
      min_depth  = Options.mindepth
      min_non_ref_count = Options.min_non_ref_count

      return false if self.ref_base == '*'
      return false if ignore_reference_n and self.ref_base =~ /^[nN]$/
      return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count
      false
    end

    # count bases matching reference and non-reference
    # and calculate ratio of non_ref allele to total bases
    def non_ref_ratio
      self.non_ref_count.to_f / self.coverage.to_f
    end

    # calculate var zygosity for non-polyploid variants
    # increased range is used for heterozygosity for RNA-seq data
    # def var_mode
    #   ht_low = @defaults[:ht_low]
    #   ht_high = @defaults[:ht_high]
    #   mode = ''
    #   if self.non_ref_ratio.between?(ht_low, ht_high)
    #     mode = :het
    #   elsif self.non_ref_ratio > ht_high
    #     mode = :hom
    #   end
    #   mode
    # end

    # form hash of base information, [ATGC] counts for snp
    # a hash of base proportion is calculated
    # base proportion hash below a selected depth is empty
    # base proportion below or equal to a noise factor are discarded
    def var_base_frac
      hash = self.bases_hash
      snp_hash = {}
      coverage = self.coverage
      return snp_hash if coverage < Options.mindepth
      # calculate proportion of each base in coverage
      hash.each_key do | base |
        freq = hash[base].to_f/coverage.to_f
        next if freq <= Options.noise
        snp_hash[base] = freq
      end
      snp_hash
    end


    private

    # count number of indels and number non-indel base
    # and return a hash with bases and indel counts
    def indels_to_hash(delimiter)
      non_indel_bases = String.new
      array = self.read_bases.split(delimiter)
      non_indel_bases << array.shift
      array.each do |element|
        # get number of nucleotides inserted or deleted
        number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
        # capture remaining nucleotides
        non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '')
      end
      bases_hash = snp_base_hash(non_indel_bases)
      # check at least three reads are supporting indel
      indel_count = self.read_bases.count(delimiter)
      if indel_count >= Options.min_indel_count_support
        bases_hash[:indel] = indel_count
      end
      bases_hash
    end

    def snp_base_hash(readbases)
      non_indel_base_hash = {}
      non_indel_base_hash[:ref] = readbases.count('.,')
      non_indel_base_hash[:A] = readbases.count('aA')
      non_indel_base_hash[:C] = readbases.count('cC')
      non_indel_base_hash[:G] = readbases.count('gG')
      non_indel_base_hash[:T] = readbases.count('tT')
      # non_indel_base_hash[:N] = read_bases.count('nN')
      non_indel_base_hash
    end

    def indel_non_ref_count(delimitter)
      read_bases = self.read_bases
      non_ref_count = read_bases.count(@indelbases)
      indelcounts = read_bases.count(delimitter)
      indel_bases = count_indel_bases(delimitter)
      non_ref_count + indelcounts - indel_bases
    end

  end

end