lib/cheripic/pileup.rb in cheripic-1.0.0 vs lib/cheripic/pileup.rb in cheripic-1.1.0
- old
+ new
@@ -1,186 +1,177 @@
# encoding: utf-8
-require 'bio'
-require 'bio-samtools'
-require 'bio/db/pileup'
+module Cheripic
-class Pileup < Bio::DB::Pileup
+ # Custom error handling for Pileup class
+ class PileupError < CheripicError; end
- attr_accessor :defaults
+ require 'bio-samtools'
+ require 'bio/db/pileup'
- def initialize(string, opts={})
- super(string)
- set_defaults(opts)
- adj_read_bases
- @indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN'
- end
+ # An extension of Bio::DB::Pileup object to process pileup information at a given position
+ class Pileup < Bio::DB::Pileup
- def set_defaults(opts)
- @defaults = {
- noise: 0.1, # noise level for read depth
- ht_low: 0.2, # min allele freq for heterozygosity
- ht_high: 0.9, # max allele freq for heterozygosity
- min_depth: 6, # minimum coverage for variant
- min_non_ref_count: 3,
- ignore_reference_n: true,
- min_indel_count_support: 3,
- }
- @defaults.merge(opts)
- end
+ # creates a Pileup object using a pileup information as string
+ # @param string [String] pileup information line for a given position
+ def initialize(string)
+ super(string)
+ adj_read_bases
+ @indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN'
+ end
- # removes mapping quality information
- def adj_read_bases
- # mapping quality after '^' symbol is substituted
- # to avoid splitting at non indel + or - characters
- # read ends marking by '$' symbol is substituted
- # insertion and deletion marking by '*' symbol is substituted
- self.read_bases.gsub!(/\^./, '')
- self.read_bases.delete! '$'
- self.read_bases.delete! '*'
- # warn about reads with ambiguous codes
- # if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/)
- # warn "Ambiguous nucleotide\t#{self.read_bases}"
- # end
- end
+ # removes mapping quality information
+ def adj_read_bases
+ # mapping quality after '^' symbol is substituted
+ # to avoid splitting at non indel + or - characters
+ # read ends marking by '$' symbol is substituted
+ # insertion and deletion marking by '*' symbol is substituted
+ self.read_bases.gsub!(/\^./, '')
+ self.read_bases.delete! '$'
+ self.read_bases.delete! '*'
+ # warn about reads with ambiguous codes
+ # if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/)
+ # warn "Ambiguous nucleotide\t#{self.read_bases}"
+ # end
+ end
- # count bases matching reference and non-reference
- # from snp variant and make a hash of bases with counts
- # for indels return the read bases information instead
- def bases_hash
- if self.read_bases =~ /\+/
- bases_hash = indels_to_hash('+')
- elsif self.read_bases =~ /\-/
- bases_hash = indels_to_hash('-')
- else
- bases_hash = snp_base_hash(self.read_bases)
+ # count bases matching reference and non-reference
+ # from snp variant and make a hash of bases with counts
+ # for indels return the read bases information instead
+ def bases_hash
+ if self.read_bases =~ /\+/
+ bases_hash = indels_to_hash('+')
+ elsif self.read_bases =~ /-/
+ bases_hash = indels_to_hash('-')
+ else
+ bases_hash = snp_base_hash(self.read_bases)
+ end
+ # some indels will have ref base in the read and using
+ # sum of hash values is going to give wrong additional coverage
+ # from indels so including actual coverage from pileup
+ # bases_hash keys are :A, :C, :G, :T, :N, :ref and :indel
+ bases_hash
end
- # some indels will have ref base in the read and using
- # sum of hash values is going to give wrong additional coverage
- # from indels so including actual coverage from pileup
- # bases_hash keys are :A, :C, :G, :T, :N, :ref, :indel and :cov
- bases_hash[:cov] = self.coverage
- bases_hash
- end
- # count bases from indels
- # array of pileup bases is split at + / -
- # and number after each + / - is counted
- def count_indel_bases(delimiter)
- array = self.read_bases.split(delimiter)
- number = 0
- array.shift
- array.each do |element|
- # deletions in reference could contain ambiguous codes,
- number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
+ # count bases from indels
+ # array of pileup bases is split at + / -
+ # and number after each + / - is counted
+ def count_indel_bases(delimiter)
+ array = self.read_bases.split(delimiter)
+ number = 0
+ array.shift
+ array.each do |element|
+ # deletions in reference could contain ambiguous codes,
+ number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
+ end
+ number
end
- number
- end
- # count bases matching reference and non-reference
- # and calculate ratio of non_ref allele to total bases
- def non_ref_count
- read_bases = self.read_bases
- if read_bases =~ /\+/
- non_ref_count = indel_non_ref_count('+')
- elsif read_bases =~ /\-/
- non_ref_count = indel_non_ref_count('-')
- else
- non_ref_count = read_bases.count('atgcATGC')
+ # count bases matching reference and non-reference
+ # and calculate ratio of non_ref allele to total bases
+ def non_ref_count
+ read_bases = self.read_bases
+ if read_bases =~ /\+/
+ non_ref_count = indel_non_ref_count('+')
+ elsif read_bases =~ /-/
+ non_ref_count = indel_non_ref_count('-')
+ else
+ non_ref_count = read_bases.count('atgcATGC')
+ end
+ non_ref_count
end
- non_ref_count
- end
- # check if the pileup has the parameters we are looking for
- def is_var
- ignore_reference_n = @defaults[:ignore_reference_n]
- min_depth = @defaults[:min_depth]
- min_non_ref_count = @defaults[:min_non_ref_count]
+ # check if the pileup has the parameters we are looking for
+ def is_var
+ ignore_reference_n = Options.ignore_reference_n
+ min_depth = Options.mindepth
+ min_non_ref_count = Options.min_non_ref_count
- return false if self.ref_base == '*'
- return false if ignore_reference_n and self.ref_base =~ /^[nN]$/
- return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count
- false
- end
+ return false if self.ref_base == '*'
+ return false if ignore_reference_n and self.ref_base =~ /^[nN]$/
+ return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count
+ false
+ end
- # count bases matching reference and non-reference
- # and calculate ratio of non_ref allele to total bases
- def non_ref_ratio
- self.non_ref_count.to_f / self.coverage.to_f
- end
-
- # calculate var zygosity for non-polyploid variants
- # increased range is used for heterozygosity for RNA-seq data
- def var_mode
- ht_low = @defaults[:ht_low]
- ht_high = @defaults[:ht_high]
- mode = ''
- if self.non_ref_ratio.between?(ht_low, ht_high)
- mode = :het
- elsif self.non_ref_ratio > ht_high
- mode = :hom
+ # count bases matching reference and non-reference
+ # and calculate ratio of non_ref allele to total bases
+ def non_ref_ratio
+ self.non_ref_count.to_f / self.coverage.to_f
end
- mode
- end
- # form hash of base information, [ATGC] counts for snp
- # a hash of base proportion is calculated
- # base proportion hash below a selected depth is empty
- # base proportion below or equal to a noise factor are discarded
- def var_base_frac
- hash = self.bases_hash
- snp_hash = {}
- coverage = hash[:cov]
- return snp_hash if coverage < @defaults[:min_depth]
- # calculate proportion of each base in coverage
- hash.each_key do | base |
- next if base == :cov
- freq = hash[base].to_f/coverage.to_f
- next if freq <= @defaults[:noise]
- snp_hash[base] = freq
+ # calculate var zygosity for non-polyploid variants
+ # increased range is used for heterozygosity for RNA-seq data
+ # def var_mode
+ # ht_low = @defaults[:ht_low]
+ # ht_high = @defaults[:ht_high]
+ # mode = ''
+ # if self.non_ref_ratio.between?(ht_low, ht_high)
+ # mode = :het
+ # elsif self.non_ref_ratio > ht_high
+ # mode = :hom
+ # end
+ # mode
+ # end
+
+ # form hash of base information, [ATGC] counts for snp
+ # a hash of base proportion is calculated
+ # base proportion hash below a selected depth is empty
+ # base proportion below or equal to a noise factor are discarded
+ def var_base_frac
+ hash = self.bases_hash
+ snp_hash = {}
+ coverage = self.coverage
+ return snp_hash if coverage < Options.mindepth
+ # calculate proportion of each base in coverage
+ hash.each_key do | base |
+ freq = hash[base].to_f/coverage.to_f
+ next if freq <= Options.noise
+ snp_hash[base] = freq
+ end
+ snp_hash
end
- snp_hash
- end
- private
+ private
- # count number of indels and number non-indel base
- # and return a hash with bases and indel counts
- def indels_to_hash(delimiter)
- non_indel_bases = String.new
- array = self.read_bases.split(delimiter)
- non_indel_bases << array.shift
- array.each do |element|
- # get number of nucleotides inserted or deleted
- number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
- # capture remaining nucleotides
- non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '')
+ # count number of indels and number non-indel base
+ # and return a hash with bases and indel counts
+ def indels_to_hash(delimiter)
+ non_indel_bases = String.new
+ array = self.read_bases.split(delimiter)
+ non_indel_bases << array.shift
+ array.each do |element|
+ # get number of nucleotides inserted or deleted
+ number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
+ # capture remaining nucleotides
+ non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '')
+ end
+ bases_hash = snp_base_hash(non_indel_bases)
+ # check at least three reads are supporting indel
+ indel_count = self.read_bases.count(delimiter)
+ if indel_count >= Options.min_indel_count_support
+ bases_hash[:indel] = indel_count
+ end
+ bases_hash
end
- bases_hash = snp_base_hash(non_indel_bases)
- # check at least three reads are supporting indel
- indel_count = self.read_bases.count(delimiter)
- if indel_count >= @defaults[:min_indel_count_support]
- bases_hash[:indel] = indel_count
+
+ def snp_base_hash(readbases)
+ non_indel_base_hash = {}
+ non_indel_base_hash[:ref] = readbases.count('.,')
+ non_indel_base_hash[:A] = readbases.count('aA')
+ non_indel_base_hash[:C] = readbases.count('cC')
+ non_indel_base_hash[:G] = readbases.count('gG')
+ non_indel_base_hash[:T] = readbases.count('tT')
+ # non_indel_base_hash[:N] = read_bases.count('nN')
+ non_indel_base_hash
end
- bases_hash
- end
- def snp_base_hash(readbases)
- non_indel_base_hash = {}
- non_indel_base_hash[:ref] = readbases.count('.,')
- non_indel_base_hash[:A] = readbases.count('aA')
- non_indel_base_hash[:C] = readbases.count('cC')
- non_indel_base_hash[:G] = readbases.count('gG')
- non_indel_base_hash[:T] = readbases.count('tT')
- # non_indel_base_hash[:N] = read_bases.count('nN')
- non_indel_base_hash
- end
+ def indel_non_ref_count(delimitter)
+ read_bases = self.read_bases
+ non_ref_count = read_bases.count(@indelbases)
+ indelcounts = read_bases.count(delimitter)
+ indel_bases = count_indel_bases(delimitter)
+ non_ref_count + indelcounts - indel_bases
+ end
- def indel_non_ref_count(delimitter)
- read_bases = self.read_bases
- non_ref_count = read_bases.count(@indelbases)
- indelcounts = read_bases.count(delimitter)
- indel_bases = count_indel_bases(delimitter)
- non_ref_count + indelcounts - indel_bases
end
end