lib/cheripic/pileup.rb in cheripic-1.0.0 vs lib/cheripic/pileup.rb in cheripic-1.1.0

- old
+ new

@@ -1,186 +1,177 @@ # encoding: utf-8 -require 'bio' -require 'bio-samtools' -require 'bio/db/pileup' +module Cheripic -class Pileup < Bio::DB::Pileup + # Custom error handling for Pileup class + class PileupError < CheripicError; end - attr_accessor :defaults + require 'bio-samtools' + require 'bio/db/pileup' - def initialize(string, opts={}) - super(string) - set_defaults(opts) - adj_read_bases - @indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN' - end + # An extension of Bio::DB::Pileup object to process pileup information at a given position + class Pileup < Bio::DB::Pileup - def set_defaults(opts) - @defaults = { - noise: 0.1, # noise level for read depth - ht_low: 0.2, # min allele freq for heterozygosity - ht_high: 0.9, # max allele freq for heterozygosity - min_depth: 6, # minimum coverage for variant - min_non_ref_count: 3, - ignore_reference_n: true, - min_indel_count_support: 3, - } - @defaults.merge(opts) - end + # creates a Pileup object using a pileup information as string + # @param string [String] pileup information line for a given position + def initialize(string) + super(string) + adj_read_bases + @indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN' + end - # removes mapping quality information - def adj_read_bases - # mapping quality after '^' symbol is substituted - # to avoid splitting at non indel + or - characters - # read ends marking by '$' symbol is substituted - # insertion and deletion marking by '*' symbol is substituted - self.read_bases.gsub!(/\^./, '') - self.read_bases.delete! '$' - self.read_bases.delete! '*' - # warn about reads with ambiguous codes - # if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/) - # warn "Ambiguous nucleotide\t#{self.read_bases}" - # end - end + # removes mapping quality information + def adj_read_bases + # mapping quality after '^' symbol is substituted + # to avoid splitting at non indel + or - characters + # read ends marking by '$' symbol is substituted + # insertion and deletion marking by '*' symbol is substituted + self.read_bases.gsub!(/\^./, '') + self.read_bases.delete! '$' + self.read_bases.delete! '*' + # warn about reads with ambiguous codes + # if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/) + # warn "Ambiguous nucleotide\t#{self.read_bases}" + # end + end - # count bases matching reference and non-reference - # from snp variant and make a hash of bases with counts - # for indels return the read bases information instead - def bases_hash - if self.read_bases =~ /\+/ - bases_hash = indels_to_hash('+') - elsif self.read_bases =~ /\-/ - bases_hash = indels_to_hash('-') - else - bases_hash = snp_base_hash(self.read_bases) + # count bases matching reference and non-reference + # from snp variant and make a hash of bases with counts + # for indels return the read bases information instead + def bases_hash + if self.read_bases =~ /\+/ + bases_hash = indels_to_hash('+') + elsif self.read_bases =~ /-/ + bases_hash = indels_to_hash('-') + else + bases_hash = snp_base_hash(self.read_bases) + end + # some indels will have ref base in the read and using + # sum of hash values is going to give wrong additional coverage + # from indels so including actual coverage from pileup + # bases_hash keys are :A, :C, :G, :T, :N, :ref and :indel + bases_hash end - # some indels will have ref base in the read and using - # sum of hash values is going to give wrong additional coverage - # from indels so including actual coverage from pileup - # bases_hash keys are :A, :C, :G, :T, :N, :ref, :indel and :cov - bases_hash[:cov] = self.coverage - bases_hash - end - # count bases from indels - # array of pileup bases is split at + / - - # and number after each + / - is counted - def count_indel_bases(delimiter) - array = self.read_bases.split(delimiter) - number = 0 - array.shift - array.each do |element| - # deletions in reference could contain ambiguous codes, - number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i + # count bases from indels + # array of pileup bases is split at + / - + # and number after each + / - is counted + def count_indel_bases(delimiter) + array = self.read_bases.split(delimiter) + number = 0 + array.shift + array.each do |element| + # deletions in reference could contain ambiguous codes, + number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i + end + number end - number - end - # count bases matching reference and non-reference - # and calculate ratio of non_ref allele to total bases - def non_ref_count - read_bases = self.read_bases - if read_bases =~ /\+/ - non_ref_count = indel_non_ref_count('+') - elsif read_bases =~ /\-/ - non_ref_count = indel_non_ref_count('-') - else - non_ref_count = read_bases.count('atgcATGC') + # count bases matching reference and non-reference + # and calculate ratio of non_ref allele to total bases + def non_ref_count + read_bases = self.read_bases + if read_bases =~ /\+/ + non_ref_count = indel_non_ref_count('+') + elsif read_bases =~ /-/ + non_ref_count = indel_non_ref_count('-') + else + non_ref_count = read_bases.count('atgcATGC') + end + non_ref_count end - non_ref_count - end - # check if the pileup has the parameters we are looking for - def is_var - ignore_reference_n = @defaults[:ignore_reference_n] - min_depth = @defaults[:min_depth] - min_non_ref_count = @defaults[:min_non_ref_count] + # check if the pileup has the parameters we are looking for + def is_var + ignore_reference_n = Options.ignore_reference_n + min_depth = Options.mindepth + min_non_ref_count = Options.min_non_ref_count - return false if self.ref_base == '*' - return false if ignore_reference_n and self.ref_base =~ /^[nN]$/ - return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count - false - end + return false if self.ref_base == '*' + return false if ignore_reference_n and self.ref_base =~ /^[nN]$/ + return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count + false + end - # count bases matching reference and non-reference - # and calculate ratio of non_ref allele to total bases - def non_ref_ratio - self.non_ref_count.to_f / self.coverage.to_f - end - - # calculate var zygosity for non-polyploid variants - # increased range is used for heterozygosity for RNA-seq data - def var_mode - ht_low = @defaults[:ht_low] - ht_high = @defaults[:ht_high] - mode = '' - if self.non_ref_ratio.between?(ht_low, ht_high) - mode = :het - elsif self.non_ref_ratio > ht_high - mode = :hom + # count bases matching reference and non-reference + # and calculate ratio of non_ref allele to total bases + def non_ref_ratio + self.non_ref_count.to_f / self.coverage.to_f end - mode - end - # form hash of base information, [ATGC] counts for snp - # a hash of base proportion is calculated - # base proportion hash below a selected depth is empty - # base proportion below or equal to a noise factor are discarded - def var_base_frac - hash = self.bases_hash - snp_hash = {} - coverage = hash[:cov] - return snp_hash if coverage < @defaults[:min_depth] - # calculate proportion of each base in coverage - hash.each_key do | base | - next if base == :cov - freq = hash[base].to_f/coverage.to_f - next if freq <= @defaults[:noise] - snp_hash[base] = freq + # calculate var zygosity for non-polyploid variants + # increased range is used for heterozygosity for RNA-seq data + # def var_mode + # ht_low = @defaults[:ht_low] + # ht_high = @defaults[:ht_high] + # mode = '' + # if self.non_ref_ratio.between?(ht_low, ht_high) + # mode = :het + # elsif self.non_ref_ratio > ht_high + # mode = :hom + # end + # mode + # end + + # form hash of base information, [ATGC] counts for snp + # a hash of base proportion is calculated + # base proportion hash below a selected depth is empty + # base proportion below or equal to a noise factor are discarded + def var_base_frac + hash = self.bases_hash + snp_hash = {} + coverage = self.coverage + return snp_hash if coverage < Options.mindepth + # calculate proportion of each base in coverage + hash.each_key do | base | + freq = hash[base].to_f/coverage.to_f + next if freq <= Options.noise + snp_hash[base] = freq + end + snp_hash end - snp_hash - end - private + private - # count number of indels and number non-indel base - # and return a hash with bases and indel counts - def indels_to_hash(delimiter) - non_indel_bases = String.new - array = self.read_bases.split(delimiter) - non_indel_bases << array.shift - array.each do |element| - # get number of nucleotides inserted or deleted - number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i - # capture remaining nucleotides - non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '') + # count number of indels and number non-indel base + # and return a hash with bases and indel counts + def indels_to_hash(delimiter) + non_indel_bases = String.new + array = self.read_bases.split(delimiter) + non_indel_bases << array.shift + array.each do |element| + # get number of nucleotides inserted or deleted + number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i + # capture remaining nucleotides + non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '') + end + bases_hash = snp_base_hash(non_indel_bases) + # check at least three reads are supporting indel + indel_count = self.read_bases.count(delimiter) + if indel_count >= Options.min_indel_count_support + bases_hash[:indel] = indel_count + end + bases_hash end - bases_hash = snp_base_hash(non_indel_bases) - # check at least three reads are supporting indel - indel_count = self.read_bases.count(delimiter) - if indel_count >= @defaults[:min_indel_count_support] - bases_hash[:indel] = indel_count + + def snp_base_hash(readbases) + non_indel_base_hash = {} + non_indel_base_hash[:ref] = readbases.count('.,') + non_indel_base_hash[:A] = readbases.count('aA') + non_indel_base_hash[:C] = readbases.count('cC') + non_indel_base_hash[:G] = readbases.count('gG') + non_indel_base_hash[:T] = readbases.count('tT') + # non_indel_base_hash[:N] = read_bases.count('nN') + non_indel_base_hash end - bases_hash - end - def snp_base_hash(readbases) - non_indel_base_hash = {} - non_indel_base_hash[:ref] = readbases.count('.,') - non_indel_base_hash[:A] = readbases.count('aA') - non_indel_base_hash[:C] = readbases.count('cC') - non_indel_base_hash[:G] = readbases.count('gG') - non_indel_base_hash[:T] = readbases.count('tT') - # non_indel_base_hash[:N] = read_bases.count('nN') - non_indel_base_hash - end + def indel_non_ref_count(delimitter) + read_bases = self.read_bases + non_ref_count = read_bases.count(@indelbases) + indelcounts = read_bases.count(delimitter) + indel_bases = count_indel_bases(delimitter) + non_ref_count + indelcounts - indel_bases + end - def indel_non_ref_count(delimitter) - read_bases = self.read_bases - non_ref_count = read_bases.count(@indelbases) - indelcounts = read_bases.count(delimitter) - indel_bases = count_indel_bases(delimitter) - non_ref_count + indelcounts - indel_bases end end