lib/cheripic/variants.rb in cheripic-1.2.5 vs lib/cheripic/variants.rb in cheripic-1.2.6

- old
+ new

@@ -2,40 +2,11 @@ require 'bio' require 'forwardable' module Cheripic - require 'bio-samtools' - require 'bio/db/sam' - require 'open3' - # An extension of Bio::DB::Sam object to modify depth method - class Bio::DB::Sam - - # A method to retrieve depth information from bam object - # @param opts [Hash] a hash of following input options - # b [File] list of positions or regions in BED format - # l [INT] minQLen - # q [INT] base quality threshold - # Q [INT] mapping quality threshold - # r [chr:from-to] region - # @returns a block with each line reporting sequence_name, position and depth - def depth(opts={}) - command = form_opt_string(self.samtools, 'depth', opts) - # capture returns string output, so careful not to give whole genome or big contigs for depth analysis - stdout, stderr, status = Open3.capture3(command) - unless status.success? - logger.error "resulted in exit code #{status.exitstatus} using #{command}" - logger.error "stderr output is: #{stderr}" - raise CheripicError - end - # return stdout - stdout - end - - end - # Custom error handling for Variants class class VariantsError < CheripicError; end # A Variants object for each analysis pipeline that stores # assembly details and extracts pileups for each contig @@ -108,48 +79,36 @@ infile = @params[input] if infile != '' logger.info "processing #{input} file" if @params.input_format == 'pileup' extract_pileup(infile, input) + elsif @params.input_format == 'vcf' + extract_vcfs(infile, input) else extract_bam_pileup(infile, input) end end end @pileups_analyzed = true end - # Bam object is read and each contig mean and std deviation of depth calculated - # @param bamobject [Bio::DB::Sam] - # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis - def set_max_depth(bamobject, bamfile) - logger.info "processing #{bamfile} file for depth" - all_depths = [] - bq = Options.base_quality - mq = Options.mapping_quality - @assembly.each_key do | id | - contig_obj = @assembly[id] - len = contig_obj.length - data = bamobject.depth(:r => "#{id}", :Q => bq, :q => mq) - depths = [] - data.split("\n").each do |line| - info = line.split("\t") - depths << info[2].to_i + # Input vcf file is read and positions are selected that pass the thresholds + # @param vcffile [String] path to the pileup file to read + # @param sym [Symbol] Symbol of the pileup file used to write selected variants + # pileup information to respective ContigPileups object + def extract_vcfs(vcffile, sym) + # read vcf file and process each variant + File.foreach(vcffile) do |line| + next if line =~ /^#/ + v = Bio::DB::Vcf.new(line) + unless v.alt == '.' + pileup_string = Vcf.to_pileup(v) + pileup = Pileup.new(pileup_string) + store_pileup_info(pileup, sym) end - variance = 0 - mean_depth = depths.reduce(0, :+) / len.to_f - depths.each do |value| - variance += (value.to_f - mean_depth)**2 - end - all_depths << mean_depth - contig_obj.sd_depth = Math.sqrt(variance) - contig_obj.mean_depth = mean_depth end - # setting max depth as 3 times the average depth - mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f - Options.maxdepth = Options.max_d_multiple * mean_coverage end # Input pileup file is read and positions are selected that pass the thresholds # @param pileupfile [String] path to the pileup file to read # @param sym [Symbol] Symbol of the pileup file used to write selected variants @@ -157,12 +116,11 @@ def extract_pileup(pileupfile, sym) # read mpileup file and process each variant File.foreach(pileupfile) do |line| pileup = Pileup.new(line) if pileup.is_var - contig_obj = @pileups[pileup.ref_name] - contig_obj.send(sym).store(pileup.pos, pileup) + store_pileup_info(pileup, sym) end end end # Input bamfile is read and selected positions pileups are stored @@ -173,45 +131,94 @@ bq = Options.base_quality mq = Options.mapping_quality bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly) bamobject.index unless bamobject.indexed? + # or calculate from bamfile + set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk # check if user has set max depth or set to zero to ignore max_d = Options.maxdepth - # or calculate from bamfile - if Options.max_d_multiple > 0 - set_max_depth(bamobject, bamfile) - max_d = Options.maxdepth - logger.info "max depth used for #{sym} file\t#{max_d}" - end + logger.info "max depth used for #{sym} file\t#{max_d}" @vcf_hash.each_key do | id | positions = @vcf_hash[id][:het].keys positions << @vcf_hash[id][:hom].keys positions.flatten! next if positions.empty? - contig_obj = @pileups[id] positions.each do | pos | command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}" - stdout, stderr, status = Open3.capture3(command) - unless status.success? - logger.error "resulted in exit code #{status.exitstatus} using #{command}" - logger.error "stderr output is: #{stderr}" - raise CheripicError - end - stdout.chomp! + stdout = capture_command(command) if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/ logger.info "pileup data empty for\t#{id}\t#{pos}" else pileup = Pileup.new(stdout) - unless max_d == 0 or pileup.coverage <= max_d - logger.info "pileup coverage is higher than max\t#{pileup.to_s}" - next - end - contig_obj.send(sym).store(pos, pileup) + store_pileup_info(pileup, sym) end end end + end + + # Bam object is read and each contig mean and std deviation of depth calculated + # @param bamobject [Bio::DB::Sam] + # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis + def set_max_depth(bamobject, bamfile) + logger.info "processing #{bamfile} file for depth" + all_depths = [] + bq = Options.base_quality + mq = Options.mapping_quality + @assembly.each_key do | id | + contig_obj = @assembly[id] + len = contig_obj.length + command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}" + data = capture_command(command) + if data == '' + logger.info "depth data empty for\t#{id}" + next + end + depths = [] + data.split("\n").each do |line| + info = line.split("\t") + depths << info[2].to_i + end + variance = 0 + mean_depth = depths.reduce(0, :+) / len.to_f + depths.each do |value| + variance += (value.to_f - mean_depth)**2 + end + all_depths << mean_depth + contig_obj.sd_depth = Math.sqrt(variance) + contig_obj.mean_depth = mean_depth + end + # setting max depth as 3 times the average depth + mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f + Options.maxdepth = Options.max_d_multiple * mean_coverage + end + + def capture_command(command) + stdout, stderr, status = Open3.capture3(command) + unless status.success? + logger.error "resulted in exit code #{status.exitstatus} using #{command}" + logger.error "stderr output is: #{stderr}" + raise CheripicError + end + stdout.chomp + end + + # stores pileup information provided to respective contig_pileup object using sym input + # @param pileup [Pileup] Pileup objects + # @param sym [Symbol] Symbol of the input file used to write selected variants + # pileup information stored to respective ContigPileups object + def store_pileup_info(pileup, sym) + # discarding variants with higher than max depth only for mut_bulk + if sym == :mut_bulk + unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth + logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}" + return nil + end + end + contig_obj = @pileups[pileup.ref_name] + contig_obj.send(sym).store(pileup.pos, pileup) + nil end # Once pileup files are analysed and variants are extracted from each bulk; # bulks are compared to identify and isolate variants for downstream analysis. # If polyploidy set to trye and mut_parent and bg_parent bulks are provided