lib/cheripic/variants.rb in cheripic-1.2.0 vs lib/cheripic/variants.rb in cheripic-1.2.5

- old
+ new

@@ -2,10 +2,40 @@ require 'bio' require 'forwardable' module Cheripic + require 'bio-samtools' + require 'bio/db/sam' + require 'open3' + + # An extension of Bio::DB::Sam object to modify depth method + class Bio::DB::Sam + + # A method to retrieve depth information from bam object + # @param opts [Hash] a hash of following input options + # b [File] list of positions or regions in BED format + # l [INT] minQLen + # q [INT] base quality threshold + # Q [INT] mapping quality threshold + # r [chr:from-to] region + # @returns a block with each line reporting sequence_name, position and depth + def depth(opts={}) + command = form_opt_string(self.samtools, 'depth', opts) + # capture returns string output, so careful not to give whole genome or big contigs for depth analysis + stdout, stderr, status = Open3.capture3(command) + unless status.success? + logger.error "resulted in exit code #{status.exitstatus} using #{command}" + logger.error "stderr output is: #{stderr}" + raise CheripicError + end + # return stdout + stdout + end + + end + # Custom error handling for Variants class class VariantsError < CheripicError; end # A Variants object for each analysis pipeline that stores # assembly details and extracts pileups for each contig @@ -25,14 +55,14 @@ class Variants include Enumerable extend Forwardable def_delegators :@assembly, :each, :each_key, :each_value, :size, :length, :[] - attr_reader :assembly, :pileups, :hmes_frags, :bfr_frags, :pileups_analyzed + attr_reader :assembly, :pileups, :pileups_analyzed # creates a Variants object using user input files - # @param options [Hash] a hash of required input files as keys and file paths as values + # @param options [OpenStruct] a hash of required input files as keys and file paths as values def initialize(options) @params = options @assembly = {} @pileups = {} Bio::FastaFormat.open(@params.assembly).each do |entry| @@ -48,29 +78,80 @@ end @assembly[contig.id] = contig @pileups[contig.id] = ContigPileups.new(contig.id) end @pileups_analyzed = false + unless @params.repeats_file == '' + store_repeat_regions + end end + # reads repeat masker output file and stores masked regions to ignore variants in thos regions + def store_repeat_regions + File.foreach(@params.repeats_file) do |line| + line.strip! + next if line =~ /^SW/ or line =~ /^score/ or line == '' + info = line.split("\s") + pileups_obj = @pileups[info[4]] + index = pileups_obj.masked_regions.length + pileups_obj.masked_regions[index + 1][:begin] = info[5].to_i + pileups_obj.masked_regions[index + 1][:end] = info[6].to_i + end + end + # Reads and store pileup data for each of input bulk and parents pileup files # And sets pileups_analyzed to true that pileups files are processed def analyse_pileups - @bg_bulk = @params.bg_bulk - @mut_parent = @params.mut_parent - @bg_parent = @params.bg_parent - + if @params.input_format == 'bam' + @vcf_hash = Vcf.filtering(@params.mut_bulk_vcf, @params.bg_bulk_vcf) + end %i{mut_bulk bg_bulk mut_parent bg_parent}.each do | input | infile = @params[input] if infile != '' - extract_pileup(infile, input) + logger.info "processing #{input} file" + if @params.input_format == 'pileup' + extract_pileup(infile, input) + else + extract_bam_pileup(infile, input) + end end end @pileups_analyzed = true end + # Bam object is read and each contig mean and std deviation of depth calculated + # @param bamobject [Bio::DB::Sam] + # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis + def set_max_depth(bamobject, bamfile) + logger.info "processing #{bamfile} file for depth" + all_depths = [] + bq = Options.base_quality + mq = Options.mapping_quality + @assembly.each_key do | id | + contig_obj = @assembly[id] + len = contig_obj.length + data = bamobject.depth(:r => "#{id}", :Q => bq, :q => mq) + depths = [] + data.split("\n").each do |line| + info = line.split("\t") + depths << info[2].to_i + end + variance = 0 + mean_depth = depths.reduce(0, :+) / len.to_f + depths.each do |value| + variance += (value.to_f - mean_depth)**2 + end + all_depths << mean_depth + contig_obj.sd_depth = Math.sqrt(variance) + contig_obj.mean_depth = mean_depth + end + # setting max depth as 3 times the average depth + mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f + Options.maxdepth = Options.max_d_multiple * mean_coverage + end + # Input pileup file is read and positions are selected that pass the thresholds # @param pileupfile [String] path to the pileup file to read # @param sym [Symbol] Symbol of the pileup file used to write selected variants # pileup information to respective ContigPileups object def extract_pileup(pileupfile, sym) @@ -82,10 +163,58 @@ contig_obj.send(sym).store(pileup.pos, pileup) end end end + # Input bamfile is read and selected positions pileups are stored + # @param bamfile [String] path to the bam file to read + # @param sym [Symbol] Symbol of the bam file used to write selected variants + # pileup information to respective ContigPileups object + def extract_bam_pileup(bamfile, sym) + bq = Options.base_quality + mq = Options.mapping_quality + bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly) + bamobject.index unless bamobject.indexed? + + # check if user has set max depth or set to zero to ignore + max_d = Options.maxdepth + # or calculate from bamfile + if Options.max_d_multiple > 0 + set_max_depth(bamobject, bamfile) + max_d = Options.maxdepth + logger.info "max depth used for #{sym} file\t#{max_d}" + end + + @vcf_hash.each_key do | id | + positions = @vcf_hash[id][:het].keys + positions << @vcf_hash[id][:hom].keys + positions.flatten! + next if positions.empty? + contig_obj = @pileups[id] + positions.each do | pos | + command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}" + stdout, stderr, status = Open3.capture3(command) + unless status.success? + logger.error "resulted in exit code #{status.exitstatus} using #{command}" + logger.error "stderr output is: #{stderr}" + raise CheripicError + end + stdout.chomp! + if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/ + logger.info "pileup data empty for\t#{id}\t#{pos}" + else + pileup = Pileup.new(stdout) + unless max_d == 0 or pileup.coverage <= max_d + logger.info "pileup coverage is higher than max\t#{pileup.to_s}" + next + end + contig_obj.send(sym).store(pos, pileup) + end + end + end + end + # Once pileup files are analysed and variants are extracted from each bulk; # bulks are compared to identify and isolate variants for downstream analysis. # If polyploidy set to trye and mut_parent and bg_parent bulks are provided # hemisnps in parents are extracted for bulk frequency ratio analysis def compare_pileups @@ -93,11 +222,13 @@ self.analyse_pileups end @assembly.each_key do | id | contig = @assembly[id] # extract parental hemi snps for polyploids before bulks are compared - if @mut_parent != '' or @bg_parent != '' - @pileups[id].hemisnps_in_parent + if Options.polyploidy + if @params.mut_parent != '' or @params.bg_parent != '' + @pileups[id].hemisnps_in_parent + end end contig.hm_pos, contig.ht_pos, contig.hemi_pos = @pileups[id].bulks_compared end end