lib/cheripic/variants.rb in cheripic-1.2.0 vs lib/cheripic/variants.rb in cheripic-1.2.5
- old
+ new
@@ -2,10 +2,40 @@
require 'bio'
require 'forwardable'
module Cheripic
+ require 'bio-samtools'
+ require 'bio/db/sam'
+ require 'open3'
+
+ # An extension of Bio::DB::Sam object to modify depth method
+ class Bio::DB::Sam
+
+ # A method to retrieve depth information from bam object
+ # @param opts [Hash] a hash of following input options
+ # b [File] list of positions or regions in BED format
+ # l [INT] minQLen
+ # q [INT] base quality threshold
+ # Q [INT] mapping quality threshold
+ # r [chr:from-to] region
+ # @returns a block with each line reporting sequence_name, position and depth
+ def depth(opts={})
+ command = form_opt_string(self.samtools, 'depth', opts)
+ # capture returns string output, so careful not to give whole genome or big contigs for depth analysis
+ stdout, stderr, status = Open3.capture3(command)
+ unless status.success?
+ logger.error "resulted in exit code #{status.exitstatus} using #{command}"
+ logger.error "stderr output is: #{stderr}"
+ raise CheripicError
+ end
+ # return stdout
+ stdout
+ end
+
+ end
+
# Custom error handling for Variants class
class VariantsError < CheripicError; end
# A Variants object for each analysis pipeline that stores
# assembly details and extracts pileups for each contig
@@ -25,14 +55,14 @@
class Variants
include Enumerable
extend Forwardable
def_delegators :@assembly, :each, :each_key, :each_value, :size, :length, :[]
- attr_reader :assembly, :pileups, :hmes_frags, :bfr_frags, :pileups_analyzed
+ attr_reader :assembly, :pileups, :pileups_analyzed
# creates a Variants object using user input files
- # @param options [Hash] a hash of required input files as keys and file paths as values
+ # @param options [OpenStruct] a hash of required input files as keys and file paths as values
def initialize(options)
@params = options
@assembly = {}
@pileups = {}
Bio::FastaFormat.open(@params.assembly).each do |entry|
@@ -48,29 +78,80 @@
end
@assembly[contig.id] = contig
@pileups[contig.id] = ContigPileups.new(contig.id)
end
@pileups_analyzed = false
+ unless @params.repeats_file == ''
+ store_repeat_regions
+ end
end
+ # reads repeat masker output file and stores masked regions to ignore variants in thos regions
+ def store_repeat_regions
+ File.foreach(@params.repeats_file) do |line|
+ line.strip!
+ next if line =~ /^SW/ or line =~ /^score/ or line == ''
+ info = line.split("\s")
+ pileups_obj = @pileups[info[4]]
+ index = pileups_obj.masked_regions.length
+ pileups_obj.masked_regions[index + 1][:begin] = info[5].to_i
+ pileups_obj.masked_regions[index + 1][:end] = info[6].to_i
+ end
+ end
+
# Reads and store pileup data for each of input bulk and parents pileup files
# And sets pileups_analyzed to true that pileups files are processed
def analyse_pileups
- @bg_bulk = @params.bg_bulk
- @mut_parent = @params.mut_parent
- @bg_parent = @params.bg_parent
-
+ if @params.input_format == 'bam'
+ @vcf_hash = Vcf.filtering(@params.mut_bulk_vcf, @params.bg_bulk_vcf)
+ end
%i{mut_bulk bg_bulk mut_parent bg_parent}.each do | input |
infile = @params[input]
if infile != ''
- extract_pileup(infile, input)
+ logger.info "processing #{input} file"
+ if @params.input_format == 'pileup'
+ extract_pileup(infile, input)
+ else
+ extract_bam_pileup(infile, input)
+ end
end
end
@pileups_analyzed = true
end
+ # Bam object is read and each contig mean and std deviation of depth calculated
+ # @param bamobject [Bio::DB::Sam]
+ # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
+ def set_max_depth(bamobject, bamfile)
+ logger.info "processing #{bamfile} file for depth"
+ all_depths = []
+ bq = Options.base_quality
+ mq = Options.mapping_quality
+ @assembly.each_key do | id |
+ contig_obj = @assembly[id]
+ len = contig_obj.length
+ data = bamobject.depth(:r => "#{id}", :Q => bq, :q => mq)
+ depths = []
+ data.split("\n").each do |line|
+ info = line.split("\t")
+ depths << info[2].to_i
+ end
+ variance = 0
+ mean_depth = depths.reduce(0, :+) / len.to_f
+ depths.each do |value|
+ variance += (value.to_f - mean_depth)**2
+ end
+ all_depths << mean_depth
+ contig_obj.sd_depth = Math.sqrt(variance)
+ contig_obj.mean_depth = mean_depth
+ end
+ # setting max depth as 3 times the average depth
+ mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
+ Options.maxdepth = Options.max_d_multiple * mean_coverage
+ end
+
# Input pileup file is read and positions are selected that pass the thresholds
# @param pileupfile [String] path to the pileup file to read
# @param sym [Symbol] Symbol of the pileup file used to write selected variants
# pileup information to respective ContigPileups object
def extract_pileup(pileupfile, sym)
@@ -82,10 +163,58 @@
contig_obj.send(sym).store(pileup.pos, pileup)
end
end
end
+ # Input bamfile is read and selected positions pileups are stored
+ # @param bamfile [String] path to the bam file to read
+ # @param sym [Symbol] Symbol of the bam file used to write selected variants
+ # pileup information to respective ContigPileups object
+ def extract_bam_pileup(bamfile, sym)
+ bq = Options.base_quality
+ mq = Options.mapping_quality
+ bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
+ bamobject.index unless bamobject.indexed?
+
+ # check if user has set max depth or set to zero to ignore
+ max_d = Options.maxdepth
+ # or calculate from bamfile
+ if Options.max_d_multiple > 0
+ set_max_depth(bamobject, bamfile)
+ max_d = Options.maxdepth
+ logger.info "max depth used for #{sym} file\t#{max_d}"
+ end
+
+ @vcf_hash.each_key do | id |
+ positions = @vcf_hash[id][:het].keys
+ positions << @vcf_hash[id][:hom].keys
+ positions.flatten!
+ next if positions.empty?
+ contig_obj = @pileups[id]
+ positions.each do | pos |
+ command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
+ stdout, stderr, status = Open3.capture3(command)
+ unless status.success?
+ logger.error "resulted in exit code #{status.exitstatus} using #{command}"
+ logger.error "stderr output is: #{stderr}"
+ raise CheripicError
+ end
+ stdout.chomp!
+ if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
+ logger.info "pileup data empty for\t#{id}\t#{pos}"
+ else
+ pileup = Pileup.new(stdout)
+ unless max_d == 0 or pileup.coverage <= max_d
+ logger.info "pileup coverage is higher than max\t#{pileup.to_s}"
+ next
+ end
+ contig_obj.send(sym).store(pos, pileup)
+ end
+ end
+ end
+ end
+
# Once pileup files are analysed and variants are extracted from each bulk;
# bulks are compared to identify and isolate variants for downstream analysis.
# If polyploidy set to trye and mut_parent and bg_parent bulks are provided
# hemisnps in parents are extracted for bulk frequency ratio analysis
def compare_pileups
@@ -93,11 +222,13 @@
self.analyse_pileups
end
@assembly.each_key do | id |
contig = @assembly[id]
# extract parental hemi snps for polyploids before bulks are compared
- if @mut_parent != '' or @bg_parent != ''
- @pileups[id].hemisnps_in_parent
+ if Options.polyploidy
+ if @params.mut_parent != '' or @params.bg_parent != ''
+ @pileups[id].hemisnps_in_parent
+ end
end
contig.hm_pos, contig.ht_pos, contig.hemi_pos = @pileups[id].bulks_compared
end
end