lib/cheripic/variants.rb in cheripic-1.2.5 vs lib/cheripic/variants.rb in cheripic-1.2.6
- old
+ new
@@ -2,40 +2,11 @@
require 'bio'
require 'forwardable'
module Cheripic
- require 'bio-samtools'
- require 'bio/db/sam'
- require 'open3'
- # An extension of Bio::DB::Sam object to modify depth method
- class Bio::DB::Sam
-
- # A method to retrieve depth information from bam object
- # @param opts [Hash] a hash of following input options
- # b [File] list of positions or regions in BED format
- # l [INT] minQLen
- # q [INT] base quality threshold
- # Q [INT] mapping quality threshold
- # r [chr:from-to] region
- # @returns a block with each line reporting sequence_name, position and depth
- def depth(opts={})
- command = form_opt_string(self.samtools, 'depth', opts)
- # capture returns string output, so careful not to give whole genome or big contigs for depth analysis
- stdout, stderr, status = Open3.capture3(command)
- unless status.success?
- logger.error "resulted in exit code #{status.exitstatus} using #{command}"
- logger.error "stderr output is: #{stderr}"
- raise CheripicError
- end
- # return stdout
- stdout
- end
-
- end
-
# Custom error handling for Variants class
class VariantsError < CheripicError; end
# A Variants object for each analysis pipeline that stores
# assembly details and extracts pileups for each contig
@@ -108,48 +79,36 @@
infile = @params[input]
if infile != ''
logger.info "processing #{input} file"
if @params.input_format == 'pileup'
extract_pileup(infile, input)
+ elsif @params.input_format == 'vcf'
+ extract_vcfs(infile, input)
else
extract_bam_pileup(infile, input)
end
end
end
@pileups_analyzed = true
end
- # Bam object is read and each contig mean and std deviation of depth calculated
- # @param bamobject [Bio::DB::Sam]
- # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
- def set_max_depth(bamobject, bamfile)
- logger.info "processing #{bamfile} file for depth"
- all_depths = []
- bq = Options.base_quality
- mq = Options.mapping_quality
- @assembly.each_key do | id |
- contig_obj = @assembly[id]
- len = contig_obj.length
- data = bamobject.depth(:r => "#{id}", :Q => bq, :q => mq)
- depths = []
- data.split("\n").each do |line|
- info = line.split("\t")
- depths << info[2].to_i
+ # Input vcf file is read and positions are selected that pass the thresholds
+ # @param vcffile [String] path to the pileup file to read
+ # @param sym [Symbol] Symbol of the pileup file used to write selected variants
+ # pileup information to respective ContigPileups object
+ def extract_vcfs(vcffile, sym)
+ # read vcf file and process each variant
+ File.foreach(vcffile) do |line|
+ next if line =~ /^#/
+ v = Bio::DB::Vcf.new(line)
+ unless v.alt == '.'
+ pileup_string = Vcf.to_pileup(v)
+ pileup = Pileup.new(pileup_string)
+ store_pileup_info(pileup, sym)
end
- variance = 0
- mean_depth = depths.reduce(0, :+) / len.to_f
- depths.each do |value|
- variance += (value.to_f - mean_depth)**2
- end
- all_depths << mean_depth
- contig_obj.sd_depth = Math.sqrt(variance)
- contig_obj.mean_depth = mean_depth
end
- # setting max depth as 3 times the average depth
- mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
- Options.maxdepth = Options.max_d_multiple * mean_coverage
end
# Input pileup file is read and positions are selected that pass the thresholds
# @param pileupfile [String] path to the pileup file to read
# @param sym [Symbol] Symbol of the pileup file used to write selected variants
@@ -157,12 +116,11 @@
def extract_pileup(pileupfile, sym)
# read mpileup file and process each variant
File.foreach(pileupfile) do |line|
pileup = Pileup.new(line)
if pileup.is_var
- contig_obj = @pileups[pileup.ref_name]
- contig_obj.send(sym).store(pileup.pos, pileup)
+ store_pileup_info(pileup, sym)
end
end
end
# Input bamfile is read and selected positions pileups are stored
@@ -173,45 +131,94 @@
bq = Options.base_quality
mq = Options.mapping_quality
bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
bamobject.index unless bamobject.indexed?
+ # or calculate from bamfile
+ set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
# check if user has set max depth or set to zero to ignore
max_d = Options.maxdepth
- # or calculate from bamfile
- if Options.max_d_multiple > 0
- set_max_depth(bamobject, bamfile)
- max_d = Options.maxdepth
- logger.info "max depth used for #{sym} file\t#{max_d}"
- end
+ logger.info "max depth used for #{sym} file\t#{max_d}"
@vcf_hash.each_key do | id |
positions = @vcf_hash[id][:het].keys
positions << @vcf_hash[id][:hom].keys
positions.flatten!
next if positions.empty?
- contig_obj = @pileups[id]
positions.each do | pos |
command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
- stdout, stderr, status = Open3.capture3(command)
- unless status.success?
- logger.error "resulted in exit code #{status.exitstatus} using #{command}"
- logger.error "stderr output is: #{stderr}"
- raise CheripicError
- end
- stdout.chomp!
+ stdout = capture_command(command)
if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
logger.info "pileup data empty for\t#{id}\t#{pos}"
else
pileup = Pileup.new(stdout)
- unless max_d == 0 or pileup.coverage <= max_d
- logger.info "pileup coverage is higher than max\t#{pileup.to_s}"
- next
- end
- contig_obj.send(sym).store(pos, pileup)
+ store_pileup_info(pileup, sym)
end
end
end
+ end
+
+ # Bam object is read and each contig mean and std deviation of depth calculated
+ # @param bamobject [Bio::DB::Sam]
+ # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
+ def set_max_depth(bamobject, bamfile)
+ logger.info "processing #{bamfile} file for depth"
+ all_depths = []
+ bq = Options.base_quality
+ mq = Options.mapping_quality
+ @assembly.each_key do | id |
+ contig_obj = @assembly[id]
+ len = contig_obj.length
+ command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
+ data = capture_command(command)
+ if data == ''
+ logger.info "depth data empty for\t#{id}"
+ next
+ end
+ depths = []
+ data.split("\n").each do |line|
+ info = line.split("\t")
+ depths << info[2].to_i
+ end
+ variance = 0
+ mean_depth = depths.reduce(0, :+) / len.to_f
+ depths.each do |value|
+ variance += (value.to_f - mean_depth)**2
+ end
+ all_depths << mean_depth
+ contig_obj.sd_depth = Math.sqrt(variance)
+ contig_obj.mean_depth = mean_depth
+ end
+ # setting max depth as 3 times the average depth
+ mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
+ Options.maxdepth = Options.max_d_multiple * mean_coverage
+ end
+
+ def capture_command(command)
+ stdout, stderr, status = Open3.capture3(command)
+ unless status.success?
+ logger.error "resulted in exit code #{status.exitstatus} using #{command}"
+ logger.error "stderr output is: #{stderr}"
+ raise CheripicError
+ end
+ stdout.chomp
+ end
+
+ # stores pileup information provided to respective contig_pileup object using sym input
+ # @param pileup [Pileup] Pileup objects
+ # @param sym [Symbol] Symbol of the input file used to write selected variants
+ # pileup information stored to respective ContigPileups object
+ def store_pileup_info(pileup, sym)
+ # discarding variants with higher than max depth only for mut_bulk
+ if sym == :mut_bulk
+ unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
+ logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
+ return nil
+ end
+ end
+ contig_obj = @pileups[pileup.ref_name]
+ contig_obj.send(sym).store(pileup.pos, pileup)
+ nil
end
# Once pileup files are analysed and variants are extracted from each bulk;
# bulks are compared to identify and isolate variants for downstream analysis.
# If polyploidy set to trye and mut_parent and bg_parent bulks are provided