require 'bio' require 'parallel' require 'bio-samtools' require 'ruby-prof' module NGSCI # A calculator calculates the sequencing complexity index. # # @!attribute [r] sci class Calculator attr_reader :sci, :block_size, :buffer, :chroms # A new calculator to compute the sequencing complexity index given # a loaded Bio::DB::Sam object and optional thread argument. # # @param bam [Bio::DB::Sam] Opened bam file with loaded reference. # @param threads [Int] The number of threads used to compute NGSCI. # @param strand [String] One of [FR RF F] or nil for strandedness. def initialize(bam, reference, strand: nil, threads: 1) @block_size = 1600 @results = nil @reference=reference @bam = Bio::DB::Sam.new(:bam=>bam,:fasta=>reference) unless @bam.indexed? @bam.index end @bam.open @threads = threads @chroms = reference_sequences(reference) read_length if strand unless %w(FR RF F).include?(strand) raise NGSCI::NGSCIError.new "Strand specific option #{opts.strand} is invalid." + " It must be one of: [FR, RF, F]" end @strand = strand.downcase else @strand = nil end end # Calculation of the sequencing complexity index # def run(runtime: false) RubyProf.start if runtime # Convert each aligned read to Read clas chroms={} @chroms.each do |chrom,size| chroms[chrom] = @strand ? {"+"=>[],"-"=>[]} : {nil=>[]} disk_accesses = (size/@block_size.to_f).ceil =begin # N O N - P A R A L L E L i=0 while i < disk_accesses readblock(chrom,i).each do |key,val| chroms[chrom][key] += val end i+=1 end =end data = Parallel.map((0...disk_accesses).to_a,:in_processes => @threads) do |i| readblock(chrom,i) end chroms[chrom].keys.each do |key| chroms[chrom][key] = data.map{|x| x[key]}.flatten(1) end end # Printing runtime information for optimization if runtime runtime=RubyProf.stop printer=RubyProf::FlatPrinter.new(runtime) printer.print(STDOUT) end @results = chroms end # Reads a single block from the disk and calculates the NGSCI # # @param chrom [String] The chromosome from the bam file # @param i [Integer] The number of blocks that have been read # @return localNGSCI [Hash] # * :+ (Array[Integer]) The NGSCI for the + strand # * :- (Array[Integer]) The NGSCI for the - strand def readblock(chrom,i) reads=[] results = @strand ? {"+" => [],"-" => []}: {nil => []} start = [0,(i * @block_size) - @buffer].max stop = [(i + 1) * @block_size, self.chroms[chrom]].min @bam.fetch(chrom,start,stop) {|read| reads << convert(read)} start += @buffer unless start == 0 reads.compact! reads.sort_by!(&:start) unless reads.empty? x=0 bases = (start...stop).to_a block = stop - start while x < block b = bases[x] aligned = reads.select{|r| r.start <= b && r.stop - 1 >= b}.group_by &:strand results.keys.each do|key| results[key] << [b,*sci(aligned[key] || [])] end x+=1 end return results end # Calculates sequencing complexity index for a single base # # @param reads [Array] A group of reads aligned to a single base. # @return sci [Float] def sci(reads) numreads=reads.size # Groups reads by start site # selects the largest read length from the groups reads = reads.group_by(&:start).map{|k,v| v.max{|x,y| (x.stop-x.start).abs <=> (y.stop-y.start).abs}} o = summed_overlaps(reads) uniquereads = reads.size return [numreads,uniquereads,(@buffer*o.to_f/@denom).round(4),(300*uniquereads*o/(2*@denom)).round(4)] end # Calculates summed overlap between a group of reads # # @param reads [Array] Array of reads # @return avg_overlap [Integer] Summed overlap between reads def summed_overlaps(reads) numreads = reads.size sum=0 unless numreads == 1 i = 0 while i < numreads r1 = reads[i] # for each of n reads sum+=reads. reject{|r| r == r1}. # select the n-1 other reads map{|r| overlap(r,r1)}. # calculate their overlap to r1 reduce(:+) i+=1 end end return sum end # Calculation of the overlap between two reads # # @param read1 [NGSCI::Read] First read to be compared # @param read2 [NGSCI::Read] First read to be compared # @return overlap_length [Integer] Length of overlap def overlap(read1,read2) if read1.start > read2.start if read1.stop < read2.stop # Read 1 is inside read 2 read1.stop - read1.start else # Normal overlap read2.stop - read1.start end else if read1.stop > read2.stop # Read 2 is inside read 1 read2.stop - read2.start else # Normal overlap read1.stop - read2.start end end end # Loads the read length from a bam file into the @buffer variable # def read_length buffer=0 stats=@bam.index_stats.select {|k,v| k != "*" && v[:mapped_reads] > 0} if stats.empty? raise NGSCIIOError.new "BAM file is empty! Check samtools idxstats." else i=0 lengths=[] test = @block_size while i <= test @bam.view do |read| lengths << read.seq.size i +=1 end if i == test && lengths.size < 100 test += @block_size end end @buffer = lengths.max @denom = @buffer**2 * (@buffer - 1)**2 end end # Converts strand specific BAM read into a sequence object format # Uses the @strand instance variable to determine the strand of conversion # # @param read [Bio::DB::Alignment] Read to be converted. # @return read [NGSCI::Read] Converted Read object def convert(read) unless read.query_unmapped if @strand return self.send(@strand.to_sym,read) else return newread(read) end end return nil end # Converts strand specific BAM read into a sequence object format # Assumes paired-end strand-specific sequencing with "fr" chemistry # # @param read [Bio::DB::Alignment] Read to be converted. # @return read [NGSCI::Read] Converted Read object def fr(read) if read.first_in_pair read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-") else read.query_strand ? newread(read,strand:"-") : newread(read,strand:"+") end end # Converts strand specific BAM read into a sequence object format # Assumes paired-end strand-specific sequencing with "rf" chemistry # # @param read [Bio::DB::Alignment] Read to be converted. # @return read [NGSCI::Read] Converted Read object def rf(read) if read.first_in_pair read.query_strand ? newread(read,strand:"-") : newread(read,strand:"+") else read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-") end end # Converts strand specific BAM read into a sequence object format # Assumes single-end strand-specific sequencing with "f" chemistry # # @param read [Bio::DB::Alignment] Read to be converted. # @return read [NGSCI::Read] Converted Read object def f(read) read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-") end # Creates a new read with optional strand argument # # @param read [Bio::DB::Alignment] Aligned read to be converted # @param strand [String] Strand of read # @return read [NGSCI::Read] Converted Read object def newread(read,strand: nil) Read.new(read.pos,read.pos+read.seq.size,strand: strand) end # Acquires names and sizes of reference sequences included in the bam file # # @param reference [String] Path to reference fasta file. # @return chromosomes [Hash] A dictionary of chromosome sizes def reference_sequences(reference) chromosomes={} Bio::FastaFormat.open(@reference).each_entry do |f| chromosomes[f.entry_id]=f.seq.size end chromosomes.select {|chrom| @bam.index_stats.keys.include?(chrom)} end # Exports the results to outfile # # @param outfile [String] Path to outfile def export(outfile) if @results File.open(outfile,'w') do |file| file.puts("Chrom,Base,Strand,Depth,Unique_Reads,Overlap,NGS-CI") @results.each do |chrom,results| results.each do |strand,val| val.each do |x| file.puts([chrom,x[0],strand,*x[1..-1]].join(",")) end end end end return outfile else return nil end end end # End calculator class end