require 'bio' require 'parallel' require 'bio-samtools' require 'ruby-prof' module NGSCI # A calculator calculates the sequencing complexity index. # # @author Matthew Ralston # @abstract A class for calculating the complexity index on next generation sequencing reads # @attr_reader [Integer] block_size The block size for parallelizing disk access # @attr_reader [Hash] chroms A hash of chromosomes and their sizes # @attr_reader [Integer] read_length The read length obtained from a bam file # @attr_reader [Integer] denominator The denominator and normalization factors calculated from the read length class Calculator attr_reader :block_size, :chroms, :read_length, :denominator # A new calculator to compute the sequencing complexity index given # a loaded Bio::DB::Sam object and optional thread argument. # # @param [Bio::DB::Sam] bam Opened bam file with loaded reference. # @param [Int] threads The number of threads used to compute NGSCI. # @param [String] strand One of [FR RF F] or nil for strandedness. def initialize(bam, reference, strand: nil, threads: 1) @block_size = 1600 @results = nil @reference=reference @bam = Bio::DB::Sam.new(:bam=>bam,:fasta=>reference) unless @bam.indexed? @bam.index end @bam.open @threads = threads @chroms = reference_sequences(reference) @read_length = NGSCI::Calculator.read_length_calc(@bam,@block_size) @denominator = denominator_calc(@read_length) if strand unless %w(FR RF F).include?(strand) raise NGSCI::NGSCIError.new "Strand specific option #{opts.strand} is invalid." + " It must be one of: [FR, RF, F]" end @strand = strand.downcase else @strand = nil end end # Calculation of the sequencing complexity index # # @param runtime [false] Print profiling information? def run(runtime: false) RubyProf.start if runtime # Convert each aligned read to Read clas chroms={} @chroms.each do |chrom,size| chroms[chrom] = @strand ? {"+"=>[],"-"=>[]} : {nil=>[]} disk_accesses = (size/@block_size.to_f).ceil =begin # N O N - P A R A L L E L i=0 while i < disk_accesses readblock(chrom,i).each do |key,val| chroms[chrom][key] += val end i+=1 end =end data = Parallel.map((0...disk_accesses).to_a,:in_processes => @threads) do |i| readblock(chrom,i) end chroms[chrom].keys.each do |key| chroms[chrom][key] = data.map{|x| x[key]}.flatten(1) end end # Printing runtime information for optimization if runtime runtime=RubyProf.stop printer=RubyProf::FlatPrinter.new(runtime) printer.print(STDOUT) end @results = chroms end # Reads a single block from the disk and calculates the NGSCI # # @param chrom [String] The chromosome from the bam file # @param i [Integer] The number of blocks that have been read # @return [Hash] # * :+ (Array[Array]) The NGSCI for the + strand along the # * :- (Array[Array]) The NGSCI for the - strand def readblock(chrom,i) reads=[] results = @strand ? {"+" => [],"-" => []}: {nil => []} start = [0,(i * @block_size) - @read_length].max stop = [(i + 1) * @block_size, self.chroms[chrom]].min @bam.fetch(chrom,start,stop) {|read| reads << convert(read)} start += @read_length unless start == 0 reads.compact! reads.sort_by!(&:start) unless reads.empty? x=0 bases = (start...stop).to_a block = stop - start while x < block b = bases[x] aligned = reads.select{|r| r.start <= b && r.stop - 1 >= b}.group_by &:strand results.keys.each do|key| results[key] << [b,*sci(aligned[key] || [])] end x+=1 end return results end # Calculates sequencing complexity index for a single base # # @param reads [Array] A group of reads aligned to a single base. # @return [Array] def sci(reads) numreads=reads.size # Groups reads by start site # selects the largest read length from the groups reads = reads.group_by(&:start).map{|k,v| v.max{|x,y| x.length <=> y.length}} d = summed_dissimilarity(reads) uniquereads = reads.size return [numreads,uniquereads,(d.to_f/@read_length).round(4),(100*uniquereads*d/@denominator).round(4)] end # Calculation of the dissimilarity between two reads # # @param read1 [NGSCI::Read] First read to be compared # @param read2 [NGSCI::Read] Second read to be compared # @return [Integer] Length of non-overlapping/unique bases def dissimilarity(read1,read2) if read1.start > read2.start if read1.stop < read2.stop # Read 1 is inside read 2 (read1.start - read2.start) + (read2.stop - read1.stop) else # Normal overlap read1.start - read2.start end else if read1.stop > read2.stop # Read 2 is inside read 1 (read2.start - read1.start) + (read1.stop - read2.stop) else # Normal overlap read2.start - read1.start end end end # Calculates summed dissimilarity between a group of reads # # @param reads [Array] Array of reads # @return [Integer] Sum of all dissimilarities between the group of reads def summed_dissimilarity(reads) numreads = reads.size sum=0 unless numreads <= 1 i = 0 while i < numreads r1 = reads[i] # for each of n reads sum+=reads. reject{|r| r == r1}. # select the n-1 other reads map{|r| dissimilarity(r,r1)}. # calculate their overlap to r1 reduce(:+) i+=1 end end return sum end # Calculates the average summed dissimilarity (per read) of that read to all other reads # # @param [Integer] read_length The read length # @return [Integer] avg_summed_dissimilarity def max_summed_dissimilarity(read_length) # For each unique read under maximum saturation, calculate the sum of dissimilarities for that read to all other reads summed_dissimilarities = (1..read_length).to_a.map { |r| (read_length ** 2) / 2 - read_length*r + read_length/2 + r**2 - r }.reduce(:+) end # Calculates the denominator for the complexity index from the read length, assuming maximum saturation (i.e. number of unique reads == read_length) # unique reads /read length * summed_dissimilarity / (max_summed_dissimilarity/(read length * read length) # Denomiator = read length * max_summed_dissimilarity / (read_length * read_length) # # @param [Integer] read_length The read length # @return [Float] denominator The denominator including normalization factors for the complexity index 349184 def denominator_calc(read_length) read_length*max_summed_dissimilarity(read_length) end # Calculates the read length of a bam file by sampling at least on full block of reads # # @param [Bio::DB::Sam] bam A bam reader object # @param [Integer] block_size The number of reads to read from a bam file # @return [Integer] read_length The read length acquired from reading a block at a time until at least 100 reads are acquired def self.read_length_calc(bam,block_size) stats=bam.index_stats.select {|k,v| k != "*" && v[:mapped_reads] > 0} if stats.empty? raise NGSCIIOError.new "BAM file is empty! Check samtools idxstats." end i=0 lengths=[] test = block_size while i <= test bam.view do |read| lengths << read.seq.size i +=1 end if i == test && lengths.size < 100 test += block_size end end lengths.max end # Converts strand specific BAM read into a sequence object format # Uses the @strand instance variable to determine the strand of conversion # # @param [Bio::DB::Alignment] read Read to be converted. # @return [NGSCI::Read] read Converted Read object def convert(read) unless read.query_unmapped if @strand return self.send(@strand.to_sym,read) else return newread(read) end end return nil end # Converts strand specific BAM read into a sequence object format # Assumes paired-end strand-specific sequencing with "fr" chemistry # # @param read [Bio::DB::Alignment] Read to be converted. # @return [NGSCI::Read] Converted Read object def fr(read) if read.first_in_pair read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-") else read.query_strand ? newread(read,strand:"-") : newread(read,strand:"+") end end # Converts strand specific BAM read into a sequence object format # Assumes paired-end strand-specific sequencing with "rf" chemistry # # @param read [Bio::DB::Alignment] Read to be converted. # @return [NGSCI::Read] Converted Read object def rf(read) if read.first_in_pair read.query_strand ? newread(read,strand:"-") : newread(read,strand:"+") else read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-") end end # Converts strand specific BAM read into a sequence object format # Assumes single-end strand-specific sequencing with "f" chemistry # # @param read [Bio::DB::Alignment] Read to be converted. # @return [NGSCI::Read] Converted Read object def f(read) read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-") end # Creates a new read with optional strand argument # # @param read [Bio::DB::Alignment] Aligned read to be converted # @param strand [String] Strand of read # @return [NGSCI::Read] Converted Read object def newread(read,strand: nil) Read.new(read.pos,read.pos+read.seq.size,strand: strand) end # Acquires names and sizes of reference sequences included in the bam file # # @param reference [String] Path to reference fasta file. # @return [Hash] A dictionary of chromosome sizes def reference_sequences(reference) chromosomes={} Bio::FastaFormat.open(@reference).each_entry do |f| chromosomes[f.entry_id]=f.seq.size end chromosomes.select {|chrom| @bam.index_stats.keys.include?(chrom)} end # Exports the results to outfile # # @param outfile [String] Path to outfile def export(outfile) if @results File.open(outfile,'w') do |file| file.puts("Chrom,Base,Strand,Depth,Unique_Reads,Overlap,NGS-CI") @results.each do |chrom,results| results.each do |strand,val| val.each do |x| file.puts([chrom,x[0],strand,*x[1..-1]].join(",")) end end end end return outfile else return nil end end end # End calculator class end