require 'scbi_zcat' class Mpileup def initialize(file_path) @mpileup_file = ScbiZcatFile.new(file_path) @last_line = nil end def read_contig(contig_name, contig_length) coverages = [] if !@last_line.nil? if @last_line[0] != contig_name return nil else coverages = initialize_contig(contig_length) end else line = @mpileup_file.readline if line.nil? @last_line = nil return nil else @last_line = line.chomp.split("\t") if @last_line[0] != contig_name return nil else coverages = initialize_contig(contig_length) end end end while !@mpileup_file.eof fields = @mpileup_file.readline.chomp.split("\t") contig = fields[0] if contig == contig_name coverages[fields[1].to_i-1] = fields[2].to_i else @last_line = fields break end end return coverages end def initialize_contig(contig_length) coverages = Array.new(contig_length, 0) coverages[@last_line[1].to_i-1] = @last_line[2].to_i return coverages end def close @mpileup_file.close end end class Mapping def initialize(user_options = {}) options = { threads: 1, total_reads: [], }.merge!(user_options) @ref_fasta_path = options[:ref_fasta_path] @temp_folder = options[:temp_folder] @threads = options[:threads] @map_files = [] @paired = [] @idxstats = [] @mpileups = [] @coverage_results = {} @total_reads = options[:total_reads] end def do_ref(user_options = {}) options = { name: 'ref', command: 'bowtie2-build -f --threads /THREADS/ /REF_FASTA/ /OUTPUT/', log: 'reference_log', force: false } options.merge!(user_options) @ref = File.join(@temp_folder, options[:name]) cmd = options[:command].gsub('/THREADS/', @threads.to_s) cmd.gsub!('/REF_FASTA/', @ref_fasta_path) cmd.gsub!('/OUTPUT/', @ref) cmd = cmd + " &> #{File.join(@temp_folder, options[:log])}" system(cmd) if options[:force] || Dir.glob(@ref+'*.bt2').length == 0 end def do_samtools_ref cmd = "samtools faidx #{@ref_fasta_path}" system(cmd) if !File.exists?(@ref_fasta_path + '.fai') end def do_map(user_options = {}) options = { files: [], command: 'bowtie2 -p /THREADS/ -x /REFERENCE/', paired_pipe: '| samtools view -bS -F 4 | samtools sort -o /OUTPUT/', single_pipe: '| samtools view -bS -F 4 | samtools sort -o /OUTPUT/', flag_single: '-U', flags_paired: ['-1', '-2'], additional_paired_flags: '', flag_output: '-S', output: File.join(@temp_folder, 'map_data'), log: File.join(@temp_folder, 'mapping_log'), force: false } options.merge!(user_options) options[:files].each_with_index do |read_files, map_process_id| cmd = options[:command].gsub('/THREADS/', @threads.to_s) cmd.gsub!('/REFERENCE/', @ref) if read_files.length == 1 cmd = cmd + " #{options[:flag_single]} #{read_files.first}" @paired << false elsif read_files.length == 2 @paired << true cmd = cmd + " #{options[:additional_paired_flags]} #{options[:flags_paired].first} #{read_files.first} #{options[:flags_paired].last} #{read_files.last}" else raise('Incorrect number of read files. Must be 1 (single) or 2 (paired).') end map_file = nil if options[:paired_pipe].nil? || options[:single_pipe].nil? map_file = options[:output] + "_#{map_process_id}" + '.sam' cmd = cmd + " #{options[:flag_output]} #{map_file} &> #{options[:log]}_#{map_process_id}" else if @paired[map_process_id] pipe = options[:paired_pipe] else pipe = options[:single_pipe] end map_file = options[:output] + "_#{map_process_id}" + '.bam' cmd = cmd + " 2> #{options[:log]}_#{map_process_id} " + pipe.gsub('/OUTPUT/', map_file) end @map_files << map_file system(cmd) if options[:force] || !File.exists?(map_file) @total_reads << File.open("#{options[:log]}_#{map_process_id}").readlines.select{|line| /\d+ reads; of these:/ =~ line}.first.split(' ').first.to_i if File.exists?("#{options[:log]}_#{map_process_id}") && @total_reads[map_process_id].nil? raise('ERROR: The mapping process has failed, please check the map folder into the temp folder') if @total_reads[map_process_id].nil? || @total_reads[map_process_id] == 0 end end def index(user_options = {}) @map_files.each do |map_file| system("samtools index #{map_file}") if (map_file.include?('.bam') && !File.exists?(map_file+'.bai')) || user_options[:force] end end def report reports = [] @map_files.each do |map_file| cmd = "samtools flagstat #{map_file}" report = %x[#{cmd}].split("\n") reports << report end return reports end def idxstats @map_files.each_with_index do |map_file, map_process_id| prefix = File.basename(map_file).gsub(/\.bam|\.sam|\.cram/, '') file_path = File.join(@temp_folder, "#{prefix}_idxstats_#{map_process_id}.gz") cmd = "samtools idxstats #{map_file} | gzip - -f > #{file_path}" system(cmd) if !File.exists?(file_path) parse_idxstats(file_path) end end def mpileup(user_options = {}) parse_options = { add_coverages: false, normalize_coverages: false, cols: [1,2,4] # 1 based for cut } parse_options.merge!(user_options.delete(:parse_options)) if !user_options[:parse_options].nil? opts = [] do_samtools_ref user_options.each do |flag, value| opts << [flag, value.to_s] end contig_list_file = File.join(@temp_folder, File.basename(@ref_fasta_path)+'.lst') system("grep '>' #{@ref_fasta_path} | sed 's/>//g' > #{contig_list_file}") if !File.exists?(contig_list_file) idxstats if @idxstats.empty? cut = nil cut = " |cut -f #{parse_options[:cols].join(',')}" if !parse_options[:cols].nil? && !parse_options[:cols].empty? mpileup_files = [] @map_files.each_with_index do |map_file, map_process_id| prefix = File.basename(map_file).gsub(/\.bam|\.sam|\.cram/, '') file_path = File.join(@temp_folder, "#{prefix}_mpileup_#{map_process_id}.gz") mpileup_files << file_path cmd = "samtools mpileup -f #{@ref_fasta_path} #{opts.join(' ')} #{map_file}#{cut} | gzip - -f > #{file_path}" system(cmd) if !File.exists?(file_path) end coverage_results = {} parse_mpileup(mpileup_files, contig_list_file) do |contig_name, contig_length, coverages| mapped_reads = @idxstats.map{|info| info[contig_name][:mapped]}.inject { |sum, n| sum + n } get_coverage_parameters(contig_name, contig_length, mapped_reads, coverages, parse_options, coverage_results) end return coverage_results end def parse_mpileup(file_paths, contig_list_file) last_contig = nil mpileup_files = file_paths.map{|file_path| Mpileup.new(file_path)} File.open(contig_list_file).each do |contig_name| contig_name.chomp! contig_length = @idxstats.first[contig_name][:length] all_coverages = [] mpileup_files.each do |mpileup_file| coverages = mpileup_file.read_contig(contig_name, contig_length) all_coverages << coverages if !coverages.nil? && !coverages.empty? end yield(contig_name, contig_length, all_coverages) end mpileup_files.map{|mf| mf.close} end def parse_idxstats(file_path) stats = {} stats_file = ScbiZcatFile.new(file_path) while !stats_file.eof fields = stats_file.readline.chomp.split("\t") stats[fields[0]] = {length: fields[1].to_i, mapped: fields[2].to_i, unmmapped: fields[3].to_i} end stats_file.close stats.delete('*') @idxstats << stats end private def get_coverage_parameters(seq_name, contig_length, mapped_reads, mpileup_info, options, coverage_results) # begin mean_normalized_differences = 0 mean_max = 0 mean_coverage = 0 proportion_sequence_mapped = 0 fpkm = 0 if mapped_reads > 0 if !mpileup_info.empty? if mpileup_info.length == 1 coverages = mpileup_info.first else coverages = mpileup_info.transpose.map {|x| x.reduce(:+)} end mean_normalized_differences, mean_max, mean_coverage, proportion_sequence_mapped, fpkm = calculate_coverage_parameters(coverages, contig_length, mapped_reads, options) end end record = [mean_normalized_differences, mean_max, mean_coverage, proportion_sequence_mapped, fpkm, mapped_reads] record << coverages if options[:add_coverages] coverage_results[seq_name] = record # rescue Exception => e # puts "ERROR: The reference sequence: #{seq_name} has failed", # e.message, # e.backtrace.join("\n") # end end def calculate_coverage_parameters(coverages, ref_length, mapped_reads, options) n_mates = 1.0 n_mates = 2.0 if @paired millions = @total_reads.inject { |sum, n| sum + n }/1.0e6 mean_normalized_differences = 0 mean_max = 0 mean_coverage = 0 proportion_sequence_mapped = 0 fpkm = 0 greater0 = coverages.select{|c| c > 0} coverages_greater0 = greater0.length if coverages_greater0 > 0 fpkm = mapped_reads/n_mates/(ref_length/1000.0)/millions mean_coverage = coverages.inject { |sum, n| sum + n }.fdiv(ref_length) n_max = (coverages.length/10.0).ceil maximums = coverages.sort{|c1, c2| c2 <=> c1}[0..n_max-1] mean_max = maximums.inject { |sum, n| sum + n }.fdiv(n_max) mean_coverage_filtered = greater0.inject { |sum, n| sum + n }.fdiv(coverages_greater0) normalized_differences = greater0.map{|c| (c - mean_coverage_filtered).abs/mean_coverage_filtered} mean_normalized_differences = normalized_differences.inject { |sum, n| sum + n } / normalized_differences.length proportion_sequence_mapped = greater0.length.fdiv(ref_length) if options[:normalize_coverages] max = coverages.max coverages.map!{|cov| cov.fdiv(max) } end end return mean_normalized_differences, mean_max, mean_coverage, proportion_sequence_mapped, fpkm end end