#!/usr/bin/env ruby
# Author : Naveed Ishaque (inspired by Dan Maclean) edited again by Dan to include ChD value setting and deletion of SNPs file
# naveed.ishaque@tsl.ac.uk; naveed.ishaque@hotmail.co.uk
# Date: 20th June 2012 and 1st November 2012
# This scripts produces a HTML with embedded images showing the SNP density and chastity plots for a given BAM file
# It will automatically iterate over all contigs, form begining to end
# NOTE - run using ruby executable: /home/programs/gngm/ruby/bin/ruby
# NOTE - this will only run on the cluster via a bsub command
require 'rubygems'
require 'bio'
require 'bio-gngm'
require 'base64'
require 'getoptions'
usage = "\n#{$PROGRAM_NAME} reads in a fasta and bam files and produces a html file indicating QTL locations as peaks\n\n\n\t #{$PROGRAM_NAME}\n\n\n -f [reference fasta file]\n -b [bam file]\n -e expected ChD (allele freq) default 1\n -c control ChD (allele freq) default 0.5\n -s List of known SNPS [tab delimited file]\n\n"
# PARSE INPUTS
opt = GetOptions.new(%w(h help f=@s b=@s e=@s c=@s s=@s))
puts "#{usage}" if opt[:h]
exit if opt[:h]
puts "#{usage}" if opt[:help]
exit if opt[:help]
puts "ERROR - no fasta file provided (-f)\n#{usage}" unless opt[:f]
exit unless opt[:f]
puts "ERROR - fasta file '#{opt[:f][0]}' does not exist\n#{$usage}" unless FileTest.exist?("#{opt[:f][0]}")
exit unless FileTest.exist?("#{opt[:f][0]}")
warn "\nUsing FASTA file #{opt[:f][0]}"
puts "ERROR - no bam file provided (-b)\n#{usage}" unless opt[:b]
exit unless opt[:b]
puts "ERROR - BAM file '#{opt[:b][0]}' does not exist\n#{usage}" unless FileTest.exist?("#{opt[:b][0]}")
exit unless FileTest.exist?("#{opt[:b][0]}")
warn "Using BAM file #{opt[:b][0]}"
expected_chd = 1.0
expected_chd =opt[:e][0].to_f if expected_chd and opt[:e]
control_chd = 0.5
control_chd = opt[:c][0].to_f if control_chd and opt[:c]
known_snps = Hash.new { |h,k| h[k] = Array.new }
if opt[:s].first
File.open(opt[:s].first).each do |line|
chr,pos = line.split("\t")
known_snps[chr] << pos.to_i
end
end
$stderr.puts "using expected ChD: #{expected_chd} and control ChD: #{control_chd}"
hist_bins = [100000, 250000, 500000]
ks = [5, 7, 9, 11]
kadjusts = [0.5, 0.25, 0.1, 0.05, 0.01]
warn "Histogram bin sizes: #{hist_bins}\nThread clusters (K): #{ks}\nKernal adjusts: #{kadjusts}\n"
# LOAD FASTA and find contigs
sequences = Bio::DB::FastaLengthDB.new(:file => "#{opt[:f][0]}")
# For Each contig in the fasta file analyse...
sequences.each do |id,length|
warn "\nProcessing #{id}:1 - #{length}..."
warn "Skipping #{id} as too short ..." if length < (4 * hist_bins.max)
next if length < (4 * hist_bins.max)
g = Bio::Util::Gngm.new(:file => "#{opt[:b][0]}",
:format => :bam,
:fasta => "#{opt[:f][0]}",
:start => 1,
:stop => 10000,
:chromosome => id,
:samtools => {
:q => 20,
:Q => 20
},
:ignore_file => "#{opt[:s][0]}",
:write_pileup => "pileup.txt",
:write_vcf => "snps.vcf"
)
# predict SNPs
warn " Prediciting SNPs for #{id}:1-#{length}..."
g.snp_positions
#delete SNPs from known snp_list
#a = g.snp_positions.dup
#known_snps[seq.entry_id].each {|snp_pos| a.delete_if{|x| x.first == snp_pos} }
#$stderr.puts "deleted #{g.snp_positions.length - a.length} snps appearing in #{opt[:s]}"
#g.snp_positions = a
# produce SNP density histograms
warn " Iterating over different histogram bin sizes..."
hist_bins.each do |bin_width|
warn " Makings PNG for bin size #{bin_width}..."
file_name = "#{id}_SNP_histogram_bin#{bin_width}.png"
g.frequency_histogram("#{file_name}",bin_width, :title => "#{id}: SNP density histogram (bin width - #{bin_width})", :width => 1066, :height => 300)
end
# Write to embedded HTML
htmlout = File.open("#{id}.html", 'w')
htmlout.puts "\n"
htmlout.puts "
\n"
htmlout.puts " GNGM #{id} - QTL mapping\n"
htmlout.puts " \n"
htmlout.puts " \n"
htmlout.puts " \n\t\t"
htmlout.puts " \n"
htmlout.puts " \n"
hist_bins.each do |bin_width|
htmlout.puts " \n"
htmlout.puts "\n"
htmlout.puts " | \n"
end
htmlout.puts "
\n"
htmlout.puts "
\n"
# Perform chastity calculations
warn " Collecting threads..."
g.collect_threads
warn " Iterating over k and kernel adjusts..."
ks.each do | k |
begin
warn " Makings PNG for k = #{k} ..."
warn " Calculating threads ..."
g.calculate_clusters(:k => k, :adjust => 0.5, :control_chd => control_chd, :expected_chd => expected_chd)
warn " Drawing threads ..."
filename = "#{id}_k#{k}_threads.png"
g.draw_threads(filename, :title => "#{id}: Chastity bands - all phases (k=#{k})", :width => 700, :height => 300)
warn " Clustering bands ..."
filename = "#{id}_k#{k}_clustered_bands.png"
g.draw_bands(filename, :title => "#{id}: Homozygous and heterozygous chastity belts (k=#{k})", :width => 800, :height => 300)
kadjusts.each do |kernel_adjust|
begin
warn " Calculating threads (with kernal adjust #{kernel_adjust}) ..."
g.calculate_clusters(:k => k, :adjust => kernel_adjust, :control_chd => control_chd, :expected_chd => expected_chd)
warn " Calculating signal ..."
filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_signal.png"
g.draw_signal(filename, :title => "#{id}: Homo/Het signal ratio (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
warn " Estimating peaks ..."
filename = "#{id}_k#{k}_kadjust#{kernel_adjust}_peaks.png"
g.draw_peaks(filename, :title => "#{id}: Signal peaks (k=#{k}, kernal=#{kernel_adjust})", :width => 800, :height => 300)
rescue => e
$stderr.puts "skipping #{k} #{kernel_adjust} => #{e}"
end
end
rescue => e
$stderr.puts "Skipping #{k} => #{e}"
end
end
g.close
# Write to embedded HTML
htmlout.puts " \n"
# all bands
htmlout.puts " \n"
ks.each do | k |
htmlout.puts " \n"
htmlout.puts ""
htmlout.puts " | \n"
end
htmlout.puts "
\n"
# homo/het bands
htmlout.puts " \n"
ks.each do | k |
htmlout.puts " \n"
htmlout.puts ""
htmlout.puts " | \n"
end
htmlout.puts "
\n"
# k/adjusts
kadjusts.each do |kernel_adjust|
htmlout.puts " \n"
ks.each do | k |
htmlout.puts " \n"
htmlout.puts ""
htmlout.puts " | \n"
end
htmlout.puts "
\n"
htmlout.puts " \n"
ks.each do | k |
htmlout.puts " \n"
htmlout.puts ""
htmlout.puts " | \n"
end
htmlout.puts "
\n"
end
htmlout.puts "
\n"
htmlout.puts "\n \n\n"
htmlout.close
end