#!/usr/bin/env ruby require 'bio' require 'rubygems' require 'pathname' require 'bio-samtools' require 'set' $: << File.expand_path(File.dirname(__FILE__) + '/../lib') $: << File.expand_path('.') path=File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb') $stderr.puts "Loading: #{path}" require path fasta_db = Bio::DB::Fasta::FastaFile.new( ARGV[0]) fasta_db.load_fai_entries bam1 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[1]}) bam2 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]}) output_prefix = ARGV[3] block_size=1000 min_cov = ARGV[4].to_i ? ARGV[4].to_i : 10 chunk = ARGV[5].to_i chunk_size = ARGV[6].to_i main_table="#{output_prefix}_#{block_size}_#{min_cov}_table.#{chunk}.csv" table_file = File.open(main_table, "w") table_file.puts "gene\tlength\tsnps_1\tcalled_1\tsnps_per_#{block_size}_1\tsnps_2\tcalled_2\tsnps_per_#{block_size}_2\tsnps_tot\tsnps_per_1k_tot" hist_1= Hash.new(0) hist_2= Hash.new(0) fasta_file = File.open("#{output_prefix}_#{min_cov}.#{chunk}.fa", "w") i = -1 min = chunk * chunk_size max = min + chunk_size fasta_db.index.entries.each do | r | i = i + 1 next if i < min or i >= max #Np r.get_full_region #container.process_region( { :region => r.get_full_region.to_s, :output_file => output_file } ) region=r.get_full_region begin reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1}) reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1}) cons_1 = reg_a.consensus cons_2 = reg_b.consensus snps_1 = cons_1.count_ambiguities snps_2 = cons_2.count_ambiguities called_1 = reg_a.called called_2 = reg_b.called snps_tot = Bio::Sequence.snps_between(cons_1, cons_2) snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size hist_1[snps_per_1k_1.to_i] += 1 hist_2[snps_per_1k_2.to_i] += 1 table_file.print "#{r.id}\t#{region.size}\t" table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t" table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t" table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n" fasta_file.puts ">#{r.id}_1" fasta_file.puts "#{cons_1}" fasta_file.puts ">#{r.id}_2" fasta_file.puts "#{cons_2}" rescue Exception => e $stderr.puts "Unable to process #{region}: #{e.to_s}" end end fasta_file.close table_file.close hist_table="#{output_prefix}_#{block_size}_#{min_cov}_hist.#{chunk}.csv" hist_file = File.open(hist_table, "w") all_keys = SortedSet.new(hist_1.keys) all_keys.merge(hist_2.keys) hist_file.puts "SNPs/#{block_size}\thist_1\thist_2\n" all_keys.each do |k| hist_file.puts "#{k}\t#{hist_1[k]}\t#{hist_2[k]}" end hist_file.close