bin/polymarker.rb in bio-polyploid-tools-0.7.3 vs bin/polymarker.rb in bio-polyploid-tools-0.8.0
- old
+ new
@@ -12,10 +12,11 @@
arm_selection_functions = Hash.new;
arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
+ contig_name.gsub!(/chr/,"")
ret = contig_name[0,2]
return ret
end
#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
@@ -41,19 +42,18 @@
ret = contig_name;
return ret
end
def validate_files(o)
-
[
o[:path_to_contigs],
o[:marker_list],
o[:snp_list],
o[:mutant_list],
o[:reference]
].flatten.compact.each do |f|
- raise IOError "Unable to read #{f}" unless File.exists? f
+ raise IOError.new "Unable to read #{f}" unless File.exists? f
end
end
options = {}
options[:path_to_contigs] = "/tgac/references/external/projects/iwgsc/css/IWGSC_CSS_all_scaff_v1.fa"
@@ -65,11 +65,15 @@
options[:flanking_size] = 150;
options[:variation_free_region] = 0
options[:extract_found_contigs] = false
options[:genomes_count] = 3
options[:min_identity] = 90
+options[:scoring] = :genome_specific
+options[:database] = false
+options[:aligner] = :exonerate
+
options[:primer_3_preferences] = {
:primer_product_size_range => "50-150" ,
:primer_max_size => 25 ,
:primer_lib_ambiguity_codes_consensus => 1,
:primer_liberal_base => 1,
@@ -117,11 +121,23 @@
opts.on("-e", "--exonerate_model MODEL", "Model to be used in exonerate to search for the contigs") do |o|
options[:model] = o
end
opts.on("-a", "--arm_selection arm_selection_embl|arm_selection_morex|arm_selection_first_two|scaffold", "Function to decide the chromome arm") do |o|
- options[:arm_selection] = arm_selection_functions[o.to_sym];
+ tmp_str = o
+ arr = o.split(",")
+ if arr.size == 2
+ options[:arm_selection] = lambda do |contig_name|
+ separator, field = arr
+ field = field.to_i
+ ret = contig_name.split(separator)[field]
+ return ret
+ end
+ else
+ options[:arm_selection] = arm_selection_functions[o.to_sym];
+ end
+
end
opts.on("-p", "--primer_3_preferences FILE", "file with preferences to be sent to primer3") do |o|
options[:primer_3_preferences] = Bio::DB::Primer3.read_primer_preferences(o, options[:primer_3_preferences] )
end
@@ -137,16 +153,30 @@
opts.on("-P", "--primers_to_order", "If present, save a separate file with the primers with the KASP tails")do
#TODO: have a string with the tails, optional.
options[:primers_to_order] = true
end
-
+ opts.on("-H", "--het_dels", "If present, change the scoring to give priority to: semi-specific, specific, non-specific") do
+ options[:scoring] = :het_dels
+ end
+
+ opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: exonerate") do |o|
+ raise "Invalid aligner" unless o == "exonerate" or o == "blast"
+ options[:aligner] = o.to_sym
+ end
+
+ opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
+ options[:database] = o
+ end
end.parse!
validate_files(options)
+ options[:database] = options[:path_to_contigs] unless options[:database]
+
+
if options[:primer_3_preferences][:primer_product_size_range]
range = options[:primer_3_preferences][:primer_product_size_range]
range_arr = range.split("-")
min = range_arr[0].to_i
max = range_arr[1].to_i
@@ -206,11 +236,11 @@
#0. Load the fasta index
fasta_reference_db = nil
if fasta_reference
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
fasta_reference_db.load_fai_entries
- p "Fasta reference: #{fasta_reference}"
+ write_status "Fasta reference: #{fasta_reference}"
end
#1. Read all the SNP files
#chromosome = nil
write_status "Reading SNPs"
@@ -237,25 +267,22 @@
snp.full_sequence = fasta_reference_db.fetch_sequence(region)
else
write_status "WARN: Unable to find entry for #{snp.gene}"
end
else
- rise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
+ raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
end
- rise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
+ raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
snp.genomes_count = options[:genomes_count]
snp.snp_in = snp_in
snp.original_name = original_name
if snp.position
snps << snp
else
$stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
end
-
-# chromosome = snp.chromosome unless chromosome
- # raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome" if chromosome != snp.chromosome
end
end
#1.1 Close fasta file
#fasta_reference_db.close() if fasta_reference_db
@@ -276,32 +303,49 @@
#chr_group = chromosome[0]
write_status "Searching markers in genome"
exo_f = File.open(exonerate_file, "w")
contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
filename=path_to_contigs
-puts filename
+#puts filename
target=filename
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
fasta_file.load_fai_entries
found_contigs = Set.new
-Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
+
+
+def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
if aln.identity > min_identity
exo_f.puts aln.line
unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
found_contigs.add(aln.target_id)
entry = fasta_file.index.region_for_entry(aln.target_id)
raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
- region = entry.get_full_region
- seq = fasta_file.fetch_sequence(region)
- contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs]
+ if options[:extract_found_contigs]
+ region = entry.get_full_region
+ seq = fasta_file.fetch_sequence(region)
+ contigs_f.puts(">#{aln.target_id}\n#{seq}")
+ end
end
end
+
end
+
+Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln|
+ do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
+end if options[:aligner] == :blast
+
+Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
+ do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+end if options[:aligner] == :exonerate
exo_f.close()
+
+
+
+exo_f.close()
contigs_f.close() if options[:extract_found_contigs]
#4. Load all the results from exonerate and get the input filename for primer3
#Custom arm selection function that only uses the first two characters. Maybe
#we want to make it a bit more cleaver
@@ -312,10 +356,11 @@
container.flanking_size=options[:flanking_size]
container.gene_models(temp_fasta_query)
container.chromosomes(target)
container.add_parental({:name=>snp_in})
container.add_parental({:name=>original_name})
+
snps.each do |snp|
snp.container = container
snp.flanking_size = container.flanking_size
snp.variation_free_region = options[:variation_free_region]
container.add_snp(snp)
@@ -335,18 +380,29 @@
added_exons = container.print_primer_3_exons(file, nil, snp_in)
file.close
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
-
#5. Pick the best primer and make the primer3 output
write_status "Selecting best primers"
kasp_container=Bio::DB::Primer3::KASPContainer.new
+
+
+
kasp_container.line_1= original_name
kasp_container.line_2= snp_in
+if options[:scoring] == :het_dels
+ kasp_container.scores = Hash.new
+ kasp_container.scores[:chromosome_specific] = 0
+ kasp_container.scores[:chromosome_semispecific] = 1000
+ kasp_container.scores[:chromosome_nonspecific] = 100
+end
+
snps.each do |snp|
- kasp_container.add_snp(snp)
+ snpk = kasp_container.add_snp(snp)
+
+
end
kasp_container.add_primers_file(primer_3_output) if added_exons > 0
header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
\ No newline at end of file