bin/polymarker.rb in bio-polyploid-tools-0.7.3 vs bin/polymarker.rb in bio-polyploid-tools-0.8.0

- old
+ new

@@ -12,10 +12,11 @@ arm_selection_functions = Hash.new; arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name | + contig_name.gsub!(/chr/,"") ret = contig_name[0,2] return ret end #Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110" @@ -41,19 +42,18 @@ ret = contig_name; return ret end def validate_files(o) - [ o[:path_to_contigs], o[:marker_list], o[:snp_list], o[:mutant_list], o[:reference] ].flatten.compact.each do |f| - raise IOError "Unable to read #{f}" unless File.exists? f + raise IOError.new "Unable to read #{f}" unless File.exists? f end end options = {} options[:path_to_contigs] = "/tgac/references/external/projects/iwgsc/css/IWGSC_CSS_all_scaff_v1.fa" @@ -65,11 +65,15 @@ options[:flanking_size] = 150; options[:variation_free_region] = 0 options[:extract_found_contigs] = false options[:genomes_count] = 3 options[:min_identity] = 90 +options[:scoring] = :genome_specific +options[:database] = false +options[:aligner] = :exonerate + options[:primer_3_preferences] = { :primer_product_size_range => "50-150" , :primer_max_size => 25 , :primer_lib_ambiguity_codes_consensus => 1, :primer_liberal_base => 1, @@ -117,11 +121,23 @@ opts.on("-e", "--exonerate_model MODEL", "Model to be used in exonerate to search for the contigs") do |o| options[:model] = o end opts.on("-a", "--arm_selection arm_selection_embl|arm_selection_morex|arm_selection_first_two|scaffold", "Function to decide the chromome arm") do |o| - options[:arm_selection] = arm_selection_functions[o.to_sym]; + tmp_str = o + arr = o.split(",") + if arr.size == 2 + options[:arm_selection] = lambda do |contig_name| + separator, field = arr + field = field.to_i + ret = contig_name.split(separator)[field] + return ret + end + else + options[:arm_selection] = arm_selection_functions[o.to_sym]; + end + end opts.on("-p", "--primer_3_preferences FILE", "file with preferences to be sent to primer3") do |o| options[:primer_3_preferences] = Bio::DB::Primer3.read_primer_preferences(o, options[:primer_3_preferences] ) end @@ -137,16 +153,30 @@ opts.on("-P", "--primers_to_order", "If present, save a separate file with the primers with the KASP tails")do #TODO: have a string with the tails, optional. options[:primers_to_order] = true end - + opts.on("-H", "--het_dels", "If present, change the scoring to give priority to: semi-specific, specific, non-specific") do + options[:scoring] = :het_dels + end + + opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: exonerate") do |o| + raise "Invalid aligner" unless o == "exonerate" or o == "blast" + options[:aligner] = o.to_sym + end + + opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o| + options[:database] = o + end end.parse! validate_files(options) + options[:database] = options[:path_to_contigs] unless options[:database] + + if options[:primer_3_preferences][:primer_product_size_range] range = options[:primer_3_preferences][:primer_product_size_range] range_arr = range.split("-") min = range_arr[0].to_i max = range_arr[1].to_i @@ -206,11 +236,11 @@ #0. Load the fasta index fasta_reference_db = nil if fasta_reference fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference}) fasta_reference_db.load_fai_entries - p "Fasta reference: #{fasta_reference}" + write_status "Fasta reference: #{fasta_reference}" end #1. Read all the SNP files #chromosome = nil write_status "Reading SNPs" @@ -237,25 +267,22 @@ snp.full_sequence = fasta_reference_db.fetch_sequence(region) else write_status "WARN: Unable to find entry for #{snp.gene}" end else - rise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. " + raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. " end - rise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil + raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil snp.genomes_count = options[:genomes_count] snp.snp_in = snp_in snp.original_name = original_name if snp.position snps << snp else $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP" end - -# chromosome = snp.chromosome unless chromosome - # raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome" if chromosome != snp.chromosome end end #1.1 Close fasta file #fasta_reference_db.close() if fasta_reference_db @@ -276,32 +303,49 @@ #chr_group = chromosome[0] write_status "Searching markers in genome" exo_f = File.open(exonerate_file, "w") contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs] filename=path_to_contigs -puts filename +#puts filename target=filename fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target}) fasta_file.load_fai_entries found_contigs = Set.new -Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln| + + +def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options) if aln.identity > min_identity exo_f.puts aln.line unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file. found_contigs.add(aln.target_id) entry = fasta_file.index.region_for_entry(aln.target_id) raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil - region = entry.get_full_region - seq = fasta_file.fetch_sequence(region) - contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs] + if options[:extract_found_contigs] + region = entry.get_full_region + seq = fasta_file.fetch_sequence(region) + contigs_f.puts(">#{aln.target_id}\n#{seq}") + end end end + end + +Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln| + do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options) +end if options[:aligner] == :blast + +Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln| + do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options) +end if options[:aligner] == :exonerate exo_f.close() + + + +exo_f.close() contigs_f.close() if options[:extract_found_contigs] #4. Load all the results from exonerate and get the input filename for primer3 #Custom arm selection function that only uses the first two characters. Maybe #we want to make it a bit more cleaver @@ -312,10 +356,11 @@ container.flanking_size=options[:flanking_size] container.gene_models(temp_fasta_query) container.chromosomes(target) container.add_parental({:name=>snp_in}) container.add_parental({:name=>original_name}) + snps.each do |snp| snp.container = container snp.flanking_size = container.flanking_size snp.variation_free_region = options[:variation_free_region] container.add_snp(snp) @@ -335,18 +380,29 @@ added_exons = container.print_primer_3_exons(file, nil, snp_in) file.close Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0 - #5. Pick the best primer and make the primer3 output write_status "Selecting best primers" kasp_container=Bio::DB::Primer3::KASPContainer.new + + + kasp_container.line_1= original_name kasp_container.line_2= snp_in +if options[:scoring] == :het_dels + kasp_container.scores = Hash.new + kasp_container.scores[:chromosome_specific] = 0 + kasp_container.scores[:chromosome_semispecific] = 1000 + kasp_container.scores[:chromosome_nonspecific] = 100 +end + snps.each do |snp| - kasp_container.add_snp(snp) + snpk = kasp_container.add_snp(snp) + + end kasp_container.add_primers_file(primer_3_output) if added_exons > 0 header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors" File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") } \ No newline at end of file