#!/usr/bin/env ruby require 'optparse' require 'ostruct' require 'bio-maf' require 'bio-genomic-interval' def parse_interval(line) src, r_start_s, r_end_s, _ = line.split(nil, 4) r_start = r_start_s.to_i r_end = r_end_s.to_i return Bio::GenomicInterval.zero_based(src, r_start, r_end) end def target_for(base, interval, &blk) path = "#{base}_#{interval.zero_start}-#{interval.zero_end}.fa" File.open(path, 'w', &blk) end def apply_options(options, tiler) tiler.reference = options.ref if options.ref tiler.species = options.species tiler.species_map = options.species_map end options = OpenStruct.new options.p = { :threads => 1 } options.species = [] options.species_map = {} options.usage = false o_parser = OptionParser.new do |opts| opts.banner = "Usage: maf_tile [options] [index]" opts.separator "" opts.separator "Options:" opts.on("-r", "--reference SEQ", "FASTA reference sequence") do |ref| options.ref = ref end opts.on("-i", "--interval [CHR:]BEGIN:END", "Genomic interval, zero-based") do |int| if int =~ /(.+):(\d+):(\d+)/ gi = Bio::GenomicInterval.zero_based($1, ($2.to_i), ($3.to_i)) options.genomic_interval = gi elsif int =~ /(\d+):(\d+)/ options.interval = ($1.to_i)...($2.to_i) else $stderr.puts "Invalid interval specification #{int}!" options.usage = true end end opts.on("-s", "--species SPECIES[:NAME]", "Species to use (with mapped name)") do |sp| if sp =~ /:/ species, mapped = sp.split(/:/) options.species << species options.species_map[species] = mapped else options.species << sp end end opts.on("-o", "--output-base BASE", "Base name for output files", "Use stdout for a single interval if not given") do |base| options.output_base = base end opts.on("--bed BED", "BED file specifying intervals", "(requires --output-base)") do |bed| options.bed = bed end end o_parser.parse!(ARGV) maf_p = ARGV.shift index_p = ARGV.shift unless (! options.usage) \ && maf_p && (! options.species.empty?) \ && (options.output_base \ ? options.bed \ : options.interval || options.genomic_interval) $stderr.puts o_parser exit 2 end access = if File.directory? maf_p Bio::MAF::Access.maf_dir(maf_p, options.p) else Bio::MAF::Access.file(maf_p, index_p, options.p) end if options.bed intervals = [] File.open(options.bed) do |bed_f| bed_f.each_line { |line| intervals << parse_interval(line) } end intervals.sort_by! { |int| int.zero_start } intervals.each do |int| access.tile(int) do |tiler| apply_options(options, tiler) target_for(options.output_base, int) do |target| tiler.write_fasta(target) end end end else # single interval if options.genomic_interval interval = options.genomic_interval else if access.indices.size != 1 raise "Must explicitly specify sequence in --interval argument with multiple candidate MAF files!" end ref_seq = access.indices.keys.first interval = Bio::GenomicInterval.zero_based(ref_seq, options.interval.begin, options.interval.end) end access.tile(interval) do |tiler| apply_options(options, tiler) if options.output_base target = target_for(options.output_base, tiler.interval) else target = $stdout end tiler.write_fasta(target) target.close end end