bin/gff3-fetch in bio-gff3-0.8.4 vs bin/gff3-fetch in bio-gff3-0.8.5

- old
+ new

@@ -8,37 +8,40 @@ USAGE = <<EOM Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) + print in FASTA format. - gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3 + gff3-fetch [options] type [filename.fa] filename.gff3 - Where (NYI == Not Yet Implemented): - - --translate : output as amino acid sequence - --validate : validate GFF3 file by translating - --fix : check 3-frame translation and fix, if possible - --fix-wormbase : fix 3-frame translation on ORFs named 'gene1' - --no-assemble : output each record as a sequence -- NYI - --add-phase : output records using phase (useful w. no-assemble CDS to AA) --NYI + --translate : output as amino acid sequence + --validate : validate GFF3 file by translating + --fix : check 3-frame translation and fix, if possible + --fix-wormbase : fix 3-frame translation on ORFs named 'gene1' + --no-assemble : output each record as a sequence + --phase : output records using phase (useful w. no-assemble CDS to AA) type is any valid type in the GFF3 definition. For example: - mRNA : assemble mRNA - CDS : assemble CDS - exon : list all exons - gene|ORF : list gene ORFs - other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI + mRNA : assemble mRNA + CDS : assemble CDS + exon : list all exons + gene|ORF : list gene ORFs + other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI and the following performance options: - --cache full : load all in RAM (fast) - --cache none : do not load anything in memory (slow) - --low-mem : use LRU cache (limit RAM use, fast) -- NYI - --max-cpus num : use num threads -- NYI - --emboss : use EMBOSS translation (fast) -- NYI + --parser bioruby : use BioRuby GFF3 parser (slow) + --parser line : use GFF3 line parser -- in preparation + --parser block : use GFF3 block parser (optimistic) -- NYI + --cache full : load all in RAM (fast, default) + --cache none : do not load anything in memory (slow) + --cache lru : use LRU cache (limit RAM use, fast) -- NYI + --max-cpus num : use num threads -- NYI + --emboss : use EMBOSS translation (fast) -- NYI + Where (NYI == Not Yet Implemented): + Multiple GFF3 files can be used. With external FASTA files, always the last one before the GFF3 filename is matched. Note that above switches are only partially implemented at this stage. Full feature support is projected Feb. 2011. @@ -50,11 +53,11 @@ gff3-fetch mRNA test/data/gff/test.gff3 gff3-fetch CDS test/data/gff/test.gff3 Find CDS records from external FASTA file, adding phase and translate to protein sequence - gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3 + gff3-fetch --no-assemble --phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3 Find mRNA from external FASTA file, without loading everything in RAM gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3 gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3 @@ -65,19 +68,29 @@ Find GENEID predicted terminal exons gff3-fetch terminal chromosome1.fa geneid.gff3 + Fine tuning output - show errors only + + gff3-fetch mRNA test/data/gff/test.gff3 --trace ERROR + + Fine tuning output - write log messages to file.log + + gff3-fetch mRNA test/data/gff/test.gff3 --trace ERROR --logger file.log + == Performance -time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa +time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 2> /dev/null > test.fa - Cache real user sys + Cache real user sys + ------------------------------------------------------------ + full,bioruby 12m41 12m28 0m09 (0.8.0) + full,line 12m13 12m06 0m07 (0.8.5) + none,bioruby 504m39 477m49 26m50 (0.8.0) + lru,bioruby ? ---------------------------------------------------- - full 12m41s 12m28s 0m09s (0.8.0 Jan. 2011) - none 504m39s 477m49s 26m50s (0.8.0 Jan. 2011) - ---------------------------------------------------- where 52M m_hapla.WS217.dna.fa 456M m_hapla.WS217.gff3 @@ -95,15 +108,15 @@ Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl> EOM -SEP = File::SEPARATOR -rootpath = File.dirname(File.dirname(__FILE__))+SEP -$: << rootpath+'lib' -GFF3_VERSION = File.new(rootpath+'VERSION').read.chomp +rootpath = File.dirname(File.dirname(__FILE__)) +$: << File.join(rootpath,'lib') +GFF3_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp + require 'bio-gff3' require 'optparse' require 'ostruct' require 'bio/output/gfffastawriter' @@ -111,18 +124,36 @@ if ARGV.size == 0 print USAGE end +Bio::Log::CLI.logger('stderr') +Bio::Log::CLI.trace('info') + options = OpenStruct.new() + +# ---- Default options +options.parser = :bioruby + opts = OptionParser.new() { |opts| opts.on_tail("-h", "--help", "Show help and examples") { print(opts) print USAGE exit() } + opts.on("--parser [bioruby,line]", String, "Parser (default bioruby)") do |p| + case p.downcase + when 'bioruby' + options.parser = :bioruby + when 'line' + options.parser = :line + else + raise 'Unknown --parser option' + end + end + opts.on("--cache [none,full]", String, "Caching (default full)") do |cache| case cache.downcase when 'none' options.cache = :cache_none # when 'lru' @@ -132,41 +163,60 @@ else raise 'Unknown --cache option' end end + opts.on("--no-assemble", "output sequences without assembling") do |b| + options.no_assemble = true + end + + opts.on("--phase", "adjust for phase (useful for CDS --no-assemble --translate)") do |b| + options.phase = true + end + opts.on("--translate", "output as amino acid sequence") do |b| options.translate = b end opts.on("--validate", "validate GFF3 file by translating") do |v| options.validate = v - $stop_on_error = true # replace global in near future end opts.on("--fix", "Fix frame errors in the GFF3 definition") do |v| options.fix = true end opts.on("--fix-wormbase", "Wormbase fix gene1 frame error") do |v| options.fix_wormbase = true end - # opts.on("-q", "--quiet", "Run quietly") do |q| - # options.quiet = q - # end + opts.on("--logger filename",String,"Log to file (default stderr)") do | name | + Bio::Log::CLI.logger(name) + end - # opts.on("-v", "--[no-]verbose", "Run verbosely") do |v| - # options.verbose = v - # end + opts.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s | + Bio::Log::CLI.trace(s) + end - # opts.on("-t", "--[no-]trace", "Debug") do |t| - # options.trace = t - # end -} + opts.on("-q", "--quiet", "Run quietly") do |q| + Bio::Log::CLI.trace('error') + end + + opts.on("-v", "--verbose", "Run verbosely") do |v| + Bio::Log::CLI.trace('info') + end + + opts.on("--debug", "Show debug messages") do |v| + Bio::Log::CLI.trace('debug') + options.debug = true + end + + } opts.parse!(ARGV) +Bio::Log::CLI.configure('bio-gff3') + gfftype = ARGV.shift fastafn = nil ARGV.each do | fn | @@ -175,18 +225,25 @@ fastafn = fn next end opts = {} - opts[:validate] = options.validate + opts[:validate] = options.validate + opts[:parser] = options.parser opts[:cache_components] = options.cache opts[:cache_records] = options.cache - opts[:fasta_filename] = fastafn if fastafn - opts[:fix_wormbase] = options.fix_wormbase - opts[:fix] = options.fix - gffdb = Bio::GFFbrowser::GFFdb.new(fn,opts) - gff = gffdb.assembler + opts[:fasta_filename] = fastafn if fastafn + opts[:fix_wormbase] = options.fix_wormbase + opts[:fix] = options.fix + opts[:no_assemble] = options.no_assemble + opts[:phase] = options.phase + opts[:debug] = options.debug + + gff3 = Bio::GFFbrowser::GFF3.new(fn,opts) + + gff = gff3.assembler writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate) + case gfftype.downcase when 'gene' gff.each_gene_seq do | id, seq | writer.put(id,seq) end