bin/gff3-fetch in bio-gff3-0.8.0 vs bin/gff3-fetch in bio-gff3-0.8.2

- old
+ new

@@ -4,99 +4,197 @@ # Copyright:: August 2010 # License:: Ruby License # # Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl> - USAGE = <<EOM - Fetch and assemble mRNAs, or CDS and print in FASTA format. - gff3-fetch [--no-cache] mRNA|CDS [filename.fa] filename.gff + Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) + print in FASTA format. - Where: + gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3 - --no-cache : do not load everything in memory (slower) + Where (NYI == Not Yet Implemented): + + --translate : output as amino acid sequence + --validate : validate GFF3 file by translating + --no-assemble : output each record as a sequence -- NYI + --add-phase : output records using phase (useful w. no-assemble CDS to AA) --NYI + --fix : check 3-frame translation and fix, if possible -- NYI + + type is any valid type in the GFF3 definition. For example: + mRNA : assemble mRNA CDS : assemble CDS + exon : list all exons + gene/ORF : list gene ORFs -- NYI + other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI - Multiple GFF3 files can be used. For external FASTA files, always the last - one before the GFF file is used. + and the following performance options: + --cache full : load all in RAM (fast) + --cache none : do not load anything in memory (slow) + --low-mem : use LRU cache (limit RAM use, fast) -- NYI + --max-cpus num : use num threads -- NYI + --emboss : use EMBOSS translation (fast) -- NYI + + Multiple GFF3 files can be used. With external FASTA files, always the last + one before the GFF3 filename is matched. + + Note that above switches are only partially implemented at this stage. Full + feature support is projected Feb. 2011. + Examples: - Find mRNA and CDS information from test.gff3 (which includes sequence information) + Assemble mRNA and CDS information from test.gff3 (which includes sequence information) gff3-fetch mRNA test/data/gff/test.gff3 gff3-fetch CDS test/data/gff/test.gff3 - Find CDS from external FASTA file + Find CDS records from external FASTA file, adding phase and translate to protein sequence - gff3-fetch CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3 + gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3 Find mRNA from external FASTA file, without loading everything in RAM - gff3-fetch --no-cache mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3 + gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3 + gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3 - If you use this software, please cite http://dx.doi.org/10.1093/bioinformatics/btq475 + Validate GFF3 file using EMBOSS translation and validation + gff3-fetch --cache none --validate --emboss mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3 + + Find GENEID predicted terminal exons + + gff3-fetch terminal chromosome1.fa geneid.gff3 + +== Performance + +time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa + + Cache real user sys + ---------------------------------------------------- + full 12m41s 12m28s 0m09s (0.8.0 Jan. 2011) + none 504m39s 477m49s 26m50s (0.8.0 Jan. 2011) + ---------------------------------------------------- + +where + + 52M m_hapla.WS217.dna.fa + 456M m_hapla.WS217.gff3 + +ruby 1.9.2p136 (2010-12-25 revision 30365) [x86_64-linux] +on an 8 CPU, 2.6 GHz (6MB cache), 16 GB RAM machine. + +== Cite + + If you use this software, please cite + + http://dx.doi.org/10.1093/bioinformatics/btq475 + == Copyright Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl> EOM -rootpath = File.dirname(File.dirname(__FILE__)) -$: << rootpath+'/lib' -$: << rootpath+'/../bioruby/lib' +SEP = File::SEPARATOR +rootpath = File.dirname(File.dirname(__FILE__))+SEP +$: << rootpath+'lib' +GFF3_VERSION = File.new(rootpath+'VERSION').read.chomp require 'bio-gff3' +require 'optparse' +require 'ostruct' +require 'bio/output/gfffastawriter' -$stderr.print "BioRuby GFF3 Plugin Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n" +$stderr.print "BioRuby GFF3 Plugin "+GFF3_VERSION+" Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n" if ARGV.size == 0 print USAGE end +options = OpenStruct.new() +opts = OptionParser.new() { |opts| + opts.on_tail("-h", "--help", "Show help and examples") { + print(opts) + print USAGE + exit() + } + + opts.on("--cache [none,full]", String, "Caching (default full)") do |cache| + case cache.downcase + when 'none' + options.cache = :cache_none + # when 'lru' + # options.cache = :cache_lru + when 'full' + options.cache = :cache_full + else + raise 'Unknown --cache option' + end + end + + opts.on("--translate", "output as amino acid sequence") do |b| + options.translate = b + end + + opts.on("--validate", "validate GFF3 file by translating") do |v| + options.validate = v + $stop_on_error = true # replace global in near future + end + + # opts.on("-q", "--quiet", "Run quietly") do |q| + # options.quiet = q + # end + + # opts.on("-v", "--[no-]verbose", "Run verbosely") do |v| + # options.verbose = v + # end + + # opts.on("-t", "--[no-]trace", "Debug") do |t| + # options.trace = t + # end +} +opts.parse!(ARGV) + gfftype = ARGV.shift -caching = true -if gfftype == "--no-cache" - caching = false - gfftype = ARGV.shift -end + raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /mrna|cds|exon/i fastafn = nil ARGV.each do | fn | - if File.extname(fn) =~ /fa|fas|fasta/i + if File.extname(fn) =~ /fa|fas|fasta$/i + raise "Only one fasta file allowed per GFF3" if fastafn != nil fastafn = fn next end - options = {:validate => false} - options = {:validate => false, :cache_components => :cache_none, :cache_records => :cache_none} if caching == false - options[:fasta_filename] = fastafn if fastafn - gffdb = Bio::GFFbrowser::GFFdb.new(fn,options) + opts = {} + opts[:validate] = options.validate + opts[:cache_components] = options.cache + opts[:cache_records] = options.cache + opts[:fasta_filename] = fastafn if fastafn + gffdb = Bio::GFFbrowser::GFFdb.new(fn,opts) gff = gffdb.assembler + writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate) case gfftype.downcase when 'mrna' gff.each_mRNA_seq do | id, seq | - puts ">"+id - puts seq + writer.put(id,seq) end when 'exon' gff.each_exon_seq do | id, seq | - puts ">"+id - puts seq + writer.put(id,seq) end when 'cds' gff.each_CDS_seq do | id, seq | - puts ">"+id - puts seq + writer.put(id,seq) end else raise "Unknown action <#{gfftype}>" end + fastafn = nil end