bin/gff3-fetch in bio-gff3-0.8.0 vs bin/gff3-fetch in bio-gff3-0.8.2
- old
+ new
@@ -4,99 +4,197 @@
# Copyright:: August 2010
# License:: Ruby License
#
# Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
-
USAGE = <<EOM
- Fetch and assemble mRNAs, or CDS and print in FASTA format.
- gff3-fetch [--no-cache] mRNA|CDS [filename.fa] filename.gff
+ Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) and print in FASTA format.
- Where:
+ gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3
- --no-cache : do not load everything in memory (slower)
+ Where (NYI == Not Yet Implemented):
+
+ --translate : output as amino acid sequence
+ --validate : validate GFF3 file by translating
+ --no-assemble : output each record as a sequence -- NYI
+ --add-phase : output records using phase (useful with --no-assemble CDS to AA) -- NYI
+ --fix : check 3-frame translation and fix, if possible -- NYI
+
+ type is any valid type in the GFF3 definition. For example:
+
mRNA : assemble mRNA
CDS : assemble CDS
+ exon : list all exons
+ gene/ORF : list gene ORFs -- NYI
+ other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
- Multiple GFF3 files can be used. For external FASTA files, always the last
- one before the GFF file is used.
+ and the following performance options:
+ --cache full : load all in RAM (fast)
+ --cache none : do not load anything in memory (slow)
+ --low-mem : use LRU cache (limit RAM use, fast) -- NYI
+ --max-cpus num : use num threads -- NYI
+ --emboss : use EMBOSS translation (fast) -- NYI
+
+ Multiple GFF3 files can be used. Each GFF3 file may be preceded by at most one
+ external FASTA file, which is used for that GFF3 file only.
+
+ Note that the above switches are only partially implemented at this stage. Full
+ feature support is projected for Feb. 2011.
+
Examples:
- Find mRNA and CDS information from test.gff3 (which includes sequence information)
+ Assemble mRNA and CDS information from test.gff3 (which includes sequence information)
gff3-fetch mRNA test/data/gff/test.gff3
gff3-fetch CDS test/data/gff/test.gff3
- Find CDS from external FASTA file
+ Find CDS records from external FASTA file, adding phase and translating to protein sequence
- gff3-fetch CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
+ gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
Find mRNA from external FASTA file, without loading everything in RAM
- gff3-fetch --no-cache mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
+ gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
+ gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
- If you use this software, please cite http://dx.doi.org/10.1093/bioinformatics/btq475
+ Validate GFF3 file using EMBOSS translation and validation
+ gff3-fetch --cache none --validate --emboss mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
+
+ Find GENEID predicted terminal exons -- NYI
+
+ gff3-fetch terminal chromosome1.fa geneid.gff3
+
+== Performance
+
+time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa
+
+ Cache real user sys
+ ----------------------------------------------------
+ full 12m41s 12m28s 0m09s (0.8.0 Jan. 2011)
+ none 504m39s 477m49s 26m50s (0.8.0 Jan. 2011)
+ ----------------------------------------------------
+
+where
+
+ 52M m_hapla.WS217.dna.fa
+ 456M m_hapla.WS217.gff3
+
+ruby 1.9.2p136 (2010-12-25 revision 30365) [x86_64-linux]
+on an 8 CPU, 2.6 GHz (6MB cache), 16 GB RAM machine.
+
+== Cite
+
+ If you use this software, please cite
+
+ http://dx.doi.org/10.1093/bioinformatics/btq475
+
== Copyright
Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
EOM
-rootpath = File.dirname(File.dirname(__FILE__))
-$: << rootpath+'/lib'
-$: << rootpath+'/../bioruby/lib'
+SEP = File::SEPARATOR
+rootpath = File.dirname(File.dirname(__FILE__))+SEP
+$: << rootpath+'lib'
+GFF3_VERSION = File.new(rootpath+'VERSION').read.chomp
require 'bio-gff3'
+require 'optparse'
+require 'ostruct'
+require 'bio/output/gfffastawriter'
-$stderr.print "BioRuby GFF3 Plugin Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
+$stderr.print "BioRuby GFF3 Plugin "+GFF3_VERSION+" Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>\n\n"
if ARGV.size == 0
print USAGE
end
+options = OpenStruct.new()
+opts = OptionParser.new() { |opts|
+ opts.on_tail("-h", "--help", "Show help and examples") {
+ print(opts)
+ print USAGE
+ exit()
+ }
+
+ opts.on("--cache [none,full]", String, "Caching (default full)") do |cache|
+ case cache.downcase
+ when 'none'
+ options.cache = :cache_none
+ # when 'lru'
+ # options.cache = :cache_lru
+ when 'full'
+ options.cache = :cache_full
+ else
+ raise 'Unknown --cache option'
+ end
+ end
+
+ opts.on("--translate", "output as amino acid sequence") do |b|
+ options.translate = b
+ end
+
+ opts.on("--validate", "validate GFF3 file by translating") do |v|
+ options.validate = v
+ $stop_on_error = true # replace global in near future
+ end
+
+ # opts.on("-q", "--quiet", "Run quietly") do |q|
+ # options.quiet = q
+ # end
+
+ # opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+ # options.verbose = v
+ # end
+
+ # opts.on("-t", "--[no-]trace", "Debug") do |t|
+ # options.trace = t
+ # end
+}
+opts.parse!(ARGV)
+
gfftype = ARGV.shift
-caching = true
-if gfftype == "--no-cache"
- caching = false
- gfftype = ARGV.shift
-end
+
raise "Unknown GFF type '#{gfftype}'" if gfftype !~ /mrna|cds|exon/i
fastafn = nil
ARGV.each do | fn |
- if File.extname(fn) =~ /fa|fas|fasta/i
+ if File.extname(fn) =~ /fa|fas|fasta$/i
+ raise "Only one fasta file allowed per GFF3" if fastafn != nil
fastafn = fn
next
end
- options = {:validate => false}
- options = {:validate => false, :cache_components => :cache_none, :cache_records => :cache_none} if caching == false
- options[:fasta_filename] = fastafn if fastafn
- gffdb = Bio::GFFbrowser::GFFdb.new(fn,options)
+ opts = {}
+ opts[:validate] = options.validate
+ opts[:cache_components] = options.cache
+ opts[:cache_records] = options.cache
+ opts[:fasta_filename] = fastafn if fastafn
+ gffdb = Bio::GFFbrowser::GFFdb.new(fn,opts)
gff = gffdb.assembler
+ writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
case gfftype.downcase
when 'mrna'
gff.each_mRNA_seq do | id, seq |
- puts ">"+id
- puts seq
+ writer.put(id,seq)
end
when 'exon'
gff.each_exon_seq do | id, seq |
- puts ">"+id
- puts seq
+ writer.put(id,seq)
end
when 'cds'
gff.each_CDS_seq do | id, seq |
- puts ">"+id
- puts seq
+ writer.put(id,seq)
end
else
raise "Unknown action <#{gfftype}>"
end
+ fastafn = nil
end