bin/gff3-fetch in bio-gff3-0.8.4 vs bin/gff3-fetch in bio-gff3-0.8.5
- old
+ new
@@ -8,37 +8,40 @@
USAGE = <<EOM
Fetch and assemble GFF3 types (e.g. ORF, mRNA, CDS) and print in FASTA format.
- gff3-fetch [--low-mem] [--validate] type [filename.fa] filename.gff3
+ gff3-fetch [options] type [filename.fa] filename.gff3
- Where (NYI == Not Yet Implemented):
-
- --translate : output as amino acid sequence
- --validate : validate GFF3 file by translating
- --fix : check 3-frame translation and fix, if possible
- --fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
- --no-assemble : output each record as a sequence -- NYI
- --add-phase : output records using phase (useful w. no-assemble CDS to AA) --NYI
+ --translate : output as amino acid sequence
+ --validate : validate GFF3 file by translating
+ --fix : check 3-frame translation and fix, if possible
+ --fix-wormbase : fix 3-frame translation on ORFs named 'gene1'
+ --no-assemble : output each record as a sequence
+ --phase : output records using phase (useful with --no-assemble to translate CDS to amino acids)
type is any valid type in the GFF3 definition. For example:
- mRNA : assemble mRNA
- CDS : assemble CDS
- exon : list all exons
- gene|ORF : list gene ORFs
- other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
+ mRNA : assemble mRNA
+ CDS : assemble CDS
+ exon : list all exons
+ gene|ORF : list gene ORFs
+ other : use any type from GFF3 definition, e.g. 'Terminate' -- NYI
and the following performance options:
- --cache full : load all in RAM (fast)
- --cache none : do not load anything in memory (slow)
- --low-mem : use LRU cache (limit RAM use, fast) -- NYI
- --max-cpus num : use num threads -- NYI
- --emboss : use EMBOSS translation (fast) -- NYI
+ --parser bioruby : use BioRuby GFF3 parser (slow)
+ --parser line : use GFF3 line parser -- in preparation
+ --parser block : use GFF3 block parser (optimistic) -- NYI
+ --cache full : load all in RAM (fast, default)
+ --cache none : do not load anything in memory (slow)
+ --cache lru : use LRU cache (limit RAM use, fast) -- NYI
+ --max-cpus num : use num threads -- NYI
+ --emboss : use EMBOSS translation (fast) -- NYI
+ Note: NYI == Not Yet Implemented.
+
Multiple GFF3 files can be used. When external FASTA files are given, each
GFF3 file is matched with the last FASTA file listed before it.
Note that the above switches are only partially implemented at this stage. Full
feature support is projected for Feb. 2011.
@@ -50,11 +53,11 @@
gff3-fetch mRNA test/data/gff/test.gff3
gff3-fetch CDS test/data/gff/test.gff3
Find CDS records using an external FASTA file, adjusting for phase and translating to protein sequence
- gff3-fetch --no-assemble --add-phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
+ gff3-fetch --no-assemble --phase --translate CDS test/data/gff/MhA1_Contig1133.fa test/data/gff/MhA1_Contig1133.gff3
Find mRNA from external FASTA file, without loading everything in RAM
gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
gff3-fetch --cache none mRNA test/data/gff/test-ext-fasta.fa test/data/gff/test-ext-fasta.gff3
@@ -65,19 +68,29 @@
Find GENEID predicted terminal exons
gff3-fetch terminal chromosome1.fa geneid.gff3
+ Fine tuning output - show errors only
+
+ gff3-fetch mRNA test/data/gff/test.gff3 --trace ERROR
+
+ Fine tuning output - write log messages to file.log
+
+ gff3-fetch mRNA test/data/gff/test.gff3 --trace ERROR --logger file.log
+
== Performance
-time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 > test.fa
+time gff3-fetch cds m_hapla.WS217.dna.fa m_hapla.WS217.gff3 2> /dev/null > test.fa
- Cache real user sys
+ Cache real user sys
+ ------------------------------------------------------------
+ full,bioruby 12m41 12m28 0m09 (0.8.0)
+ full,line 12m13 12m06 0m07 (0.8.5)
+ none,bioruby 504m39 477m49 26m50 (0.8.0)
+ lru,bioruby ?
----------------------------------------------------
- full 12m41s 12m28s 0m09s (0.8.0 Jan. 2011)
- none 504m39s 477m49s 26m50s (0.8.0 Jan. 2011)
- ----------------------------------------------------
where
52M m_hapla.WS217.dna.fa
456M m_hapla.WS217.gff3
@@ -95,15 +108,15 @@
Copyright (C) 2010,2011 Pjotr Prins <pjotr.prins@thebird.nl>
EOM
-SEP = File::SEPARATOR
-rootpath = File.dirname(File.dirname(__FILE__))+SEP
-$: << rootpath+'lib'
-GFF3_VERSION = File.new(rootpath+'VERSION').read.chomp
+rootpath = File.dirname(File.dirname(__FILE__))
+$: << File.join(rootpath,'lib')
+GFF3_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
+
require 'bio-gff3'
require 'optparse'
require 'ostruct'
require 'bio/output/gfffastawriter'
@@ -111,18 +124,36 @@
if ARGV.size == 0
print USAGE
end
+Bio::Log::CLI.logger('stderr')
+Bio::Log::CLI.trace('info')
+
options = OpenStruct.new()
+
+# ---- Default options
+options.parser = :bioruby
+
opts = OptionParser.new() { |opts|
opts.on_tail("-h", "--help", "Show help and examples") {
print(opts)
print USAGE
exit()
}
+ opts.on("--parser [bioruby,line]", String, "Parser (default bioruby)") do |p|
+ case p.downcase
+ when 'bioruby'
+ options.parser = :bioruby
+ when 'line'
+ options.parser = :line
+ else
+ raise 'Unknown --parser option'
+ end
+ end
+
opts.on("--cache [none,full]", String, "Caching (default full)") do |cache|
case cache.downcase
when 'none'
options.cache = :cache_none
# when 'lru'
@@ -132,41 +163,60 @@
else
raise 'Unknown --cache option'
end
end
+ opts.on("--no-assemble", "output sequences without assembling") do |b|
+ options.no_assemble = true
+ end
+
+ opts.on("--phase", "adjust for phase (useful for CDS --no-assemble --translate)") do |b|
+ options.phase = true
+ end
+
opts.on("--translate", "output as amino acid sequence") do |b|
options.translate = b
end
opts.on("--validate", "validate GFF3 file by translating") do |v|
options.validate = v
- $stop_on_error = true # replace global in near future
end
opts.on("--fix", "Fix frame errors in the GFF3 definition") do |v|
options.fix = true
end
opts.on("--fix-wormbase", "Wormbase fix gene1 frame error") do |v|
options.fix_wormbase = true
end
- # opts.on("-q", "--quiet", "Run quietly") do |q|
- # options.quiet = q
- # end
+ opts.on("--logger filename",String,"Log to file (default stderr)") do | name |
+ Bio::Log::CLI.logger(name)
+ end
- # opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
- # options.verbose = v
- # end
+ opts.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
+ Bio::Log::CLI.trace(s)
+ end
- # opts.on("-t", "--[no-]trace", "Debug") do |t|
- # options.trace = t
- # end
-}
+ opts.on("-q", "--quiet", "Run quietly") do |q|
+ Bio::Log::CLI.trace('error')
+ end
+
+ opts.on("-v", "--verbose", "Run verbosely") do |v|
+ Bio::Log::CLI.trace('info')
+ end
+
+ opts.on("--debug", "Show debug messages") do |v|
+ Bio::Log::CLI.trace('debug')
+ options.debug = true
+ end
+
+ }
opts.parse!(ARGV)
+Bio::Log::CLI.configure('bio-gff3')
+
gfftype = ARGV.shift
fastafn = nil
ARGV.each do | fn |
@@ -175,18 +225,25 @@
fastafn = fn
next
end
opts = {}
- opts[:validate] = options.validate
+ opts[:validate] = options.validate
+ opts[:parser] = options.parser
opts[:cache_components] = options.cache
opts[:cache_records] = options.cache
- opts[:fasta_filename] = fastafn if fastafn
- opts[:fix_wormbase] = options.fix_wormbase
- opts[:fix] = options.fix
- gffdb = Bio::GFFbrowser::GFFdb.new(fn,opts)
- gff = gffdb.assembler
+ opts[:fasta_filename] = fastafn if fastafn
+ opts[:fix_wormbase] = options.fix_wormbase
+ opts[:fix] = options.fix
+ opts[:no_assemble] = options.no_assemble
+ opts[:phase] = options.phase
+ opts[:debug] = options.debug
+
+ gff3 = Bio::GFFbrowser::GFF3.new(fn,opts)
+
+ gff = gff3.assembler
writer = Bio::GFFbrowser::FastaWriter.new(options.translate, options.validate)
+
case gfftype.downcase
when 'gene'
gff.each_gene_seq do | id, seq |
writer.put(id,seq)
end