bin/fasta_util in fasta_util-0.2.1 vs bin/fasta_util in fasta_util-0.2.2
- old
+ new
@@ -1,4 +1,88 @@
#!/usr/bin/env ruby
-require 'fasta_util'
+require 'thor'
+require 'bio'
+class FastaUtility < Thor
+ include Thor::Actions
+ Struct.new("Stats", :sum, :l50, :n50, :count, :mean, :median)
+
+ no_tasks do
+ def stats(lengths)
+ lengths = lengths.sort{|a, b| b <=> a}
+ stats = Struct::Stats.new
+
+ temp_sum = 0
+ stats[:sum] = lengths.inject(:+)
+ stats[:l50] = lengths.find{|length| (temp_sum += length) > stats[:sum]/2.0}
+ stats[:n50] = lengths.count{|length| length >= stats[:l50]}
+ stats[:mean] = stats[:sum].to_f/lengths.length
+ stats[:median] = (lengths.length % 2 == 0) ? (lengths[lengths.length/2-1] + lengths[lengths.length/2])/2.0 : lengths[lengths.length/2]
+ stats[:count] = lengths.count
+ return stats
+ end
+
+ def format(stats)
+ output = []
+ buffer_length = stats.members.map{|key| key.length}.max
+ stats.each_pair do |key, value|
+ numtype = value.is_a?(Float) ? "f" : "d"
+ output << " %-#{buffer_length}s: %#{numtype}" % [key.to_s.capitalize, value]
+ end
+ output.join("\n")
+ end
+ end
+
+ desc "filecheck", "Checks to see if a given file exists. Used internally, don't worry about it too much", :hide => true
+ def filecheck(filename)
+ say "The file '#{filename}' doesn't seem to exist!", :red unless File.exists?(filename)
+ end
+
+ desc "lengths", "Print a set of summary statistics for the given fasta file, including L50, N50, sum and count."
+ method_options [:cutoff, '-c'] => 0
+ def lengths(filename)
+ invoke :filecheck
+ lengths = Bio::FlatFile.open(filename).map{|entry| (entry.seq[-1,1] == "*") ? entry.length - 1 : entry.length}
+
+ say "All entries", :green
+ puts format(stats(lengths))
+ if options.cutoff > 0
+ say "Entries with length >= #{options.cutoff}", :green
+ puts format(stats(lengths.find_all{|l| l >= options.cutoff}))
+ end
+ end
+
+ desc "filter FILENAME [options]", "Impose a filter or set of filters on entries in a fasta file."
+ long_desc "Impose a filter or set of filters on entries in a fasta file where each sequence in the file has to pass all of the filters to be printed."
+ method_option :length_cutoff, :aliases => '-l', :type => :numeric, :default => 0, :desc => 'Only entries with length >= cutoff will be returned.'
+ method_option :inverse_match, :aliases => '-v', :type => :boolean, :desc => "Return the inverse of the match after all the other filters have been applied."
+ method_option :defline_grep, :aliases => '-d', :type => :string, :default => '', :desc => "A regular expression, used to search the entry's definition line."
+ def filter(filename)
+ invoke :filecheck
+ Bio::FlatFile.open(filename).each do |entry|
+ passed = true
+ passed &&= (entry.length >= options.length_cutoff)
+ passed &&= (entry.definition.match(Regexp.new(options.defline_grep)))
+ passed = !passed if options.inverse_match
+ puts entry if passed
+ end
+ end
+
+ desc "clean FILENAME [options]", "Clean up a fasta file"
+ method_option :wrap_width, :aliases => '-w', :type => :numeric, :desc => 'Wrap the fasta to N columns'
+ def clean(filename)
+ invoke :filecheck
+ Bio::FlatFile.open(filename).each do |entry|
+ puts entry.to_biosequence.output(:fasta, :header => entry.definition, :width => options.wrap_width)
+ end
+ end
+
+
+ desc "sort FILENAME [options]", "Sorts a fasta file according to criteria"
+ def sort(filename)
+ invoke :filecheck
+ Bio::FlatFile.open(filename).to_a.sort{|a,b| b.length <=> a.length}.each do |entry|
+ puts entry
+ end
+ end
+end
FastaUtility.start