lib/bio/sequence/format.rb in bio-1.2.1 vs lib/bio/sequence/format.rb in bio-1.3.0
- old
+ new
@@ -1,26 +1,26 @@
#
# = bio/sequence/format.rb - various output format of the biological sequence
#
-# Copyright:: Copyright (C) 2006
+# Copyright:: Copyright (C) 2006-2008
# Toshiaki Katayama <k@bioruby.org>,
# Naohisa Goto <ng@bioruby.org>,
-# Ryan Raaum <ryan@raaum.org>
+# Ryan Raaum <ryan@raaum.org>,
+# Jan Aerts <jan.aerts@bbsrc.ac.uk>
# License:: The Ruby License
#
# = TODO
#
# porting from N. Goto's feature-output.rb on BioRuby list.
#
-# $Id: format.rb,v 1.4 2007/04/05 23:35:41 trevor Exp $
+# $Id: format.rb,v 1.4.2.8 2008/06/17 15:50:05 ngoto Exp $
#
+require 'erb'
module Bio
- autoload :Sequence, 'bio/sequence'
-
class Sequence
# = DESCRIPTION
# A Mixin[http://www.rubycentral.com/book/tut_modules.html]
# of methods used by Bio::Sequence#output to output sequences in
@@ -31,151 +31,328 @@
# puts s.output(:fasta)
# puts s.output(:genbank)
# puts s.output(:embl)
module Format
- # INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
- # case, it would be difficult to successfully call this method outside
- # its expected context).
+ # Repository of generic (or both nucleotide and protein) sequence
+ # formatter classes
+ module Formatter
+
+ # Raw format generatar
+ autoload :Raw, 'bio/sequence/format_raw'
+
+ # Fasta format generater
+ autoload :Fasta, 'bio/db/fasta/format_fasta'
+
+ # NCBI-style Fasta format generatar
+ # (resemble to EMBOSS "ncbi" format)
+ autoload :Fasta_ncbi, 'bio/db/fasta/format_fasta'
+
+ end #module Formatter
+
+ # Repository of nucleotide sequence formatter classes
+ module NucFormatter
+
+ # GenBank format generater
+ # Note that the name is 'Genbank' and NOT 'GenBank'
+ autoload :Genbank, 'bio/db/genbank/format_genbank'
+
+ # EMBL format generater
+ # Note that the name is 'Embl' and NOT 'EMBL'
+ autoload :Embl, 'bio/db/embl/format_embl'
+
+ end #module NucFormatter
+
+ # Repository of protein sequence formatter classes
+ module AminoFormatter
+ # currently no formats available
+ end #module AminoFormatter
+
+ # Formatter base class.
+ # Any formatter class should inherit this class.
+ class FormatterBase
+
+ # Returns a formatterd string of the given sequence
+ # ---
+ # *Arguments*:
+ # * (required) _sequence_: Bio::Sequence object
+ # * (optional) _options_: a Hash object
+ # *Returns*:: String object
+ def self.output(sequence, options = {})
+ self.new(sequence, options).output
+ end
+
+ # register new Erb template
+ def self.erb_template(str)
+ erb = ERB.new(str)
+ erb.def_method(self, 'output')
+ true
+ end
+ private_class_method :erb_template
+
+ # generates output data
+ # ---
+ # *Returns*:: String object
+ def output
+ raise NotImplementedError, 'should be implemented in subclass'
+ end
+
+ # creates a new formatter object for output
+ def initialize(sequence, options = {})
+ @sequence = sequence
+ @options = options
+ end
+
+ private
+
+ # any unknown methods are delegated to the sequence object
+ def method_missing(sym, *args, &block) #:nodoc:
+ begin
+ @sequence.__send__(sym, *args, &block)
+ rescue NoMethodError => evar
+ lineno = __LINE__ - 2
+ file = __FILE__
+ bt_here = [ "#{file}:#{lineno}:in \`__send__\'",
+ "#{file}:#{lineno}:in \`method_missing\'"
+ ]
+ if bt_here == evar.backtrace[0, 2] then
+ bt = evar.backtrace[2..-1]
+ evar = evar.class.new("undefined method \`#{sym.to_s}\' for #{self.inspect}")
+ evar.set_backtrace(bt)
+ end
+ raise(evar)
+ end
+ end
+ end #class FormatterBase
+
+ # Using Bio::Sequence::Format, return a String with the Bio::Sequence
+ # object formatted in the given style.
#
- # Output the FASTA format string of the sequence.
+ # Formats currently implemented are: 'fasta', 'genbank', and 'embl'
#
- # UNFORTUNATLY, the current implementation of Bio::Sequence is incapable of
- # using either the header or width arguments. So something needs to be
- # changed...
- #
- # Currently, this method is used in Bio::Sequence#output like so,
- #
# s = Bio::Sequence.new('atgc')
# puts s.output(:fasta) #=> "> \natgc\n"
+ #
+ # The style argument is given as a Ruby
+ # Symbol(http://www.ruby-doc.org/core/classes/Symbol.html)
# ---
- # *Arguments*:
- # * (optional) _header_: String (default nil)
- # * (optional) _width_: Fixnum (default nil)
+ # *Arguments*:
+ # * (required) _format_: :fasta, :genbank, *or* :embl
# *Returns*:: String object
- def format_fasta(header = nil, width = nil)
- header ||= "#{@entry_id} #{@definition}"
+ def output(format = :fasta, options = {})
+ formatter_const = format.to_s.capitalize.intern
- ">#{header}\n" +
- if width
- @seq.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
+ formatter_class = nil
+ get_formatter_repositories.each do |mod|
+ begin
+ formatter_class = mod.const_get(formatter_const)
+ rescue NameError
+ end
+ break if formatter_class
+ end
+ unless formatter_class then
+ raise "unknown format name #{format.inspect}"
+ end
+
+ formatter_class.output(self, options)
+ end
+
+ # Returns a list of available output formats for the sequence
+ # ---
+ # *Arguments*:
+ # *Returns*:: Array of Symbols
+ def list_output_formats
+ a = get_formatter_repositories.collect { |mod| mod.constants }
+ a.flatten!
+ a.collect! { |x| x.to_s.downcase.intern }
+ a
+ end
+
+ private
+
+ # returns formatter repository modules
+ def get_formatter_repositories
+ if self.moltype == Bio::Sequence::NA then
+ [ NucFormatter, Formatter ]
+ elsif self.moltype == Bio::Sequence::AA then
+ [ AminoFormatter, Formatter ]
else
- @seq.to_s + "\n"
+ [ NucFormatter, AminoFormatter, Formatter ]
end
end
+ #---
+
# Not yet implemented :)
# Remove the nodoc command after implementation!
# ---
# *Returns*:: String object
- def format_gff #:nodoc:
- raise NotImplementedError
- end
+ #def format_gff #:nodoc:
+ # raise NotImplementedError
+ #end
+ #+++
+
+# Formatting helper methods for INSD (NCBI, EMBL, DDBJ) feature table
+module INSDFeatureHelper
+ private
+
# INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
# case, it would be difficult to successfully call this method outside
# its expected context).
#
- # Output the Genbank format string of the sequence.
+ # Output the Genbank feature format string of the sequence.
# Used in Bio::Sequence#output.
# ---
# *Returns*:: String object
- def format_genbank
+ def format_features_genbank(features)
prefix = ' ' * 5
indent = prefix + ' ' * 16
fwidth = 79 - indent.length
-
- format_features(prefix, indent, fwidth)
+
+ format_features(features, prefix, indent, fwidth)
end
# INTERNAL USE ONLY, YOU SHOULD NOT CALL THIS METHOD. (And in any
# case, it would be difficult to successfully call this method outside
# its expected context).
#
- # Output the EMBL format string of the sequence.
+ # Output the EMBL feature format string of the sequence.
# Used in Bio::Sequence#output.
# ---
# *Returns*:: String object
- def format_embl
+ def format_features_embl(features)
prefix = 'FT '
indent = prefix + ' ' * 16
fwidth = 80 - indent.length
+
+ format_features(features, prefix, indent, fwidth)
+ end
- format_features(prefix, indent, fwidth)
+ # format INSD featurs
+ def format_features(features, prefix, indent, width)
+ result = []
+ features.each do |feature|
+ result.push format_feature(feature, prefix, indent, width)
+ end
+ return result.join('')
end
+ # format an INSD feature
+ def format_feature(feature, prefix, indent, width)
+ result = prefix + sprintf("%-16s", feature.feature)
- private
+ position = feature.position
+ #position = feature.locations.to_s
- def format_features(prefix, indent, width)
- result = ''
- @features.each do |feature|
- result << prefix + sprintf("%-16s", feature.feature)
-
- position = feature.position
- #position = feature.locations.to_s
-
- head = ''
- wrap(position, width).each_line do |line|
- result << head << line
- head = indent
- end
-
- result << format_qualifiers(feature.qualifiers, width)
- end
+ result << wrap_and_split_lines(position, width).join("\n" + indent)
+ result << "\n"
+ result << format_qualifiers(feature.qualifiers, indent, width)
return result
end
+ # format qualifiers
def format_qualifiers(qualifiers, indent, width)
- qualifiers.each do |qualifier|
+ qualifiers.collect do |qualifier|
q = qualifier.qualifier
v = qualifier.value.to_s
if v == true
- lines = wrap('/' + q, width)
+ lines = wrap_with_newline('/' + q, width)
elsif q == 'translation'
- lines = fold('/' + q + '=' + val, width)
+ lines = fold("/#{q}=\"#{v}\"", width)
else
- if v[/\D/]
+ if v[/\D/] or q == 'chromosome'
#v.delete!("\x00-\x1f\x7f-\xff")
v.gsub!(/"/, '""')
v = '"' + v + '"'
end
- lines = wrap('/' + q + '=' + val, width)
+ lines = wrap_with_newline('/' + q + '=' + v, width)
end
- return lines.gsub(/^/, indent)
- end
+ lines.gsub!(/^/, indent)
+ lines
+ end.join
end
def fold(str, width)
str.gsub(Regexp.new("(.{1,#{width}})"), "\\1\n")
end
- def wrap(str, width)
+ def fold_and_split_lines(str, width)
+ str.scan(Regexp.new(".{1,#{width}}"))
+ end
+
+ def wrap_and_split_lines(str, width)
result = []
- left = str.dup
- while left and left.length > width
- line = nil
- width.downto(1) do |i|
- if left[i..i] == ' ' or /[,;]/ =~ left[(i-1)..(i-1)] then
- line = left[0..(i-1)].sub(/ +\z/, '')
- left = left[i..-1].sub(/\A +/, '')
- break
+ lefts = str.chomp.split(/(?:\r\n|\r|\n)/)
+ lefts.each do |left|
+ left.rstrip!
+ while left and left.length > width
+ line = nil
+ width.downto(1) do |i|
+ if left[i..i] == ' ' or /[\,\;]/ =~ left[(i-1)..(i-1)] then
+ line = left[0..(i-1)].sub(/ +\z/, '')
+ left = left[i..-1].sub(/\A +/, '')
+ break
+ end
end
+ if line.nil? then
+ line = left[0..(width-1)]
+ left = left[width..-1]
+ end
+ result << line
+ left = nil if left.to_s.empty?
end
- if line.nil? then
- line = left[0..(width-1)]
- left = left[width..-1]
- end
- result << line
+ result << left if left
end
- result << left if left
- return result.join("\n")
+ return result
end
-end # Format
+ def wrap_with_newline(str, width)
+ result = wrap_and_split_lines(str, width)
+ result_string = result.join("\n")
+ result_string << "\n" unless result_string.empty?
+ return result_string
+ end
-end # Sequence
+ def wrap(str, width = 80, prefix = '')
+ actual_width = width - prefix.length
+ result = wrap_and_split_lines(str, actual_width)
+ result_string = result.join("\n#{prefix}")
+ result_string = prefix + result_string unless result_string.empty?
+ return result_string
+ end
-end # Bio
+ #--
+ # internal use only
+ MonthStr = [ nil,
+ 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
+ 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
+ ].collect { |x| x.freeze }.freeze
+ #++
+
+ # formats a date from Date, DateTime, or Time object, or String.
+ def format_date(d)
+ begin
+ yy = d.year
+ mm = d.month
+ dd = d.day
+ rescue NoMethodError, NameError, ArgumentError, TypeError
+ return sprintf("%-11s", d)
+ end
+ sprintf("%02d-%-3s-%04d", dd, MonthStr[mm], yy)
+ end
+
+ # null date
+ def null_date
+ Date.new(0, 1, 1)
+ end
+
+end #module INSDFeatureHelper
+
+end #module Format
+
+end #class Sequence
+
+end #module Bio