lib/bio/db/gff.rb in bio-1.0.0 vs lib/bio/db/gff.rb in bio-1.1.0

- old
+ new

@@ -1,69 +1,112 @@ # # = bio/db/gff.rb - GFF format class # # Copyright:: Copyright (C) 2003, 2005 # Toshiaki Katayama <k@bioruby.org> -# License:: LGPL +# 2006 Jan Aerts <jan.aerts@bbsrc.ac.uk> +# License:: The Ruby License # -# $Id: gff.rb,v 1.5 2005/12/18 15:58:41 k Exp $ +# $Id: gff.rb,v 1.9 2007/05/18 15:23:42 k Exp $ # -# == Description -# -# -# == Example -# -# -# == References -# -# * http://www.sanger.ac.uk/Software/formats/GFF/ -# -#-- -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -#++ -# module Bio - +# == DESCRIPTION +# The Bio::GFF and Bio::GFF::Record classes describe data contained in a +# GFF-formatted file. For information on the GFF format, see +# http://www.sanger.ac.uk/Software/formats/GFF/. Data are represented in tab- +# delimited format, including +# * seqname +# * source +# * feature +# * start +# * end +# * score +# * strand +# * frame +# * attributes (optional) +# +# For example: +# SEQ1 EMBL atg 103 105 . + 0 +# SEQ1 EMBL exon 103 172 . + 0 +# SEQ1 EMBL splice5 172 173 . + . +# SEQ1 netgene splice5 172 173 0.94 + . +# SEQ1 genie sp5-20 163 182 2.3 + . +# SEQ1 genie sp5-10 168 177 2.1 + . +# SEQ1 grail ATG 17 19 2.1 - 0 +# +# The Bio::GFF object is a container for Bio::GFF::Record objects, each +# representing a single line in the GFF file. class GFF - - attr_accessor :records - + # Creates a Bio::GFF object by building a collection of Bio::GFF::Record + # objects. + # + # Create a Bio::GFF object the hard way + # this_gff = "SEQ1\tEMBL\tatg\t103\t105\t.\t+\t0\n" + # this_gff << "SEQ1\tEMBL\texon\t103\t172\t.\t+\t0\n" + # this_gff << "SEQ1\tEMBL\tsplice5\t172\t173\t.\t+\t.\n" + # this_gff << "SEQ1\tnetgene\tsplice5\t172\t173\t0.94\t+\t.\n" + # this_gff << "SEQ1\tgenie\tsp5-20\t163\t182\t2.3\t+\t.\n" + # this_gff << "SEQ1\tgenie\tsp5-10\t168\t177\t2.1\t+\t.\n" + # this_gff << "SEQ1\tgrail\tATG\t17\t19\t2.1\t-\t0\n" + # p Bio::GFF.new(this_gff) + # + # or create one based on a GFF-formatted file: + # p Bio::GFF.new(File.open('my_data.gff') + # --- + # *Arguments*: + # * _str_: string in GFF format + # *Returns*:: Bio::GFF object def initialize(str = '') @records = Array.new str.each_line do |line| @records << Record.new(line) end end + # An array of Bio::GFF::Record objects. + attr_accessor :records + + # Represents a single line of a GFF-formatted file. See Bio::GFF for more + # information. class Record + # Name of the reference sequence attr_accessor :seqname + + # Name of the source of the feature (e.g. program that did prediction) attr_accessor :source + + # Name of the feature attr_accessor :feature + + # Start position of feature on reference sequence attr_accessor :start + + # End position of feature on reference sequence attr_accessor :end + + # Score of annotation (e.g. e-value for BLAST search) attr_accessor :score + + # Strand that feature is located on attr_accessor :strand + + # For features of type 'exon': indicates where feature begins in the reading frame attr_accessor :frame + + # List of tag=value pairs (e.g. to store name of the feature: ID=my_id) attr_accessor :attributes + + # Comments for the GFF record attr_accessor :comments + # Creates a Bio::GFF::Record object. Is typically not called directly, but + # is called automatically when creating a Bio::GFF object. + # --- + # *Arguments*: + # * _str_: a tab-delimited line in GFF format def initialize(str) @comments = str.chomp[/#.*/] return if /^#/.match(str) @seqname, @source, @feature, @start, @end, @score, @strand, @frame, attributes, = str.chomp.split("\t") @@ -80,16 +123,34 @@ end return hash end end + # = DESCRIPTION + # Represents version 2 of GFF specification. Is completely implemented by the + # Bio::GFF class. class GFF2 < GFF VERSION = 2 end + # = DESCRIPTION + # Represents version 3 of GFF specification. Is completely implemented by the + # Bio::GFF class. For more information on version GFF3, see + # http://flybase.bio.indiana.edu/annot/gff3.html class GFF3 < GFF VERSION = 3 + + private + + def parse_attributes(attributes) + hash = Hash.new + attributes.split(/[^\\];/).each do |atr| + key, value = atr.split('=', 2) + hash[key] = value + end + return hash + end end end # class GFF end # module Bio @@ -100,7 +161,14 @@ require 'pp' alias p pp rescue LoadError end - p Bio::GFF.new(ARGF.read) + this_gff = "SEQ1\tEMBL\tatg\t103\t105\t.\t+\t0\n" + this_gff << "SEQ1\tEMBL\texon\t103\t172\t.\t+\t0\n" + this_gff << "SEQ1\tEMBL\tsplice5\t172\t173\t.\t+\t.\n" + this_gff << "SEQ1\tnetgene\tsplice5\t172\t173\t0.94\t+\t.\n" + this_gff << "SEQ1\tgenie\tsp5-20\t163\t182\t2.3\t+\t.\n" + this_gff << "SEQ1\tgenie\tsp5-10\t168\t177\t2.1\t+\t.\n" + this_gff << "SEQ1\tgrail\tATG\t17\t19\t2.1\t-\t0\n" + p Bio::GFF.new(this_gff) end