embl.rb in bio-1.1.0

- old
+ new

@@ -1,13 +1,13 @@
 #
 # = bio/db/embl/embl.rb - EMBL database class
 #
 # 
-# Copyright::   Copyright (C) 2001-2005 Mitsuteru C. Nakao <n@bioruby.org>
-# License::     LGPL
+# Copyright::   Copyright (C) 2001-2007 Mitsuteru C. Nakao <n@bioruby.org>
+# License::     The Ruby License
 #
-# $Id: embl.rb,v 1.26 2006/01/28 06:40:38 nakao Exp $
+# $Id: embl.rb,v 1.29 2007/04/05 23:35:40 trevor Exp $
 #
 # == Description
 #
 # Parser class for EMBL database entry.
 #
@@ -26,28 +26,10 @@
 #   http://www.ebi.ac.uk/embl/
 #
 # * The EMBL Nucleotide Sequence Database: Users Manual
 #   http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html
 #
-#--
-#
-#  This library is free software; you can redistribute it and/or
-#  modify it under the terms of the GNU Lesser General Public
-#  License as published by the Free Software Foundation; either
-#  version 2 of the License, or (at your option) any later version.
-#
-#  This library is distributed in the hope that it will be useful,
-#  but WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-#  Lesser General Public License for more details.
-#
-#  You should have received a copy of the GNU Lesser General Public
-#  License along with this library; if not, write to the Free Software
-#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
-#
-#++
-#
 
 require 'bio/db'
 require 'bio/db/embl/common'
 
 module Bio
@@ -56,11 +38,11 @@
 
   # returns contents in the ID line.
   # * Bio::EMBL#id_line -> <ID Hash>
   # where <ID Hash> is:
   #  {'ENTRY_NAME' => String, 'MOLECULE_TYPE' => String, 'DIVISION' => String,
-  #   'SEQUENCE_LENGTH' => Int}
+  #   'SEQUENCE_LENGTH' => Int, 'SEQUENCE_VERSION' => String}
   #
   # ID Line
   #  "ID  ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; DIVISION; SEQUENCE_LENGTH BP."
   #
   # DATA_CLASS = ['standard']
@@ -85,18 +67,35 @@
   #  SYN (Synthetic)
   #  STS (STSs)
   #  UNC (Unclassified)
   #  VRL (Viruses)
   #
+  # Rel 89-
+  # ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
+  # ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
+  # 1. Primary accession number
+  # 2. Sequence version number
+  # 3. Topology: 'circular' or 'linear'
+  # 4. Molecule type (see note 1 below)
+  # 5. Data class (see section 3.1)
+  # 6. Taxonomic division (see section 3.2)
+  # 7. Sequence length (see note 2 below)
   def id_line(key=nil)
     unless @data['ID']
       tmp = Hash.new
       idline = fetch('ID').split(/; +/)         
-      tmp['ENTRY_NAME'], tmp['DATA_CLASS'] = idline[0].split(/ +/)
-      tmp['MOLECULE_TYPE'] = idline[1]
-      tmp['DIVISION'] = idline[2]
-      tmp['SEQUENCE_LENGTH'] = idline[3].strip.split(' ').first.to_i
+      tmp['ENTRY_NAME'], tmp['DATA_CLASS'] = idline.shift.split(/ +/)
+      if idline.first =~ /^SV/
+        tmp['SEQUENCE_VERSION'] = idline.shift.split(' ').last
+        tmp['TOPOLOGY'] = idline.shift
+        tmp['MOLECULE_TYPE'] = idline.shift
+        tmp['DATA_CLASS'] = idline.shift
+      else
+        tmp['MOLECULE_TYPE'] = idline.shift
+      end
+      tmp['DIVISION'] = idline.shift
+      tmp['SEQUENCE_LENGTH'] = idline.shift.strip.split(' ').first.to_i
 
       @data['ID'] = tmp
     end
     
     if key
@@ -144,14 +143,18 @@
   # * Bio::EMBL#version -> sequence version number in Int
   #
   # SV Line; sequence version (1/entry)
   #  SV    Accession.Version
   def sv
-    field_fetch('SV').sub(/;/,'')
+    if (v = field_fetch('SV').sub(/;/,'')) == ""
+      [id_line['ENTRY_NAME'], id_line['SEQUENCE_VERSION']].join('.') 
+    else
+      v
+    end  
   end
   def version
-    sv.split(".")[1].to_i
+    (sv.split(".")[1] || id_line['SEQUENCE_VERSION']).to_i
   end
 
   
   # returns contents in the date (DT) line.
   # * Bio::EMBL#dt  -> <DT Hash>
@@ -392,63 +395,8 @@
     end
 
     return feature
   end
 
-end
+end # class EMBL
 
-end
-
-
-if __FILE__ == $0
-  while ent = $<.gets(Bio::EMBL::RS)
-    puts "\n ==> e = Bio::EMBL.new(ent) "
-    e = Bio::EMBL.new(ent)
-
-    puts "\n ==> e.entry_id "
-    p e.entry_id
-    puts "\n ==> e.id_line "
-    p e.id_line
-    puts "\n ==> e.id_line('molecule') "
-    p e.id_line('molecule')
-    puts "\n ==> e.molecule "
-    p e.molecule
-    puts "\n ==> e.ac "
-    p e.ac
-    puts "\n ==> e.sv "
-    p e.sv
-    puts "\n ==> e.dt "
-    p e.dt
-    puts "\n ==> e.dt('created') "
-    p e.dt('created')
-    puts "\n ==> e.de "
-    p e.de
-    puts "\n ==> e.kw "
-    p e.kw
-    puts "\n ==> e.os "
-    p e.os
-    puts "\n ==> e.oc "
-    p e.oc
-    puts "\n ==> e.og "
-    p e.og
-    puts "\n ==> e.ref "
-    p e.ref
-    puts "\n ==> e.dr "
-    p e.dr
-    puts "\n ==> e.ft "
-    p e.ft
-    puts "\n ==> e.each_cds {|c| p c}"
-    p e.each_cds {|c| p c }
-    puts "\n ==> e.sq "
-    p e.sq
-    puts "\n ==> e.sq('a') "
-    p e.sq('a')
-    puts "\n ==> e.gc"    
-    p e.gc
-    puts "\n ==> e.seq "
-    p e.seq
-  end
-
-end
-
-
-
+end # module Bio