genes.rb in bio-1.4.0

- old
+ new

@@ -3,11 +3,11 @@
 #
 # Copyright::   Copyright (C) 2001, 2002, 2006
 #               Toshiaki Katayama <k@bioruby.org>
 # License::     The Ruby License
 #
-# $Id: genes.rb,v 0.26 2007/12/14 16:20:38 k Exp $
+# $Id:$
 #
 #
 # == KEGG GENES parser
 #
 # See http://www.genome.jp/kegg/genes.html
@@ -36,11 +36,11 @@
 #  p entry.definition  # => String
 #  p entry.eclinks     # => Array
 # 
 #  # PATHWAY
 #  p entry.pathway     # => String
-#  p entry.pathways    # => Array
+#  p entry.pathways    # => Hash
 # 
 #  # POSITION
 #  p entry.position    # => String
 #  p entry.chromosome  # => String
 #  p entry.gbposition  # => String
@@ -50,10 +50,13 @@
 #  p entry.motif       # => Hash of Array
 #
 #  # DBLINKS
 #  p entry.dblinks     # => Hash of Array
 # 
+#  # STRUCTURE
+#  p entry.structure   # => Array
+#
 #  # CODON_USAGE
 #  p entry.codon_usage # => Hash
 #  p entry.cu_list     # => Array
 # 
 #  # AASEQ
@@ -71,22 +74,57 @@
 
   autoload :KEGGDB,    'bio/db'
   autoload :Locations, 'bio/location'
   autoload :Sequence,  'bio/sequence'
 
+  require 'bio/db/kegg/common'
+
 class KEGG
 
+# == Description
+#
+# KEGG GENES entry parser.
+#
+# == References
+#
+# * http://www.genome.jp/kegg/genes.html
+#
 class GENES < KEGGDB
 
   DELIMITER	= RS = "\n///\n"
   TAGSIZE	= 12
 
+  include Common::DblinksAsHash
+  # Returns a Hash of the DB name and an Array of entry IDs in the DBLINKS field.
+  def dblinks_as_hash; super; end if false #dummy for RDoc
+  alias dblinks dblinks_as_hash
+
+  include Common::PathwaysAsHash
+  # Returns a Hash of the pathway ID and name in the PATHWAY field.
+  def pathways_as_hash; super; end if false #dummy for RDoc
+  alias pathways pathways_as_hash
+
+  include Common::OrthologsAsHash
+  # Returns a Hash of the orthology ID and definition in the ORTHOLOGY field.
+  def orthologs_as_hash; super; end if false #dummy for RDoc
+  alias orthologs orthologs_as_hash
+
+  # Creates a new Bio::KEGG::GENES object.
+  # ---
+  # *Arguments*:
+  # * (required) _entry_: (String) single entry as a string
+  # *Returns*:: Bio::KEGG::GENES object
   def initialize(entry)
     super(entry, TAGSIZE)
   end
 
-
+  # Returns the "ENTRY" line content as a Hash.
+  # For example:
+  #   {"organism"=>"E.coli", "division"=>"CDS", "id"=>"b0356"}
+  #
+  # ---
+  # *Returns*:: Hash
   def entry
     unless @data['ENTRY']
       hash = Hash.new('')
       if get('ENTRY').length > 30
         e = get('ENTRY')
@@ -97,84 +135,135 @@
       @data['ENTRY'] = hash
     end
     @data['ENTRY']
   end
 
+  # ID of the entry, described in the ENTRY line.
+  # ---
+  # *Returns*:: String
   def entry_id
     entry['id']
   end
 
+  # Division of the entry, described in the ENTRY line.
+  # ---
+  # *Returns*:: String
   def division
     entry['division']			# CDS, tRNA etc.
   end
 
+  # Organism name of the entry, described in the ENTRY line.
+  # ---
+  # *Returns*:: String
   def organism
     entry['organism']			# H.sapiens etc.
   end
 
+  # Returns the NAME line.
+  # ---
+  # *Returns*:: String
   def name
     field_fetch('NAME')
   end
 
+  # Names of the entry as an Array, described in the NAME line.
+  #
+  # ---
+  # *Returns*:: Array containing String
   def genes
     name.split(', ')
   end
 
+  # Returns the first gene name described in the NAME line.
+  # ---
+  # *Returns*:: String
   def gene
     genes.first
   end
 
+  # Definition of the entry, described in the DEFINITION line.
+  # ---
+  # *Returns*:: String
   def definition
     field_fetch('DEFINITION')
   end
 
+  # Enzyme's EC numbers shown in the "[EC:...]" part of the DEFINITION line.
+  # ---
+  # *Returns*:: Array containing String (empty if there is no "[EC:...]" part)
   def eclinks
     ec_list = definition.slice(/\[EC:(.*?)\]/, 1)
     if ec_list
       ec_list.strip.split(/\s+/)
     else
       []
     end
   end
 
-  def orthologs
+  # Orthologs described in the ORTHOLOGY lines.
+  # ---
+  # *Returns*:: Array containing String
+  def orthologs_as_strings
     lines_fetch('ORTHOLOGY')
   end
 
+  # Returns the PATHWAY lines as a String.
+  # ---
+  # *Returns*:: String
   def pathway
     field_fetch('PATHWAY')
   end
 
-  def pathways
-    pathway.scan(/\[PATH:(.*?)\]/).flatten
+  # Pathways described in the PATHWAY lines.
+  # ---
+  # *Returns*:: Array containing String
+  def pathways_as_strings
+    lines_fetch('PATHWAY')
   end
 
+  # The position in the genome described in the POSITION line.
+  # ---
+  # *Returns*:: String
   def position
     unless @data['POSITION']
       @data['POSITION'] = fetch('POSITION').gsub(/\s/, '')
     end
     @data['POSITION']
   end
 
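+  # Chromosome described in the POSITION line: the part before ":", or the whole value when it has neither ":" nor "..".
+  # ---
+  # *Returns*:: String, or nil when POSITION contains ".." but no ":"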
   def chromosome
     if position[/:/]
       position.sub(/:.*/, '')
     elsif ! position[/\.\./]
       position
     else
       nil
     end
   end
 
+  # The position in the genome described in the POSITION line
+  # as GenBank feature table location formatted string.
+  # ---
+  # *Returns*:: String
   def gbposition
     position.sub(/.*?:/, '')
   end
 
+  # The position in the genome described in the POSITION line
+  # as Bio::Locations object.
+  # ---
+  # *Returns*:: Bio::Locations object
   def locations
     Bio::Locations.new(gbposition)
   end
 
+  # Motif information described in the MOTIF lines.
+  # ---
+  # *Returns*:: Hash of Array
   def motif
     unless @data['MOTIF']
       hash = {}
       db = nil
       lines_fetch('MOTIF').each do |line|
@@ -189,22 +278,31 @@
       @data['MOTIF'] = hash
     end
     @data['MOTIF']		# Hash of Array of IDs in MOTIF
   end
 
-  def dblinks
-    unless @data['DBLINKS']
-      hash = {}
-      get('DBLINKS').scan(/(\S+):\s*(.*)\n?/).each do |db, str|
-        id_array = str.strip.split(/\s+/)
-        hash[db] = id_array
-      end
-      @data['DBLINKS'] = hash
+  # Links to other databases described in the DBLINKS lines.
+  # ---
+  # *Returns*:: Array containing String objects
+  def dblinks_as_strings
+    lines_fetch('DBLINKS')
+  end
+
+  # Returns structure ID information described in the STRUCTURE lines.
+  # ---
+  # *Returns*:: Array containing String
+  def structure
+    unless @data['STRUCTURE']
+      @data['STRUCTURE'] = fetch('STRUCTURE').sub(/(PDB: )*/,'').split(/\s+/)
     end
-    @data['DBLINKS']		# Hash of Array of IDs in DBLINKS
+    @data['STRUCTURE'] # ['1A9X', ...] (any leading "PDB: " label was stripped by the sub above)
   end
+  alias structures structure
 
+  # Codon usage data described in the CODON_USAGE lines.
+  # ---
+  # *Returns*:: Hash
   def codon_usage(codon = nil)
     unless @data['CODON_USAGE']
       hash = Hash.new
       list = cu_list
       base = %w(t c a g)
@@ -218,38 +316,53 @@
       @data['CODON_USAGE'] = hash
     end
     @data['CODON_USAGE']
   end
 
+  # Codon usage data described in the CODON_USAGE lines as an array.
+  # ---
+  # *Returns*:: Array containing Integer
   def cu_list
     ary = []
     get('CODON_USAGE').sub(/.*/,'').each_line do |line|	# cut 1st line
       line.chomp.sub(/^.{11}/, '').scan(/..../) do |cu|
         ary.push(cu.to_i)
       end
     end
     return ary
   end
 
+  # Returns amino acid sequence described in the AASEQ lines.
+  # ---
+  # *Returns*:: Bio::Sequence::AA object
   def aaseq
     unless @data['AASEQ']
       @data['AASEQ'] = Bio::Sequence::AA.new(fetch('AASEQ').gsub(/\d+/, ''))
     end
     @data['AASEQ']
   end
 
+  # Returns the length of the amino acid sequence, read from the first number in the AASEQ lines.
+  # ---
+  # *Returns*:: Integer
   def aalen
     fetch('AASEQ')[/\d+/].to_i
   end
 
+  # Returns nucleic acid sequence described in the NTSEQ lines.
+  # ---
+  # *Returns*:: Bio::Sequence::NA object
   def ntseq
     unless @data['NTSEQ']
       @data['NTSEQ'] = Bio::Sequence::NA.new(fetch('NTSEQ').gsub(/\d+/, ''))
     end
     @data['NTSEQ']
   end
   alias naseq ntseq
 
+  # Returns the length of the nucleic acid sequence, read from the first number in the NTSEQ lines.
+  # ---
+  # *Returns*:: Integer
   def ntlen
     fetch('NTSEQ')[/\d+/].to_i
   end
   alias nalen ntlen