lib/bio/db/kegg/genes.rb in bio-1.3.1 vs lib/bio/db/kegg/genes.rb in bio-1.4.0

- old
+ new

@@ -3,11 +3,11 @@ # # Copyright:: Copyright (C) 2001, 2002, 2006 # Toshiaki Katayama <k@bioruby.org> # License:: The Ruby License # -# $Id: genes.rb,v 0.26 2007/12/14 16:20:38 k Exp $ +# $Id:$ # # # == KEGG GENES parser # # See http://www.genome.jp/kegg/genes.html @@ -36,11 +36,11 @@ # p entry.definition # => String # p entry.eclinks # => Array # # # PATHWAY # p entry.pathway # => String -# p entry.pathways # => Array +# p entry.pathways # => Hash # # # POSITION # p entry.position # => String # p entry.chromosome # => String # p entry.gbposition # => String @@ -50,10 +50,13 @@ # p entry.motif # => Hash of Array # # # DBLINKS # p entry.dblinks # => Hash of Array # +# # STRUCTURE +# p entry.structure # => Array +# # # CODON_USAGE # p entry.codon_usage # => Hash # p entry.cu_list # => Array # # # AASEQ @@ -71,22 +74,57 @@ autoload :KEGGDB, 'bio/db' autoload :Locations, 'bio/location' autoload :Sequence, 'bio/sequence' + require 'bio/db/kegg/common' + class KEGG +# == Description +# +# KEGG GENES entry parser. +# +# == References +# +# * http://www.genome.jp/kegg/genes.html +# class GENES < KEGGDB DELIMITER = RS = "\n///\n" TAGSIZE = 12 + include Common::DblinksAsHash + # Returns a Hash of the DB name and an Array of entry IDs in DBLINKS field. + def dblinks_as_hash; super; end if false #dummy for RDoc + alias dblinks dblinks_as_hash + + include Common::PathwaysAsHash + # Returns a Hash of the pathway ID and name in PATHWAY field. + def pathways_as_hash; super; end if false #dummy for RDoc + alias pathways pathways_as_hash + + include Common::OrthologsAsHash + # Returns a Hash of the orthology ID and definition in ORTHOLOGY field. + def orthologs_as_hash; super; end if false #dummy for RDoc + alias orthologs orthologs_as_hash + + # Creates a new Bio::KEGG::GENES object. + # --- + # *Arguments*: + # * (required) _entry_: (String) single entry as a string + # *Returns*:: Bio::KEGG::GENES object def initialize(entry) super(entry, TAGSIZE) end - + # Returns the "ENTRY" line content as a Hash. + # For example, + # {"organism"=>"E.coli", "division"=>"CDS", "id"=>"b0356"} + # + # --- + # *Returns*:: Hash def entry unless @data['ENTRY'] hash = Hash.new('') if get('ENTRY').length > 30 e = get('ENTRY') @@ -97,84 +135,135 @@ @data['ENTRY'] = hash end @data['ENTRY'] end + # ID of the entry, described in the ENTRY line. + # --- + # *Returns*:: String def entry_id entry['id'] end + # Division of the entry, described in the ENTRY line. + # --- + # *Returns*:: String def division entry['division'] # CDS, tRNA etc. end + # Organism name of the entry, described in the ENTRY line. + # --- + # *Returns*:: String def organism entry['organism'] # H.sapiens etc. end + # Returns the NAME line. + # --- + # *Returns*:: String def name field_fetch('NAME') end + # Names of the entry as an Array, described in the NAME line. + # + # --- + # *Returns*:: Array containing String def genes name.split(', ') end + # Returns the first gene name described in the NAME line. + # --- + # *Returns*:: String def gene genes.first end + # Definition of the entry, described in the DEFINITION line. + # --- + # *Returns*:: String def definition field_fetch('DEFINITION') end + # Enzyme's EC numbers shown in the DEFINITION line. + # --- + # *Returns*:: Array containing String def eclinks ec_list = definition.slice(/\[EC:(.*?)\]/, 1) if ec_list ec_list.strip.split(/\s+/) else [] end end - def orthologs + # Orthologs described in the ORTHOLOGY lines. + # --- + # *Returns*:: Array containing String + def orthologs_as_strings lines_fetch('ORTHOLOGY') end + # Returns the PATHWAY lines as a String. + # --- + # *Returns*:: String def pathway field_fetch('PATHWAY') end - def pathways - pathway.scan(/\[PATH:(.*?)\]/).flatten + # Pathways described in the PATHWAY lines. + # --- + # *Returns*:: Array containing String + def pathways_as_strings + lines_fetch('PATHWAY') end + # The position in the genome described in the POSITION line. + # --- + # *Returns*:: String def position unless @data['POSITION'] @data['POSITION'] = fetch('POSITION').gsub(/\s/, '') end @data['POSITION'] end + # Chromosome described in the POSITION line. + # --- + # *Returns*:: String or nil def chromosome if position[/:/] position.sub(/:.*/, '') elsif ! position[/\.\./] position else nil end end + # The position in the genome described in the POSITION line + # as GenBank feature table location formatted string. + # --- + # *Returns*:: String def gbposition position.sub(/.*?:/, '') end + # The position in the genome described in the POSITION line + # as Bio::Locations object. + # --- + # *Returns*:: Bio::Locations object def locations Bio::Locations.new(gbposition) end + # Motif information described in the MOTIF lines. + # --- + # *Returns*:: Hash def motif unless @data['MOTIF'] hash = {} db = nil lines_fetch('MOTIF').each do |line| @@ -189,22 +278,31 @@ @data['MOTIF'] = hash end @data['MOTIF'] # Hash of Array of IDs in MOTIF end - def dblinks - unless @data['DBLINKS'] - hash = {} - get('DBLINKS').scan(/(\S+):\s*(.*)\n?/).each do |db, str| - id_array = str.strip.split(/\s+/) - hash[db] = id_array - end - @data['DBLINKS'] = hash + # Links to other databases described in the DBLINKS lines. + # --- + # *Returns*:: Array containing String objects + def dblinks_as_strings + lines_fetch('DBLINKS') + end + + # Returns structure ID information described in the STRUCTURE lines. + # --- + # *Returns*:: Array containing String + def structure + unless @data['STRUCTURE'] + @data['STRUCTURE'] = fetch('STRUCTURE').sub(/(PDB: )*/,'').split(/\s+/) end - @data['DBLINKS'] # Hash of Array of IDs in DBLINKS + @data['STRUCTURE'] # ['PDB:1A9X', ...] end + alias structures structure + # Codon usage data described in the CODON_USAGE lines. + # --- + # *Returns*:: Hash def codon_usage(codon = nil) unless @data['CODON_USAGE'] hash = Hash.new list = cu_list base = %w(t c a g) @@ -218,38 +316,53 @@ @data['CODON_USAGE'] = hash end @data['CODON_USAGE'] end + # Codon usage data described in the CODON_USAGE lines as an array. + # --- + # *Returns*:: Array def cu_list ary = [] get('CODON_USAGE').sub(/.*/,'').each_line do |line| # cut 1st line line.chomp.sub(/^.{11}/, '').scan(/..../) do |cu| ary.push(cu.to_i) end end return ary end + # Returns amino acid sequence described in the AASEQ lines. + # --- + # *Returns*:: Bio::Sequence::AA object def aaseq unless @data['AASEQ'] @data['AASEQ'] = Bio::Sequence::AA.new(fetch('AASEQ').gsub(/\d+/, '')) end @data['AASEQ'] end + # Returns length of the amino acid sequence described in the AASEQ lines. + # --- + # *Returns*:: Integer def aalen fetch('AASEQ')[/\d+/].to_i end + # Returns nucleic acid sequence described in the NTSEQ lines. + # --- + # *Returns*:: Bio::Sequence::NA object def ntseq unless @data['NTSEQ'] @data['NTSEQ'] = Bio::Sequence::NA.new(fetch('NTSEQ').gsub(/\d+/, '')) end @data['NTSEQ'] end alias naseq ntseq + # Returns nucleic acid sequence length. + # --- + # *Returns*:: Integer def ntlen fetch('NTSEQ')[/\d+/].to_i end alias nalen ntlen