# # = bio/db/go.rb - Classes for Gene Ontology # # Copyright:: Copyright (C) 2003 Mitsuteru C. Nakao # License:: # # $Id: go.rb,v 1.9 2005/10/31 18:32:36 nakao Exp $ # # == Gene Ontology # # == Example # # == References #-- # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # #++ # require 'bio/pathway' module Bio # = Bio::GO # Classes for Gene Ontology http://www.geneontology.org class GO # = Bio::GO::Ontology # # Container class for ontologies in the DAG Edit format. # # == Example # # c_data = File.open('component.oontology').read # go_c = Bio::GO::Ontology.new(c_data) # p go_c.bfs_shortest_path('0003673','0005632') class Ontology < Bio::Pathway # Bio::GO::Ontology.parse_ogids(line) # # Parsing GOID line in the DAGEdit format # GO:ID[ ; GO:ID...] def self.parse_goids(line) goids = [] loop { if /^ *[$%<]\S.+?;/ =~ line endpoint = line.index(';') + 1 line = line[endpoint..line.size] elsif /^,* GO:(\d{7}),*/ =~ line goids << $1.clone endpoint = line.index(goids.last) + goids.last.size line = line[endpoint..line.size] else break end } return goids end # Returns a Hash instance of the header lines in ontology flatfile. attr_reader :header_lines # attr_reader :id2term # attr_reader :id2id # Bio::GO::Ontology.new(str) # The DAG Edit format ontology data parser. def initialize(str) @id2term = {} @header_lines = {} @id2id = {} adj_list = dag_edit_format_parser(str) super(adj_list) end # Returns a GO_Term correspondig with the given GO_ID. def goid2term(goid) term = id2term[goid] term = id2term[id2id[goid]] if term == nil return term end private # constructing adjaency list for the given ontology def dag_edit_format_parser(str) stack = [] adj_list = [] str.each {|line| if /^!(.+?):\s+(\S.+)$/ =~ line # Parsing head lines tag = $1 value = $2 tag.gsub!(/-/,'_') next if tag == 'type' instance_eval("@header_lines['#{tag}'] = '#{value}'") next end case line when /^( *)([$<%])(.+?) ; GO:(\d{7})(\n*)/ # GO Term ; GO:ID depth = $1.length.to_i rel = $2 term = $3 goid1 = goid = $4 en = $5 goids = parse_goids(line) # GO:ID[ ; GO:ID...] synonyms = parse_synonyms(line) # synonym:Term[ ; synonym:Term...] stack[depth] = goids.first @id2term[goid] = term next if depth == 0 goids.each {|goid| @id2term[goid] = term @id2id[goid] = goids.first adj_list << Bio::Relation.new(stack[depth - 1], goid, rel) } if en == "" loop { case line when /^\n$/ break when /^ *([<%]) (.+?) ; GO:(\d{7})/ # <%GO Term ; GO:ID rel1 = $1 term1 = $2 goid1 = $3 goids1 = parse_goids(line) synonyms1 = parse_synonyms(line) @id2term[goid1] = term1 goids.each {|goid| adj_list << Bio::Relation.new(goid1, goid, rel1) } else break end } end end } return adj_list end # Returns an ary of GO IDs by parsing an entry line in the DAG Edit # format. def parse_goids(line) Ontology.parse_goids(line) end # Bio::GO::Ontology#parse_synonyms(line) def parse_synonyms(line) synonyms = [] loop { if / ; synonym:(\S.+?) *[;<%\n]/ =~ line synonyms << $1.clone endpoint = line.index(synonyms.last) + synonyms.last.size line = line[endpoint..line.size] else break end } return synonyms end end # class Ontology # = Bio::GO::GeneAssociation # $CVSROOT/go/gene-associations/gene_association.* # # Data parser for the gene_association go annotation. # See also the file format http://www.geneontology.org/doc/GO.annotation.html#file # # == Example # # mgi_data = File.open('gene_association.mgi').read # mgi = Bio::GO::GeneAssociation.parser(mgi_data) # # Bio::GO::GeneAssociation.parser(mgi_data) do |entry| # p [entry.entry_id, entry.evidence, entry.goid] # end # class GeneAssociation # < Bio::DB # Delimiter DELIMITER = "\n" # Delimiter RS = DELIMITER # Retruns an Array of parsed gene_association flatfile. # Block is acceptable. def self.parser(str) if block_given? str.each(DELIMITER) {|line| next if /^!/ =~ line yield GeneAssociation.new(line) } else galist = [] str.each(DELIMITER) {|line| next if /^!/ =~ line galist << GeneAssociation.new(line) } return galist end end # Returns DB variable. attr_reader :db # -> aStr # Returns Db_Object_Id variable. Alias to entry_id. attr_reader :db_object_id # -> aStr # Returns Db_Object_Symbol variable. attr_reader :db_object_symbol # Returns Db_Object_Name variable. attr_reader :qualifier # Returns Db_Reference variable. attr_reader :db_reference # -> [] # Retruns Evidence code variable. attr_reader :evidence # Returns the entry is associated with this value. attr_reader :with # -> [] # Returns Aspect valiable. attr_reader :aspect # attr_reader :db_object_name # attr_reader :db_object_synonym # -> [] # Returns Db_Object_Type variable. attr_reader :db_object_type # Returns Taxon variable. attr_reader :taxon # Returns Date variable. attr_reader :date # attr_reader :assigned_by alias entry_id db_object_id # Parsing an entry (in a line) in the gene_association flatfile. def initialize(entry) tmp = entry.chomp.split(/\t/) @db = tmp[0] @db_object_id = tmp[1] @db_object_symbol = tmp[2] @qualifier = tmp[3] # @goid = tmp[4] @db_reference = tmp[5].split(/\|/) # @evidence = tmp[6] @with = tmp[7].split(/\|/) # @aspect = tmp[8] @db_object_name = tmp[9] # @db_object_synonym = tmp[10].split(/\|/) # @db_object_type = tmp[11] @taxon = tmp[12] # taxon:4932 @date = tmp[13] # 20010118 @assigned_by = tmp[14] end # Returns GO_ID in /\d{7}/ format. Giving not nil arg, returns # /GO:\d{7}/ style. # # * Bio::GO::GeneAssociation#goid -> "001234" # * Bio::GO::GeneAssociation#goid(true) -> "GO:001234" def goid(org = nil) if org @goid else @goid.sub('GO:','') end end # Bio::GO::GeneAssociation#to_str -> a line of gene_association file. def to_str return [@db, @db_object_id, @db_object_symbol, @quialifier, @goid, @qualifier.join("|"), @evidence, @with.join("|"), @aspect, @db_object_name, @db_object_synonym.join("|"), @db_object_type, @taxon, @date, @assigned_by].join("\t") end end # class GeneAssociation # = Container class for files in geneontology.org/go/external2go/*2go. # # The line syntax is: # # database: > GO: ; GO: # # == Example # # spkw2go = Bio::GO::External2go.new(File.read("spkw2go")) # spkw2go.size # spkw2go.each do |relation| # relation # -> {:db => "", :db_id => "", :go_term => "", :go_id => ""} # end # spkw2go.dbs # # == SAMPLE # !date: 2005/02/08 18:02:54 # !Mapping of SWISS-PROT KEYWORDS to GO terms. # !Evelyn Camon, SWISS-PROT. # ! # SP_KW:ATP synthesis > GO:ATP biosynthesis ; GO:0006754 # ... # class External2go < Array # Returns aHash of the external2go header information attr_reader :header # Constructor from parsing external2go file. def self.parser(str) e2g = self.new str.each_line do |line| line.chomp! if line =~ /^\!date: (.+)/ e2g.header[:date] = $1 elsif line =~ /^\!(.*)/ e2g.header[:desc] << $1 elsif ary = line.scan(/^(.+?):(.+) > GO:(.+) ; (GO:\d{7})/).first e2g << {:db_id => ary[1], :db => ary[0], :go_term => ary[2], :go_id => ary[3]} else raise("Invalid Format Line: \n #{line.inspect}\n") end end return e2g end # Constructor. # relation := {:db => aStr, :db_id => aStr, :go_term => aStr, :go_id => aStr} def initialize @header = {:date => '', :desc => []} super end # Bio::GO::External2go#set_date(value) def set_date(value) @header[:date] = value end # Bio::GO::External2go#set_desc(ary) def set_desc(ary) @header[:desc] = ary end # Bio::GO::External2go#to_str # Returns the contents in the external2go format. def to_str ["!date: #{@header[:date]}", @header[:desc].map {|e| "!#{e}" }, self.map { |e| [e[:db], ':', e[:db_id], ' > GO:', e[:go_term], ' ; ', e[:go_id]].join } ].join("\n") end # Returns ary of databases. def dbs self.map {|rel| rel[:db] }.uniq end # Returns ary of database IDs. def db_ids self.map {|rel| rel[:db_id] }.uniq end # Returns ary of GO Terms. def go_terms self.map {|rel| rel[:go_term] }.uniq end # Returns ary of GO IDs. def go_ids self.map {|rel| rel[:go_id] }.uniq end end # class External2go end # class GO end # module Bio if __FILE__ == $0 require 'net/http' def wget(url) if /http:\/\/(.+?)\// =~ url host = $1 path = url[(url.index(host) + host.size)..url.size] else raise ArgumentError, "Invalid URL\n#{url}" end result = Net::HTTP.new(host).get(path).body end go_c_url = 'http://www.geneontology.org/ontology/component.ontology' ga_url = 'http://www.geneontology.org/gene-associations/gene_association.sgd.gz' e2g_url = 'http://www.geneontology.org/external2go/spkw2go' puts "\n #==> Bio::GO::Ontology" p go_c_url component_ontology = wget(go_c_url) comp = Bio::GO::Ontology.new(component_ontology) [['0003673', '0005632'], ['0003673', '0005619'], ['0003673', '0004649']].each {|pair| puts p pair p [:pair, pair.map {|i| [comp.id2term[i], comp.goid2term(i)] }] puts "\n #==> comp.bfs_shortest_path(pair[0], pair[1])" p comp.bfs_shortest_path(pair[0], pair[1]) } puts "\n #==> Bio::GO::External2go" p e2g_url spkw2go = Bio::GO::External2go.new(wget(e2g_url)) puts "\n #==> spkw2go.db" p spkw2go.db puts "\n #==> spkw2go[1]" p spkw2go[1] require 'zlib' puts "\n #==> Bio::GO::GeenAssociation" p ga_url ga = Zlib::Inflate.inflate(wget(ga_url)) ga = Bio::GO::GeneAssociation.parser(ga) puts "\n #==> ga.size" p ga.size puts "\n #==> ga[100]" p ga[100] end