lib/bio/io/sql.rb in bio-1.0.0 vs lib/bio/io/sql.rb in bio-1.1.0

- old
+ new

@@ -1,26 +1,14 @@ # -# bio/io/sql.rb - BioSQL access module +# = bio/io/sql.rb - BioSQL access module # -# Copyright (C) 2002 KATAYAMA Toshiaki <k@bioruby.org> +# Copyright:: Copyright (C) 2002 Toshiaki Katayama <k@bioruby.org> +# Copyright:: Copyright (C) 2006 Raoul Jean Pierre Bonnal <raoul.bonnal@itb.cnr.it> +# License:: The Ruby License # -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2 of the License, or (at your option) any later version. +# $Id: sql.rb,v 1.8 2007/04/05 23:35:41 trevor Exp $ # -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# $Id: sql.rb,v 1.4 2005/09/26 13:04:28 k Exp $ -# begin require 'dbi' rescue LoadError end @@ -28,299 +16,310 @@ require 'bio/feature' module Bio - class SQL +class SQL - def initialize(db = 'dbi:Mysql:biosql', user = nil, pass = nil) - @dbh = DBI.connect(db, user, pass) - end + def initialize(db = 'dbi:Mysql:biosql', user = nil, pass = nil) + @dbh = DBI.connect(db, user, pass) + end - def close - @dbh.disconnect - end + def close + @dbh.disconnect + end - def fetch(accession) # or display_id for fall back - query = "select * from bioentry where accession = ?" - entry = @dbh.execute(query, accession).fetch - return Sequence.new(@dbh, entry) if entry + # Returns Bio::SQL::Sequence object. + def fetch(accession) # or display_id for fall back + query = "select * from bioentry where accession = ?" + entry = @dbh.execute(query, accession).fetch + return Sequence.new(@dbh, entry) if entry - query = "select * from bioentry where display_id = ?" - entry = @dbh.execute(query, accession).fetch - return Sequence.new(@dbh, entry) if entry - end - alias get_by_id fetch + query = "select * from bioentry where display_id = ?" + entry = @dbh.execute(query, accession).fetch + return Sequence.new(@dbh, entry) if entry + end + alias get_by_id fetch - # for lazy fetching + # for lazy fetching - class Sequence + class Sequence - def initialize(dbh, entry) - @dbh = dbh - @bioentry_id = entry['bioentry_id'] - @database_id = entry['biodatabase_id'] - @entry_id = entry['display_id'] - @accession = entry['accession'] - @version = entry['entry_version'] - @division = entry['division'] - end - attr_reader :accession, :division, :entry_id, :version + def initialize(dbh, entry) + @dbh = dbh + @bioentry_id = entry['bioentry_id'] + @database_id = entry['biodatabase_id'] + @entry_id = entry['display_id'] + @accession = entry['accession'] + @version = entry['entry_version'] + @division = entry['division'] + end + attr_reader :accession, :division, :entry_id, :version - def to_fasta - if seq = seq - return seq.to_fasta(@accession) - end + def to_fasta + if seq = seq + return seq.to_fasta(@accession) end + end - def seq - query = "select * from biosequence where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - return unless row + # Returns Bio::Sequence::NA or AA object. + def seq + query = "select * from biosequence where bioentry_id = ?" + row = @dbh.execute(query, @bioentry_id).fetch + return unless row - mol = row['molecule'] - seq = row['biosequence_str'] + mol = row['alphabet'] + seq = row['seq'] - case mol - when /.na/i # 'dna' or 'rna' - Bio::Sequence::NA.new(seq) - else # 'protein' - Bio::Sequence::AA.new(seq) - end + case mol + when /.na/i # 'dna' or 'rna' + Bio::Sequence::NA.new(seq) + else # 'protein' + Bio::Sequence::AA.new(seq) end + end - def subseq(from, to) - length = to - from + 1 - query = "select molecule, substring(biosequence_str, ?, ?) as subseq" + - " from biosequence where bioentry_id = ?" - row = @dbh.execute(query, from, length, @bioentry_id).fetch - return unless row + # Returns Bio::Sequence::NA or AA object (by lazy fetching). + def subseq(from, to) + length = to - from + 1 + query = "select alphabet, substring(seq, ?, ?) as subseq" + + " from biosequence where bioentry_id = ?" + row = @dbh.execute(query, from, length, @bioentry_id).fetch + return unless row - mol = row['molecule'] - seq = row['subseq'] + mol = row['alphabet'] + seq = row['subseq'] - case mol - when /.na/i # 'dna' or 'rna' - Bio::Sequence::NA.new(seq) - else # 'protein' - Bio::Sequence::AA.new(seq) - end + case mol + when /.na/i # 'dna' or 'rna' + Bio::Sequence::NA.new(seq) + else # 'protein' + Bio::Sequence::AA.new(seq) end + end - def features - array = [] - query = "select * from seqfeature where bioentry_id = ?" - @dbh.execute(query, @bioentry_id).fetch_all.each do |row| - next unless row + # Returns Bio::Features object. + def features + array = [] + query = "select * from seqfeature where bioentry_id = ?" + @dbh.execute(query, @bioentry_id).fetch_all.each do |row| + next unless row - f_id = row['seqfeature_id'] - k_id = row['seqfeature_key_id'] - s_id = row['seqfeature_source_id'] - rank = row['seqfeature_rank'].to_i - 1 + f_id = row['seqfeature_id'] + k_id = row['type_term_id'] + s_id = row['source_term_id'] + rank = row['rank'].to_i - 1 - # key : type (gene, CDS, ...) - type = feature_key(k_id) + # key : type (gene, CDS, ...) + type = feature_key(k_id) - # source : database (EMBL/GenBank/SwissProt) - database = feature_source(s_id) + # source : database (EMBL/GenBank/SwissProt) + database = feature_source(s_id) - # location : position - locations = feature_locations(f_id) + # location : position + locations = feature_locations(f_id) - # qualifier - qualifiers = feature_qualifiers(f_id) - - # rank - array[rank] = Bio::Feature.new(type, locations, qualifiers) - end - return Bio::Features.new(array) + # qualifier + qualifiers = feature_qualifiers(f_id) + + # rank + array[rank] = Bio::Feature.new(type, locations, qualifiers) end + return Bio::Features.new(array) + end - def references - array = [] - query = <<-END - select * from bioentry_reference, reference - where bioentry_id = ? and - bioentry_reference.reference_id = reference.reference_id - END - @dbh.execute(query, @bioentry_id).fetch_all.each do |row| - next unless row + # Returns reference informations in Array of Hash (not Bio::Reference). + def references + array = [] + query = <<-END + select * from bioentry_reference, reference + where bioentry_id = ? and + bioentry_reference.reference_id = reference.reference_id + END + @dbh.execute(query, @bioentry_id).fetch_all.each do |row| + next unless row - hash = { - 'start' => row['reference_start'], - 'end' => row['reference_end'], - 'journal' => row['reference_location'], - 'title' => row['reference_title'], - 'authors' => row['reference_authors'], - 'medline' => row['reference_medline'] - } - hash.default = '' + hash = { + 'start' => row['start_pos'], + 'end' => row['end_pos'], + 'journal' => row['location'], + 'title' => row['title'], + 'authors' => row['authors'], + 'medline' => row['crc'] + } + hash.default = '' - rank = row['reference_rank'].to_i - 1 - array[rank] = hash - end - return array + rank = row['rank'].to_i - 1 + array[rank] = hash end + return array + end - def comment - query = "select * from comment where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? row['comment_text'] : '' - end + # Returns the first comment. For complete comments, use comments method. + def comment + query = "select * from comment where bioentry_id = ?" + row = @dbh.execute(query, @bioentry_id).fetch + row ? row['comment_text'] : '' + end - def comments - array = [] - query = "select * from comment where bioentry_id = ?" - @dbh.execute(query, @bioentry_id).fetch_all.each do |row| - next unless row - rank = row['comment_rank'].to_i - 1 - array[rank] = row['comment_text'] - end - return array + # Returns comments in an Array of Strings. + def comments + array = [] + query = "select * from comment where bioentry_id = ?" + @dbh.execute(query, @bioentry_id).fetch_all.each do |row| + next unless row + rank = row['rank'].to_i - 1 + array[rank] = row['comment_text'] end + return array + end - def database - query = "select * from biodatabase where biodatabase_id = ?" - row = @dbh.execute(query, @database_id).fetch - row ? row['name'] : '' - end + def database + query = "select * from biodatabase where biodatabase_id = ?" + row = @dbh.execute(query, @database_id).fetch + row ? row['name'] : '' + end - def date - query = "select * from bioentry_date where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? row['date'] : '' - end + def date + query = "select * from bioentry_date where bioentry_id = ?" + row = @dbh.execute(query, @bioentry_id).fetch + row ? row['date'] : '' + end - def dblink - query = "select * from bioentry_direct_links where source_bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? [row['dbname'], row['accession']] : [] - end + def dblink + query = "select * from bioentry_direct_links where source_bioentry_id = ?" + row = @dbh.execute(query, @bioentry_id).fetch + row ? [row['dbname'], row['accession']] : [] + end - def definition - query = "select * from bioentry_description where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? row['description'] : '' - end + def definition + query = "select * from bioentry_description where bioentry_id = ?" + row = @dbh.execute(query, @bioentry_id).fetch + row ? row['description'] : '' + end - def keyword - query = "select * from bioentry_keywords where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? row['keywords'] : '' - end + def keyword + query = "select * from bioentry_keywords where bioentry_id = ?" + row = @dbh.execute(query, @bioentry_id).fetch + row ? row['keywords'] : '' + end - def taxonomy - query = <<-END - select full_lineage, common_name, ncbi_taxa_id - from bioentry_taxa, taxa - where bioentry_id = ? and bioentry_taxa.taxa_id = taxa.taxa_id - END - row = @dbh.execute(query, @bioentry_id).fetch - @lineage = row ? row['full_lineage'] : '' - @common_name = row ? row['common_name'] : '' - @ncbi_taxa_id = row ? row['ncbi_taxa_id'] : '' - row ? [@lineage, @common_name, @ncbi_taxa_id] : [] - end + # Use lineage, common_name, ncbi_taxa_id methods to extract in detail. + def taxonomy + query = <<-END + select taxon_name.name, taxon.ncbi_taxon_id from bioentry + join taxon_name using(taxon_id) join taxon using (taxon_id) + where bioentry_id = ? + END + row = @dbh.execute(query, @bioentry_id).fetch +# @lineage = row ? row['full_lineage'] : '' + @common_name = row ? row['name'] : '' + @ncbi_taxa_id = row ? row['ncbi_taxon_id'] : '' + row ? [@lineage, @common_name, @ncbi_taxa_id] : [] + end - def lineage - taxonomy unless @lineage - return @lineage - end + def lineage + taxonomy unless @lineage + return @lineage + end - def common_name - taxonomy unless @common_name - return @common_name - end + def common_name + taxonomy unless @common_name + return @common_name + end - def ncbi_taxa_id - taxonomy unless @ncbi_taxa_id - return @ncbi_taxa_id - end + def ncbi_taxa_id + taxonomy unless @ncbi_taxa_id + return @ncbi_taxa_id + end - private + private - def feature_key(k_id) - query = "select * from seqfeature_key where seqfeature_key_id = ?" - row = @dbh.execute(query, k_id).fetch - row ? row['key_name'] : '' - end + def feature_key(k_id) + query = "select * from term where term_id= ?" + row = @dbh.execute(query, k_id).fetch + row ? row['name'] : '' + end - def feature_source(s_id) - query = "select * from seqfeature_source where seqfeature_source_id = ?" - row = @dbh.execute(query, s_id).fetch - row ? row['source_name'] : '' - end + def feature_source(s_id) + query = "select * from term where term_id = ?" + row = @dbh.execute(query, s_id).fetch + row ? row['name'] : '' + end - def feature_locations(f_id) - locations = [] - query = "select * from seqfeature_location where seqfeature_id = ?" - @dbh.execute(query, f_id).fetch_all.each do |row| - next unless row + def feature_locations(f_id) + locations = [] + query = "select * from location where seqfeature_id = ?" + @dbh.execute(query, f_id).fetch_all.each do |row| + next unless row - location = Bio::Location.new - location.strand = row['seq_strand'] - location.from = row['seq_start'] - location.to = row['seq_end'] + location = Bio::Location.new + location.strand = row['strand'] + location.from = row['start_pos'] + location.to = row['end_pos'] - xref = feature_locations_remote(row['seqfeature_location_id']) - location.xref_id = xref.shift unless xref.empty? + xref = feature_locations_remote(row['dbxref_if']) + location.xref_id = xref.shift unless xref.empty? - # just omit fuzzy location for now... - #feature_locations_qv(row['seqfeature_location_id']) + # just omit fuzzy location for now... + #feature_locations_qv(row['seqfeature_location_id']) - rank = row['location_rank'].to_i - 1 - locations[rank] = location - end - return Bio::Locations.new(locations) + rank = row['rank'].to_i - 1 + locations[rank] = location end + return Bio::Locations.new(locations) + end - def feature_locations_remote(l_id) - query = "select * from remote_seqfeature_name where seqfeature_location_id = ?" - row = @dbh.execute(query, l_id).fetch - row ? [row['accession'], row['version']] : [] - end + def feature_locations_remote(l_id) + query = "select * from dbxref where dbxref_id = ?" + row = @dbh.execute(query, l_id).fetch + row ? [row['accession'], row['version']] : [] + end - def feature_locations_qv(l_id) - query = "select * from location_qualifier_value where seqfeature_location_id = ?" - row = @dbh.execute(query, l_id).fetch - row ? [row['qualifier_value'], row['slot_value']] : [] - end + def feature_locations_qv(l_id) + query = "select * from location_qualifier_value where location_id = ?" + row = @dbh.execute(query, l_id).fetch + row ? [row['value'], row['int_value']] : [] + end - def feature_qualifiers(f_id) - qualifiers = [] - query = "select * from seqfeature_qualifier_value where seqfeature_id = ?" - @dbh.execute(query, f_id).fetch_all.each do |row| - next unless row + def feature_qualifiers(f_id) + qualifiers = [] + query = "select * from seqfeature_qualifier_value where seqfeature_id = ?" + @dbh.execute(query, f_id).fetch_all.each do |row| + next unless row - key = feature_qualifiers_key(row['seqfeature_qualifier_id']) - value = row['qualifier_value'] - qualifier = Bio::Feature::Qualifier.new(key, value) + key = feature_qualifiers_key(row['seqfeature_id']) + value = row['value'] + qualifier = Bio::Feature::Qualifier.new(key, value) - rank = row['seqfeature_qualifier_rank'].to_i - 1 - qualifiers[rank] = qualifier - end - return qualifiers.compact # .compact is nasty hack for a while + rank = row['rank'].to_i - 1 + qualifiers[rank] = qualifier end - - def feature_qualifiers_key(q_id) - query = "select * from seqfeature_qualifier where seqfeature_qualifier_id = ?" - row = @dbh.execute(query, q_id).fetch - row ? row['qualifier_name'] : '' - end + return qualifiers.compact # .compact is nasty hack for a while end + def feature_qualifiers_key(q_id) + query = <<-END + select * from seqfeature_qualifier_value + join term using(term_id) where seqfeature_id = ? + END + row = @dbh.execute(query, q_id).fetch + row ? row['name'] : '' + end end -end +end # SQL +end # Bio + if __FILE__ == $0 begin require 'pp' alias p pp rescue LoadError @@ -361,68 +360,6 @@ pp ent2.subseq(1,10) pp ent2.taxonomy pp ent2.version end - - -=begin - -= Bio::SQL - ---- Bio::SQL.new(db = 'dbi:Mysql:biosql', user = nil, pass = nil) - ---- Bio::SQL.close - ---- Bio::SQL#fetch(accession) - - Returns Bio::SQL::Sequence object. - -== Bio::SQL::Sequence - ---- Bio::SQL::Sequence.new(dbh, entry) - ---- Bio::SQL::Sequence#accession -> String ---- Bio::SQL::Sequence#comment -> String - - Returns the first comment. For complete comments, use comments method. - ---- Bio::SQL::Sequence#comments -> Array - - Returns comments in an Array of Strings. - ---- Bio::SQL::Sequence#common_name -> String ---- Bio::SQL::Sequence#database -> String ---- Bio::SQL::Sequence#date -> String ---- Bio::SQL::Sequence#dblink -> Array ---- Bio::SQL::Sequence#definition -> String ---- Bio::SQL::Sequence#division -> String ---- Bio::SQL::Sequence#entry_id -> String - ---- Bio::SQL::Sequence#features - - Returns Bio::Features object. - ---- Bio::SQL::Sequence#keyword -> String ---- Bio::SQL::Sequence#lineage -> String ---- Bio::SQL::Sequence#ncbi_taxa_id -> String - ---- Bio::SQL::Sequence#references -> Array - - Returns reference informations in Array of Hash (not Bio::Reference). - ---- Bio::SQL::Sequence#seq - - Returns Bio::Sequence::NA or AA object. - ---- Bio::SQL::Sequence#subseq(from, to) - - Returns Bio::Sequence::NA or AA object (by lazy fetching). - ---- Bio::SQL::Sequence#taxonomy -> DBI::Row - - Use lineage, common_name, ncbi_taxa_id methods to extract in detail. - ---- Bio::SQL::Sequence#version -> String - -=end