lib/bio/io/sql.rb in bio-1.2.1 vs lib/bio/io/sql.rb in bio-1.3.0

- old
+ new

@@ -1,365 +1,186 @@ -# -# = bio/io/sql.rb - BioSQL access module -# -# Copyright:: Copyright (C) 2002 Toshiaki Katayama <k@bioruby.org> -# Copyright:: Copyright (C) 2006 Raoul Jean Pierre Bonnal <raoul.bonnal@itb.cnr.it> -# License:: The Ruby License -# -# $Id: sql.rb,v 1.8 2007/04/05 23:35:41 trevor Exp $ -# -begin - require 'dbi' -rescue LoadError -end -require 'bio/sequence' -require 'bio/feature' +require 'rubygems' +require 'erb' +require 'composite_primary_keys' +# BiosqlPlug +=begin +Ok Hilmar gives to me some clarification +1) "EMBL/GenBank/SwissProt" name in term table, is only a convention assuming data loaded by genbank embl ans swissprot formats. + If your features come from others ways for example blast or alignment ... whatever.. the user as to take care about the source. -module Bio -class SQL - - def initialize(db = 'dbi:Mysql:biosql', user = nil, pass = nil) - @dbh = DBI.connect(db, user, pass) - end - - def close - @dbh.disconnect - end - - # Returns Bio::SQL::Sequence object. - def fetch(accession) # or display_id for fall back - query = "select * from bioentry where accession = ?" - entry = @dbh.execute(query, accession).fetch - return Sequence.new(@dbh, entry) if entry - - query = "select * from bioentry where display_id = ?" - entry = @dbh.execute(query, accession).fetch - return Sequence.new(@dbh, entry) if entry - end - alias get_by_id fetch - - - # for lazy fetching - - class Sequence - - def initialize(dbh, entry) - @dbh = dbh - @bioentry_id = entry['bioentry_id'] - @database_id = entry['biodatabase_id'] - @entry_id = entry['display_id'] - @accession = entry['accession'] - @version = entry['entry_version'] - @division = entry['division'] +=end +=begin +TODO: +1) source_term_id => surce_term and check before if the source term is present or not and the level, the root should always be something "EMBL/GenBank/SwissProt" or contestualized. +2) Into DummyBase class delete connection there and use Bio::ArSQL.establish_connection which reads info from a yml file. +3) Chk Locations in Biofeatures ArSQL +=end +module Bio + class SQL + #no check is made + def self.establish_connection(configurations, env) + #configurations is an hash similar what YAML returns. + #{:database=>"biorails_development", :adapter=>"postgresql", :username=>"rails", :password=>nil} + configurations.assert_valid_keys('development', 'production','test') + configurations[env].assert_valid_keys('hostname','database','adapter','username','password') + DummyBase.configurations = configurations + DummyBase.establish_connection "#{env}" end - attr_reader :accession, :division, :entry_id, :version - - - def to_fasta - if seq = seq - return seq.to_fasta(@accession) - end + + def self.fetch_id(id) + Bio::SQL::Bioentry.find(id) end - - # Returns Bio::Sequence::NA or AA object. - def seq - query = "select * from biosequence where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - return unless row - - mol = row['alphabet'] - seq = row['seq'] - - case mol - when /.na/i # 'dna' or 'rna' - Bio::Sequence::NA.new(seq) - else # 'protein' - Bio::Sequence::AA.new(seq) - end + + def self.fetch_accession(accession) + accession = accession.upcase + Bio::SQL::Bioentry.exists?(:accession => accession) ? Bio::SQL::Sequence.new(:entry=>Bio::SQL::Bioentry.find_by_accession(accession)) : nil end - - # Returns Bio::Sequence::NA or AA object (by lazy fetching). - def subseq(from, to) - length = to - from + 1 - query = "select alphabet, substring(seq, ?, ?) as subseq" + - " from biosequence where bioentry_id = ?" - row = @dbh.execute(query, from, length, @bioentry_id).fetch - return unless row - - mol = row['alphabet'] - seq = row['subseq'] - - case mol - when /.na/i # 'dna' or 'rna' - Bio::Sequence::NA.new(seq) - else # 'protein' - Bio::Sequence::AA.new(seq) - end + + def self.exists_accession(accession) + Bio::SQL::Bioentry.find_by_accession(accession.upcase).nil? ? false : true end - - - # Returns Bio::Features object. - def features - array = [] - query = "select * from seqfeature where bioentry_id = ?" - @dbh.execute(query, @bioentry_id).fetch_all.each do |row| - next unless row - - f_id = row['seqfeature_id'] - k_id = row['type_term_id'] - s_id = row['source_term_id'] - rank = row['rank'].to_i - 1 - - # key : type (gene, CDS, ...) - type = feature_key(k_id) - - # source : database (EMBL/GenBank/SwissProt) - database = feature_source(s_id) - - # location : position - locations = feature_locations(f_id) - - # qualifier - qualifiers = feature_qualifiers(f_id) - - # rank - array[rank] = Bio::Feature.new(type, locations, qualifiers) - end - return Bio::Features.new(array) + + def self.exists_database(name) + Bio::SQL::Biodatabase.find_by_name(name).nil? ? false : true end - - - # Returns reference informations in Array of Hash (not Bio::Reference). - def references - array = [] - query = <<-END - select * from bioentry_reference, reference - where bioentry_id = ? and - bioentry_reference.reference_id = reference.reference_id - END - @dbh.execute(query, @bioentry_id).fetch_all.each do |row| - next unless row - - hash = { - 'start' => row['start_pos'], - 'end' => row['end_pos'], - 'journal' => row['location'], - 'title' => row['title'], - 'authors' => row['authors'], - 'medline' => row['crc'] - } - hash.default = '' - - rank = row['rank'].to_i - 1 - array[rank] = hash - end - return array + + def self.list_entries + Bio::SQL::Bioentry.find(:all).collect{|entry| + {:id=>entry.bioentry_id, :accession=>entry.accession} + } end - - - # Returns the first comment. For complete comments, use comments method. - def comment - query = "select * from comment where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? row['comment_text'] : '' + + def self.list_databases + Bio::SQL::Biodatabase.find(:all).collect{|entry| + {:id=>entry.biodatabase_id, :name => entry.name} + } end - - # Returns comments in an Array of Strings. - def comments - array = [] - query = "select * from comment where bioentry_id = ?" - @dbh.execute(query, @bioentry_id).fetch_all.each do |row| - next unless row - rank = row['rank'].to_i - 1 - array[rank] = row['comment_text'] - end - return array + + def self.delete_entry_id(id) + Bioentry.delete(id) end - - def database - query = "select * from biodatabase where biodatabase_id = ?" - row = @dbh.execute(query, @database_id).fetch - row ? row['name'] : '' + + def self.delete_entry_accession(accession) + Bioentry.delete(Bioentry.find_by_accession(accession)) end + + + class DummyBase < ActiveRecord::Base + #NOTE: Using postgresql, not setting sequence name, system will discover the name by default. + #NOTE: this class will not establish the connection automatically + self.abstract_class = true + self.pluralize_table_names = false + #prepend table name to the usual id, avoid to specify primary id for every table + self.primary_key_prefix_type = :table_name_with_underscore + #biosql_configurations=YAML::load(ERB.new(IO.read(File.join(File.dirname(__FILE__),'../config', 'database.yml'))).result) + #self.configurations=biosql_configurations + #self.establish_connection "development" + end #DummyBase + + autoload :Biodatabase, 'bio/io/biosql/biodatabase' + autoload :Bioentry, 'bio/io/biosql/bioentry' + autoload :BioentryDbxref, 'bio/io/biosql/bioentry_dbxref' + autoload :BioentryPath, 'bio/io/biosql/bioentry_path' + autoload :BioentryQualifierValue, 'bio/io/biosql/bioentry_qualifier_value' + autoload :BioentryReference, 'bio/io/biosql/bioentry_reference' + autoload :BioentryRelationship, 'bio/io/biosql/bioentry_relationship' + autoload :Biosequence, 'bio/io/biosql/biosequence' + autoload :Comment, 'bio/io/biosql/comment' + autoload :Dbxref, 'bio/io/biosql/dbxref' + autoload :DbxrefQualifierValue, 'bio/io/biosql/dbxref_qualifier_value' + autoload :Location, 'bio/io/biosql/location' + autoload :LocationQualifierValue, 'bio/io/biosql/location_qualifier_value' + autoload :Ontology, 'bio/io/biosql/ontology' + autoload :Reference, 'bio/io/biosql/reference' + autoload :Seqfeature, 'bio/io/biosql/seqfeature' + autoload :SeqfeatureDbxref, 'bio/io/biosql/seqfeature_dbxref' + autoload :SeqfeaturePath, 'bio/io/biosql/seqfeature_path' + autoload :SeqfeatureQualifierValue, 'bio/io/biosql/seqfeature_qualifier_value' + autoload :SeqfeatureRelationship, 'bio/io/biosql/seqfeature_relationship' + autoload :Taxon, 'bio/io/biosql/taxon' + autoload :TaxonName, 'bio/io/biosql/taxon_name' + autoload :Term, 'bio/io/biosql/term' + autoload :TermDbxref, 'bio/io/biosql/term_dbxref' + autoload :TermPath, 'bio/io/biosql/term_path' + autoload :TermRelationship, 'bio/io/biosql/term_relationship' + autoload :TermRelationshipTerm, 'bio/io/biosql/term_relationship_term' + autoload :Sequence, 'bio/db/biosql/sequence' + end #biosql + +end #Bio - def date - query = "select * from bioentry_date where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? row['date'] : '' - end +if __FILE__ == $0 + require 'rubygems' + require 'composite_primary_keys' + require 'bio' + require 'pp' + + # pp connection = Bio::SQL.establish_connection('bio/io/biosql/config/database.yml','development') + connection = Bio::SQL.establish_connection({'development'=>{'database'=>"bio_test", 'adapter'=>"postgresql", 'username'=>"rails", 'password'=>nil}},'development') + #pp YAML::load(ERB.new(IO.read('bio/io/biosql/config/database.yml')).result) + if true + #Bio::SQL.list_entries - def dblink - query = "select * from bioentry_direct_links where source_bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? [row['dbname'], row['accession']] : [] - end +# biosequence = data.to_biosequence +# puts biosequence.output(:genbank) + db=Bio::SQL::Biodatabase.new(:name=>'JEFF', :authority=>'ME', :description=>'YOU') + db.save! - def definition - query = "select * from bioentry_description where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? row['description'] : '' + puts "### FileFile.auto" + if ARGV.size > 0 + #embl = Bio::FlatFile.auto(ARGF.read) + Bio::FlatFile.auto(ARGF) do |ff| + ff.each do |data| + biosequence=data.to_biosequence + puts biosequence.output(:fasta) + sqlseq = Bio::SQL::Sequence.new(:biosequence=>biosequence,:biodatabase_id=>db.biodatabase_id) + sqlseq.save + sqlseq.to_biosequence.output(:fasta) + end + end + else + require 'bio/io/fetch' + server = Bio::Fetch.new('http://www.ebi.ac.uk/cgi-bin/dbfetch') + data = Bio::EMBL.new(server.fetch('embl','AJ224123')) end - def keyword - query = "select * from bioentry_keywords where bioentry_id = ?" - row = @dbh.execute(query, @bioentry_id).fetch - row ? row['keywords'] : '' - end + +# sqlseq = Bio::SQL::Sequence.new(:biosequence=>biosequence,:biodatabase_id=>db.biodatabase_id) +# sqlseq.save +# sqlseq_bioseq=sqlseq.to_biosequence +# puts sqlseq_bioseq.output(:genbank) - # Use lineage, common_name, ncbi_taxa_id methods to extract in detail. - def taxonomy - query = <<-END - select taxon_name.name, taxon.ncbi_taxon_id from bioentry - join taxon_name using(taxon_id) join taxon using (taxon_id) - where bioentry_id = ? - END - row = @dbh.execute(query, @bioentry_id).fetch -# @lineage = row ? row['full_lineage'] : '' - @common_name = row ? row['name'] : '' - @ncbi_taxa_id = row ? row['ncbi_taxon_id'] : '' - row ? [@lineage, @common_name, @ncbi_taxa_id] : [] - end - def lineage - taxonomy unless @lineage - return @lineage - end - def common_name - taxonomy unless @common_name - return @common_name - end - - def ncbi_taxa_id - taxonomy unless @ncbi_taxa_id - return @ncbi_taxa_id - end - - - private - - def feature_key(k_id) - query = "select * from term where term_id= ?" - row = @dbh.execute(query, k_id).fetch - row ? row['name'] : '' - end - - def feature_source(s_id) - query = "select * from term where term_id = ?" - row = @dbh.execute(query, s_id).fetch - row ? row['name'] : '' - end - - def feature_locations(f_id) - locations = [] - query = "select * from location where seqfeature_id = ?" - @dbh.execute(query, f_id).fetch_all.each do |row| - next unless row - - location = Bio::Location.new - location.strand = row['strand'] - location.from = row['start_pos'] - location.to = row['end_pos'] - - xref = feature_locations_remote(row['dbxref_if']) - location.xref_id = xref.shift unless xref.empty? - - # just omit fuzzy location for now... - #feature_locations_qv(row['seqfeature_location_id']) - - rank = row['rank'].to_i - 1 - locations[rank] = location - end - return Bio::Locations.new(locations) - end - - def feature_locations_remote(l_id) - query = "select * from dbxref where dbxref_id = ?" - row = @dbh.execute(query, l_id).fetch - row ? [row['accession'], row['version']] : [] - end - - def feature_locations_qv(l_id) - query = "select * from location_qualifier_value where location_id = ?" - row = @dbh.execute(query, l_id).fetch - row ? [row['value'], row['int_value']] : [] - end - - def feature_qualifiers(f_id) - qualifiers = [] - query = "select * from seqfeature_qualifier_value where seqfeature_id = ?" - @dbh.execute(query, f_id).fetch_all.each do |row| - next unless row - - key = feature_qualifiers_key(row['seqfeature_id']) - value = row['value'] - qualifier = Bio::Feature::Qualifier.new(key, value) - - rank = row['rank'].to_i - 1 - qualifiers[rank] = qualifier - end - return qualifiers.compact # .compact is nasty hack for a while - end - - def feature_qualifiers_key(q_id) - query = <<-END - select * from seqfeature_qualifier_value - join term using(term_id) where seqfeature_id = ? - END - row = @dbh.execute(query, q_id).fetch - row ? row['name'] : '' - end + # bioseq = Bio::SQL.fetch_accession('AJ224122') + # pp bioseq + # pp bioseq.entry_id + #TODO create a test only for tables not sequence here +# pp bioseq.molecule_type + #pp bioseq.molecule_type.class + #bioseq.molecule_type_update('dna', 1) +## pp Bio::SQL::Taxon.find(8121).taxon_names + + #sqlseq.to_biosequence + +# sqlseq.delete + +# db.destroy end - -end # SQL - -end # Bio - - -if __FILE__ == $0 - begin - require 'pp' - alias p pp - rescue LoadError - end - - db = ARGV.empty? ? 'dbi:Mysql:database=biosql;host=localhost' : ARGV.shift - serv = Bio::SQL.new(db, 'root') - - ent0 = serv.fetch('X76706') - ent0 = serv.fetch('A15H9FIB') - ent1 = serv.fetch('J01902') - ent2 = serv.fetch('X04311') - - pp ent0.features - pp ent0.references - - pp ent1.seq - pp ent1.seq.translate - pp ent1.seq.gc - pp ent1.subseq(1,20) - - pp ent2.accession - pp ent2.comment - pp ent2.comments - pp ent2.common_name - pp ent2.database - pp ent2.date - pp ent2.dblink - pp ent2.definition - pp ent2.division - pp ent2.entry_id - pp ent2.features - pp ent2.keyword - pp ent2.lineage - pp ent2.ncbi_taxa_id - pp ent2.references - pp ent2.seq - pp ent2.subseq(1,10) - pp ent2.taxonomy - pp ent2.version - + #pp bioseq.molecule_type + #term = Bio::SQL::Term.find_by_name('mol_type') + #pp term + #pp bioseq.entry.bioentry_qualifier_values.create(:term=>term, :rank=>2, :value=>'pippo') + #pp bioseq.entry.bioentry_qualifier_values.inspect + #pp bioseq.entry.bioentry_qualifier_values.find_all_by_term_id(26) + #pp primo.class + # pp primo.value='dna' + # pp primo.save + #pp bioseq.molecule_type= 'prova' + + #Bio::SQL::BioentryQualifierValue.delete(delete.bioentry_id,delete.term_id,delete.rank) + + end -