#
# = bio/db.rb - common API for database parsers
#
# Copyright:: Copyright (C) 2001, 2002, 2005
#             KATAYAMA Toshiaki
# License::   Ruby's
#
# $Id: db.rb,v 0.34 2006/02/27 09:13:08 k Exp $
#
# == On-demand parsing and cache
#
# The flatfile parsers (sub classes of the Bio::DB) split the original entry
# into a Hash and store the hash in the @orig instance variable.  Detailed
# parsing is delayed until a method is called which requires further parsing
# of the content of the @orig hash.  Fully parsed data is cached separately
# in another hash, @data.
#
# == Guidelines for developers creating a new database class
#
# --- Bio::DB.new(entry)
#
# The 'new' method should accept the entire entry in one String and
# return the parsed database object.
#
# --- Bio::DB#entry_id
#
# Database classes should implement the following methods if appropriate:
#
# * entry_id
# * definition
#
# Every sub class should define the following constants if appropriate:
#
# * DELIMITER (RS)
#   * entry separator of the flatfile of the database.
#   * RS (= record separator) is a short alias for the DELIMITER.
#
# * TAGSIZE
#   * length of the tag field in the FORTRAN-like format.
#
#       |<- tag  ->||<- data                         ---->|
#       ENTRY_ID    A12345
#       DEFINITION  Hoge gene of the Pokemonia pikachuae
#
# === Template of the sub class
#
#   module Bio
#   class Hoge < DB
#
#     DELIMITER = RS = "\n//\n"
#     TAGSIZE   = 12            # You can omit this line if not needed
#
#     def initialize(entry)
#     end
#
#     def entry_id
#     end
#
#   end # class Hoge
#   end # module Bio
#
# === Recommended method names for sub classes
#
# In general, the method name should be in the singular form when it returns
# an Object (including the case when the Object is a String), and should be
# in the plural form when it returns Objects of the same kind in an Array.
# Which form of the method name can be used depends on the database class.
#
# For example, GenBank has several REFERENCE fields in one entry, so define
# Bio::GenBank#references and this method should return an Array of the
# Reference objects.  On the other hand, MEDLINE has one REFERENCE per
# entry, so define the Bio::MEDLINE#reference method and this should
# return a Reference object.
#
# The method names used in the sub classes should be taken from the following
# list if appropriate:
#
# --- entry_id #=> String
#
# The entry identifier.
#
# --- definition #=> String
#
# The description of the entry.
#
# --- reference #=> Bio::Reference
# --- references #=> Array of Bio::Reference
#
# The reference field(s) of the entry.
#
# --- dblink #=> String
# --- dblinks #=> Array of String
#
# The link(s) to the other database entry.
#
# --- naseq #=> Bio::Sequence::NA
#
# The DNA/RNA sequence of the entry.
#
# --- nalen #=> Integer
#
# The length of the DNA/RNA sequence of the entry.
#
# --- aaseq #=> Bio::Sequence::AA
#
# The amino acid sequence of the entry.
#
# --- aalen #=> Integer
#
# The length of the amino acid sequence of the entry.
#
# --- seq #=> Bio::Sequence::NA or Bio::Sequence::AA
#
# Returns an appropriate sequence object.
#
# --- position #=> String
#
# The position of the sequence in the entry or in the genome (depends on
# the database).
#
# --- locations #=> Bio::Locations
#
# Returns Bio::Locations.new(position).
#
# --- division #=> String
#
# The sub division name of the database.
#
# * Example:
#   * EST, VRL etc. for GenBank
#   * PATTERN, RULE etc. for PROSITE
#
# --- date #=> String
#
# The date of the entry.
# Should we use Date (by ParseDate) instead of String?
#
# --- gene #=> String
# --- genes #=> Array of String
#
# The name(s) of the gene.
#
# --- organism #=> String
#
# The name of the organism.
#

require 'bio/sequence'
require 'bio/reference'
require 'bio/feature'

module Bio
  # Common base class for the flat-file database entry parsers.
  #
  # The instance methods read three instance variables that this class never
  # assigns itself; in this file they are set by NCBIDB#initialize and
  # EMBLDB#initialize:
  #
  # * @orig    - Hash mapping a tag name to the raw text of that field.
  # * @data    - Hash used as a cache of already parsed values.
  # * @tagsize - Integer width of the tag column at the start of each line.
  class DB
    # Delegates to Bio::FlatFile.open, passing this class followed by
    # +filename+, the optional +mode+ arguments and the block unchanged.
    # NOTE(review): Bio::FlatFile is not required by this file; it is assumed
    # to be loaded by the caller.
    def self.open(filename, *mode, &block)
      Bio::FlatFile.open(self, filename, *mode, &block)
    end

    # Returns an entry identifier as a String.  This method must be
    # implemented in every database class by overriding this method;
    # the default raises NotImplementedError.
    def entry_id
      raise NotImplementedError
    end

    # Returns a list of the top level tags of the entry as an Array of String.
    def tags
      @orig.keys
    end

    # Returns true or false - whether the entry contains the field of the
    # given tag name.
    def exists?(tag)
      @orig.include?(tag)
    end

    # Returns an intact field of the tag as a String.  The returned object is
    # the one stored in @orig, so callers must not modify it in place.
    def get(tag)
      @orig[tag]
    end

    # Similar to the get method, however, fetch returns the content of the
    # field without its tag and with any extra white spaces stripped.
    #
    # The first +skip+ lines of the field are dropped before the tag column
    # (up to @tagsize leading characters of every remaining line) is removed.
    # A tag that is absent from @orig yields an empty String.
    def fetch(tag, skip = 0)
      # to_s guards against a nil entry when @orig has no default value.
      field = @orig[tag].to_s.split(/\n/, skip + 1).last.to_s
      truncate(field.gsub(/^.{0,#{@tagsize}}/, ''))
    end

    private

    # Returns a String in which successive white spaces are replaced by one
    # space and leading/trailing white space is stripped.  nil is treated as
    # an empty String.
    def truncate(str)
      str ||= ''
      str.gsub(/\s+/, ' ').strip
    end

    # Returns the tag name of the field as a String: the first @tagsize
    # characters of +str+, stripped.  nil is treated as an empty String.
    def tag_get(str)
      str ||= ''
      str[0, @tagsize].strip
    end

    # Returns a String of the field without a tag name, i.e. +str+ with its
    # first @tagsize characters removed.  nil is treated as an empty String.
    #
    # The removal is done on a copy so that the argument is left untouched:
    # cutting in place would corrupt @orig when the argument comes straight
    # from #get, and would raise on a frozen String.
    def tag_cut(str)
      str ||= ''
      rest = str.dup
      rest[0, @tagsize] = ''
      rest
    end

    # Returns the content of the field as a String like the fetch method.
    # Furthermore, field_fetch stores the result in the @data hash and
    # returns the stored value on later calls.
    def field_fetch(tag, skip = 0)
      @data[tag] ||= fetch(tag, skip)
    end

    # Returns an Array containing each line of the field without a tag.
    # lines_fetch also stores the result in the @data hash.  A tag that is
    # absent from @orig yields an empty Array.
    def lines_fetch(tag)
      @data[tag] ||= get(tag).to_s.split(/\n/).map { |line| tag_cut(line) }
    end
  end # class DB

  # Stores a NCBI style (GenBank, KEGG etc.) entry.
  class NCBIDB < DB
    autoload :Common, 'bio/db/genbank/common'

    # The entire entry is passed as a String.  The length of the tag field is
    # passed as an Integer.  Parses the entry roughly by the entry2hash method
    # and returns a database object.
    def initialize(entry, tagsize)
      @tagsize = tagsize
      @orig = entry2hash(entry.strip) # Hash of the original entry
      @data = {}                      # Hash of the parsed entry
    end

    private

    # Splits an entry into an Array of Strings at the level of top tags.
    # A new element starts at every line whose first character is a letter,
    # '/' or '*'.
    def toptag2array(str)
      sep = "\001" # marker inserted at each split point, then split on
      str.gsub(/\n([A-Za-z\/\*])/, "\n#{sep}\\1").split(sep)
    end

    # Splits a field into an Array of Strings at the level of sub tags.
    # A new element starts at every line that begins with 1 to @tagsize-1
    # white space characters followed by a non-space character.
    def subtag2array(str)
      sep = "\001" # marker inserted at each split point, then split on
      str.gsub(/\n(\s{1,#{@tagsize-1}}\S)/, "\n#{sep}\\1").split(sep)
    end

    # Returns the contents of the entry as a Hash with the top level tags as
    # its keys.  Fields sharing the same tag are concatenated in order of
    # appearance.  The Hash returns '' for tags that are not present.
    def entry2hash(entry)
      hash = Hash.new('')
      toptag2array(entry).each do |field|
        tag = tag_get(field)
        if hash.key?(tag)
          hash[tag] << field      # append in place instead of recopying
        else
          hash[tag] = field.dup   # own buffer; never touch the shared default
        end
      end
      hash
    end
  end # class NCBIDB

  # Class for KEGG databases. Inherits a NCBIDB class.
  class KEGGDB < NCBIDB
  end

  # Stores an EMBL style (EMBL, TrEMBL, Swiss-Prot etc.) entry.
  class EMBLDB < DB
    autoload :Common, 'bio/db/embl/common'

    # The entire entry is passed as a String.  The length of the tag field is
    # passed as an Integer.  Parses the entry roughly by the entry2hash method
    # and returns a database object.
    def initialize(entry, tagsize)
      @tagsize = tagsize
      @orig = entry2hash(entry.strip) # Hash of the original entry
      @data = {}                      # Hash of the parsed entry
    end

    private

    # Returns the contents of the entry as a Hash keyed by line tag.
    # Lines tagged 'XX' are dropped, and every tag made of 'R' plus at least
    # one more character is filed under the single key 'R' (reference lines).
    # The Hash returns '' for tags that are not present.
    def entry2hash(entry)
      hash = Hash.new('')
      entry.each_line do |line|
        tag = tag_get(line)
        next if tag == 'XX'
        tag = 'R' if tag =~ /^R./ # Reference lines
        if hash.key?(tag)
          hash[tag] << line       # append in place: linear, not quadratic
        else
          hash[tag] = line.dup    # own buffer; never touch the shared default
        end
      end
      hash
    end
  end # class EMBLDB
end # module Bio