#
# = bio/db/genbank/common.rb - Common methods for GenBank style database classes
#
# Copyright:: Copyright (C) 2004 Toshiaki Katayama
# License::   The Ruby License
#
# $Id: common.rb,v 1.11.2.5 2008/06/17 15:53:21 ngoto Exp $
#

require 'bio/db'

module Bio
  class NCBIDB

    # == Description
    #
    # This module defines a common framework among GenBank, GenPept, RefSeq,
    # and DDBJ.  For more details, see the documentation in each of the
    # genbank/*.rb files.
    #
    # The methods below rely on helpers that are not defined in this file
    # (get, fetch, field_fetch, truncate, tag_cut, tag_get, toptag2array,
    # subtag2array) and on an @data Hash used as a per-entry cache.
    # NOTE(review): these are presumably supplied by the including class's
    # ancestors (Bio::NCBIDB / Bio::DB) -- confirm against bio/db.rb.
    #
    # == References
    #
    # * ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt
    # * http://www.ncbi.nlm.nih.gov/collab/FT/index.html
    #
    module Common

      # Entry separator.
      DELIMITER = RS = "\n//\n"

      # Width, in columns, of the tag field that starts each record line.
      TAGSIZE = 12

      # Passes +entry+ on to the superclass initializer together with TAGSIZE.
      def initialize(entry)
        super(entry, TAGSIZE)
      end

      # LOCUS -- Locus class must be defined in child classes.
      # This default implementation has an empty body and returns nil.
      def locus
        # must be overridden in each subclass
      end

      # DEFINITION -- Returns contents of the DEFINITION record as a String.
      def definition
        field_fetch('DEFINITION')
      end

      # ACCESSION -- Returns contents of the ACCESSION record as an Array
      # (the field is stripped and split on whitespace).
      def accessions
        field_fetch('ACCESSION').strip.split(/\s+/)
      end

      # VERSION -- Returns contents of the VERSION record as an Array of
      # Strings.  The result is cached in @data['VERSION'].
      def versions
        @data['VERSION'] ||= fetch('VERSION').split(/\s+/)
      end

      # Returns the first part of the VERSION record as an
      # "ACCESSION.VERSION" String ("" when the record is empty).
      def acc_version
        versions.first.to_s
      end

      # Returns the ACCESSION part of the acc_version.
      def accession
        acc_version.split(/\./).first.to_s
      end

      # Returns the VERSION part of the acc_version as an Integer.
      # String#to_i is used, so a non-numeric or missing part yields 0.
      def version
        acc_version.split(/\./).last.to_i
      end

      # Returns the "GI:#######" part of the VERSION record as a String,
      # or nil when the record carries no GI.
      #
      # Only a token that really starts with "GI:" is returned.  Blindly
      # taking the last token would hand back the "ACCESSION.VERSION" string
      # for VERSION lines that consist of a single token.
      def gi
        versions.find { |v| /\AGI:/ =~ v }
      end

      # NID -- Returns contents of the NID record as a String.
      def nid
        field_fetch('NID')
      end

      # KEYWORDS -- Returns contents of the KEYWORDS record as an Array of
      # Strings.  A trailing "." is removed and the rest is split on "; ".
      # The result is cached in @data['KEYWORDS'].
      def keywords
        @data['KEYWORDS'] ||= fetch('KEYWORDS').chomp('.').split(/; /)
      end

      # SEGMENT -- Returns contents of the SEGMENT record as a "m/n" form
      # String, built by joining every run of digits in the record with "/".
      # The result is cached in @data['SEGMENT'].
      def segment
        @data['SEGMENT'] ||= fetch('SEGMENT').scan(/\d+/).join("/")
      end

      # SOURCE -- Returns contents of the SOURCE record as a Hash with the
      # keys 'common_name', 'organism' and 'taxonomy'.  Any other key returns
      # '' (the Hash default).  The result is cached in @data['SOURCE'].
      def source
        unless @data['SOURCE']
          # Text before the ORGANISM sub-tag is the common name line;
          # text after it holds the organism name followed by the lineage.
          name, org = get('SOURCE').split('ORGANISM')
          org ||= ""
          if org[/\S+;/]
            # The lineage starts at the first word terminated by ';'.
            organism = $`
            taxonomy = $& + $'
          elsif org[/\S+\./]            # rs:NC_001741
            # No ';' at all: fall back to the first word terminated by '.'.
            organism = $`
            taxonomy = $& + $'
          else
            organism = org
            taxonomy = ''
          end
          @data['SOURCE'] = {
            'common_name' => truncate(tag_cut(name)),
            'organism'    => truncate(organism),
            'taxonomy'    => truncate(taxonomy),
          }
          @data['SOURCE'].default = ''
        end
        @data['SOURCE']
      end

      # Returns the 'common_name' value of the SOURCE record.
      def common_name
        source['common_name']
      end
      alias vernacular_name common_name

      # Returns the 'organism' value of the SOURCE record.
      def organism
        source['organism']
      end

      # Returns the 'taxonomy' value of the SOURCE record.
      def taxonomy
        source['taxonomy']
      end

      # REFERENCE -- Returns contents of the REFERENCE records as an Array of
      # Bio::Reference objects.  When a block is given, each reference is
      # yielded to it instead.  The parsed Array is cached in
      # @data['REFERENCE'].
      def references
        unless @data['REFERENCE']
          ary = []
          toptag2array(get('REFERENCE')).each do |ref|
            hash = Hash.new
            subtag2array(ref).each do |field|
              case tag_get(field)
              when /REFERENCE/
                # e.g. "1  (bases 1 to 1234)" or "2  (sites)"
                if /(\d+)(\s*\((.+)\))?/m =~ tag_cut(field) then
                  hash['embl_gb_record_number'] = $1.to_i
                  if $3 && $3 != 'sites' then
                    # Rewrite "bases 1 to 10; 20 to 30" as "1-10, 20-30".
                    seqpos = $3
                    seqpos.sub!(/\A\s*bases\s+/, '')
                    seqpos.gsub!(/(\d+)\s+to\s+(\d+)/, "\\1-\\2")
                    seqpos.gsub!(/\s*\;\s*/, ', ')
                    hash['sequence_position'] = seqpos
                  end
                end
              when /AUTHORS/
                authors = truncate(tag_cut(field))
                authors = authors.split(/, /)
                # The last two authors are joined by "and" instead of ", ".
                authors[-1] = authors[-1].split(/\s+and\s+/) if authors[-1]
                # "Surname,I." becomes "Surname, I." (first comma only).
                authors = authors.flatten.map { |a| a.sub(/,/, ', ') }
                hash['authors'] = authors
              when /TITLE/
                hash['title'] = truncate(tag_cut(field)) + '.'
              when /JOURNAL/
                journal = truncate(tag_cut(field))
                # "Name VOLUME (ISSUE), FIRST-LAST (YEAR)" is split into its
                # parts; anything else is stored verbatim as the journal.
                if journal =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
                  hash['journal'] = $1
                  hash['volume']  = $2
                  hash['issue']   = $3
                  hash['pages']   = $4
                  hash['year']    = $5
                else
                  hash['journal'] = journal
                end
              when /MEDLINE/
                hash['medline'] = truncate(tag_cut(field))
              when /PUBMED/
                hash['pubmed'] = truncate(tag_cut(field))
              when /REMARK/
                hash['comments'] ||= []
                hash['comments'].push truncate(tag_cut(field))
              end
            end
            ary.push(Reference.new(hash))
          end
          @data['REFERENCE'] = ary.extend(Bio::References::BackwardCompatibility)
        end
        if block_given?
          @data['REFERENCE'].each do |r|
            yield r
          end
        else
          @data['REFERENCE']
        end
      end

      # COMMENT -- Returns contents of the COMMENT record as a String, with
      # the leading tag field and the continuation-line indentation removed.
      def comment
        # The tag field is TAGSIZE (12) columns wide: "COMMENT" followed by
        # up to five padding spaces.  Stripping only one space would leave
        # stray indentation on the first line, unlike the continuation lines
        # handled just below.
        str = get('COMMENT').to_s.sub(/\ACOMMENT {1,5}/, '')
        str.gsub!(/^ {12}/, '')
        str.chomp!
        str
      end

      # FEATURES -- Returns contents of the FEATURES record as an array of
      # Bio::Feature objects.  When a block is given, each feature is yielded
      # to it instead.  The parsed Array is cached in @data['FEATURES'].
      def features
        unless @data['FEATURES']
          ary = []
          in_quote = false
          get('FEATURES').each_line do |line|
            next if line =~ /^FEATURES/

            # feature type (source, CDS, ...)
            head = line[0,20].to_s.strip

            # feature value (position or /qualifier=)
            body = line[20,60].to_s.chomp

            # sub-array [ feature type, position, /q="data", ... ]
            if line =~ /^ {5}\S/
              ary.push([ head, body ])

            # feature qualifier start (/q="data..., /q="data...", /q=data, /q)
            elsif body =~ /^ \// && !in_quote     # gb:IRO125195
              ary.last.push(body)

              # flag for open quote (/q="data...)
              if body =~ /="/ && body !~ /"$/
                in_quote = true
              end

            # feature qualifier continued (...data..., ...data...")
            else
              ary.last.last << body

              # flag for closing quote (/q="data... lines ...")
              if body =~ /"$/
                in_quote = false
              end
            end
          end

          ary.collect! do |subary|
            parse_qualifiers(subary)
          end

          @data['FEATURES'] = ary.extend(Bio::Features::BackwardCompatibility)
        end
        if block_given?
          @data['FEATURES'].each do |f|
            yield f
          end
        else
          @data['FEATURES']
        end
      end

      # ORIGIN -- Returns contents of the ORIGIN record as a String.
      # As a side effect, the remaining lines of the record are stored in
      # @data['SEQUENCE'] with digits, whitespace and "/" removed.
      def origin
        unless @data['ORIGIN']
          # First line is the ORIGIN tag line; everything after it is the
          # numbered sequence block.
          ori, seqstr = get('ORIGIN').split("\n", 2)
          seqstr ||= ""
          @data['ORIGIN'] = truncate(tag_cut(ori))
          @data['SEQUENCE'] = seqstr.tr("0-9 \t\n\r\/", '')
        end
        @data['ORIGIN']
      end

      ### private methods

      private

      # Builds a Feature from +ary+, which is expected to be
      # [ feature type, position, qualifier string, ... ] as assembled by
      # #features.  +ary+ is consumed: its first two elements are shifted off.
      #
      # Qualifier values are converted as follows: 'translation' becomes a
      # Sequence::AA, 'codon_start' becomes an Integer, and an empty value
      # (a bare "/qualifier") becomes true.  Strings that do not look like a
      # qualifier are skipped.
      def parse_qualifiers(ary)
        feature = Feature.new

        feature.feature = ary.shift
        feature.position = ary.shift.gsub(/\s/, '')

        ary.each do |f|
          if f =~ %r{/([^=]+)=?"?([^"]*)"?}
            qualifier, value = $1, $2

            case qualifier
            when 'translation'
              value = Sequence::AA.new(value)
            when 'codon_start'
              value = value.to_i
            else
              value = true if value.empty?
            end

            feature.append(Feature::Qualifier.new(qualifier, value))
          end
        end
        return feature
      end

    end # Common

  end # NCBIDB
end # Bio