# # = bio/db/fasta.rb - FASTA format class # # Copyright:: Copyright (C) 2001, 2002 # Naohisa Goto , # Toshiaki Katayama # License:: The Ruby License # # $Id:$ # # == Description # # FASTA format class. # # == Examples # # See documents of Bio::FastaFormat class. # # == References # # * FASTA format (WikiPedia) # http://en.wikipedia.org/wiki/FASTA_format # # * Fasta format description (NCBI) # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml # require 'bio/db' require 'bio/sequence' require 'bio/sequence/dblink' require 'bio/db/fasta/defline' module Bio # Treats a FASTA formatted entry, such as: # # >id and/or some comments <== comment line # ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines # ATGCATGCATGCATGCATGCATGCATGCATGCATGC # ATGCATGCATGC # # The precedent '>' can be omitted and the trailing '>' will be removed # automatically. # # === Examples # # f_str = <sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST] # MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEG # VPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYME # GIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNL # KLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGC # IFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFP # QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES # >sce:YBR274W CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST] # MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP # TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG # GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL # DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA # DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI # EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL # AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF # QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND # CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN # FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER # KTGDPLEWRRLFKKISTICRDIILIPN # END_OF_STRING # # f = Bio::FastaFormat.new(f_str) # puts "### FastaFormat" # puts "# entry" # puts f.entry # puts "# entry_id" # p f.entry_id # puts "# definition" # p f.definition # puts "# data" # p f.data # puts "# seq" # p f.seq # puts "# seq.type" # p f.seq.type # puts "# length" # p f.length # puts "# aaseq" # p f.aaseq # puts "# aaseq.type" # p f.aaseq.type # puts "# aaseq.composition" # p f.aaseq.composition # puts "# aalen" # p f.aalen # # === References # # * FASTA format (WikiPedia) # http://en.wikipedia.org/wiki/FASTA_format # class FastaFormat < DB # Entry delimiter in flatfile text. DELIMITER = RS = "\n>" # (Integer) excess read size included in DELIMITER. DELIMITER_OVERRUN = 1 # '>' # The comment line of the FASTA formatted data. attr_accessor :definition # The seuqnce lines in text. attr_accessor :data attr_reader :entry_overrun # Stores the comment and sequence information from one entry of the # FASTA format string. If the argument contains more than one # entry, only the first entry is used. def initialize(str) @definition = str[/.*/].sub(/^>/, '').strip # 1st line @data = str.sub(/.*/, '') # rests @data.sub!(/^>.*/m, '') # remove trailing entries for sure @entry_overrun = $& end # Returns the stored one entry as a FASTA format. (same as to_s) def entry @entry = ">#{@definition}\n#{@data.strip}\n" end alias to_s entry # Executes FASTA/BLAST search by using a Bio::Fasta or a Bio::Blast # factory object. # # #!/usr/bin/env ruby # require 'bio' # # factory = Bio::Fasta.local('fasta34', 'db/swissprot.f') # flatfile = Bio::FlatFile.open(Bio::FastaFormat, 'queries.f') # flatfile.each do |entry| # p entry.definition # result = entry.fasta(factory) # result.each do |hit| # print "#{hit.query_id} : #{hit.evalue}\t#{hit.target_id} at " # p hit.lap_at # end # end # def query(factory) factory.query(entry) end alias fasta query alias blast query # Returns a joined sequence line as a String. def seq unless defined?(@seq) unless /\A\s*^\#/ =~ @data then @seq = Sequence::Generic.new(@data.tr(" \t\r\n0-9", '')) # lazy clean up else a = @data.split(/(^\#.*$)/) i = 0 cmnt = {} s = [] a.each do |x| if /^# ?(.*)$/ =~ x then cmnt[i] ? cmnt[i] << "\n" << $1 : cmnt[i] = $1 else x.tr!(" \t\r\n0-9", '') # lazy clean up i += x.length s << x end end @comment = cmnt @seq = Bio::Sequence::Generic.new(s.join('')) end end @seq end # Returns comments. def comment seq @comment end # Returns sequence length. def length seq.length end # Returens the Bio::Sequence::NA. def naseq Sequence::NA.new(seq) end # Returens the length of Bio::Sequence::NA. def nalen self.naseq.length end # Returens the Bio::Sequence::AA. def aaseq Sequence::AA.new(seq) end # Returens the length of Bio::Sequence::AA. def aalen self.aaseq.length end # Returns sequence as a Bio::Sequence object. # # Note: If you modify the returned Bio::Sequence object, # the sequence or definition in this FastaFormat object # might also be changed (but not always be changed) # because of efficiency. # def to_biosequence Bio::Sequence.adapter(self, Bio::Sequence::Adapter::FastaFormat) end alias to_seq to_biosequence # Parsing FASTA Defline, and extract IDs. # IDs are NSIDs (NCBI standard FASTA sequence identifiers) # or ":"-separated IDs. # It returns a Bio::FastaDefline instance. def identifiers unless defined?(@ids) then @ids = FastaDefline.new(@definition) end @ids end # Parsing FASTA Defline (using #identifiers method), and # shows a possibly unique identifier. # It returns a string. def entry_id identifiers.entry_id end # Parsing FASTA Defline (using #identifiers method), and # shows GI/locus/accession/accession with version number. # If a entry has more than two of such IDs, # only the first ID are shown. # It returns a string or nil. def gi identifiers.gi end # Returns an accession number. def accession identifiers.accession end # Parsing FASTA Defline (using #identifiers method), and # shows accession numbers. # It returns an array of strings. def accessions identifiers.accessions end # Returns accession number with version. def acc_version identifiers.acc_version end # Returns locus. def locus identifiers.locus end end #class FastaFormat end #module Bio