require 'scbi_fasta' class Seq attr_accessor :name, :comments, :seq_fasta, :db, :master def initialize(name, comments, seq_fasta, master) #master = more representative sequence on a cluster @name = name @comments = comments @seq_fasta = seq_fasta @db= parse_db(name, comments) @master = master end def parse_db(name, comments) db=nil if name =~ /^[sp]/ || comments =~ /^[sp]/ db='sp' elsif comments =~ /^[tr]/ db='tr' end return db end def to_s return ">#{@name} #{@comments}\n#{@seq_fasta}" end end class Cdhit attr_accessor :clusters, :sequence_hash_fasta NAME=0 COMMENTS=1 SEQ_FASTA=2 def initialize(fasta_file, clust_file) @clusters = [] @sequence_hash_fasta=hash_fasta(fasta_file) cd_hit_clusters(clust_file) end def each_cluster @clusters.each do |cluster| yield cluster end end def master_fasta(file_name) fasta=File.open(file_name,'w') each_cluster{|cluster| master=get_master(cluster) fasta.print '>'+master.name+' '+master.comments+"\n"+master.seq_fasta+"\n" } fasta.close end def master_to_sp_seq each_cluster{|cluster| master_seq = get_master(cluster) if master_seq.db != 'sp' sp_seq=get_sp(cluster) if !sp_seq.nil? cluster.map{|seq| seq.master=false} sp_seq.master= true end end } end def recover_different_lengths(percentage) seqs = [] each_cluster{|cluster| master = get_master(cluster) cluster.each do |seq| if seq.name == master.name next else seq_mas_len = seq.seq_fasta.length/master.seq_fasta.length*100 mas_seq_len = master.seq_fasta.length/seq.seq_fasta.length*100 seqs << seq if mas_seq_len < percentage && seq_mas_len < percentage end end } return seqs end def get_master(cluster) master= cluster.select{|seq| seq.master}.first return master end def get_all_master master = [] each_cluster{|cluster| master << get_master(cluster) } return master end def get_sp(cluster) master=cluster.select{|seq| seq.db == 'sp'} if !master.empty? master=master.first else master=nil end return master end def cd_hit_clusters(clust_file) #require 'bio-cd-hit-report' report = Bio::CdHitReport.new(clust_file) report.each_cluster do |cluster| clust=[] cluster.data.each do |member| name, master = parse_member(member) hash_seq = @sequence_hash_fasta[name] sequence = Seq.new(hash_seq[NAME], hash_seq[COMMENTS], hash_seq[SEQ_FASTA], master) clust << sequence end @clusters << clust end end def parse_member(member) member.gsub!('...','') member.gsub!('>','') fields = member.split(',') data = fields[1].split(' ',2) master = false if data[1] == '*' master = true end return data[0],master end def hash_fasta(file) sequence_hash_fasta={} fqr=FastaQualFile.new(file) fqr.each do |name,seq_fasta,comments| sequence_hash_fasta[name[0..18]]=[name, comments, seq_fasta] #Cd-hit cuts sequence's name to 20 character (even > character) so we use 'name[0..18]' like key hash end fqr.close return sequence_hash_fasta end end #Example #cdhit=Cdhit.new('all_rodents.fasta','rodents_cln.clstr') #cdhit.master_to_sp_seq #cdhit.master_fasta('all_rodents_red')