#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon Dec 11 17:39:26 2018 @author: dcasas """ from Bio import SeqIO import os import sys import argparse parser = argparse.ArgumentParser(description="Extract ECFs") parser.add_argument('--general',help="general ecf model domtblout file") parser.add_argument('--sigma3',help="sigma3 pfam model domtblout file") parser.add_argument('--pfam',help="sigma2 and sigma4 pfam models domtblout file") parser.add_argument("--infile", help="sequences to extract") parser.add_argument("--conserved", help="outfile: conversed regions of ecfs") args = parser.parse_args() file_seqs = args.infile proteins = [] seqs = {} with open(file_seqs, 'r') as reader: for record in SeqIO.parse(reader, 'fasta'): proteins.append(record.id) seqs[record.id] = str(record.seq) #presence of sigma2 and sigma4 hmmscan_cons = {protein:[False,False] for protein in proteins} with open(args.pfam, 'r') as reader: for row in reader: if not row.startswith('#'): row = row.strip().split(' ') row=list(filter(None, row)) if row[3] == 'Sigma70_r2': if not hmmscan_cons[row[0]][0]: hmmscan_cons[row[0]][0] = (row[17],row[18]) elif (row[3] == 'Sigma70_r4' or row[3] == 'Sigma70_r4_2'): if not hmmscan_cons[row[0]][1]: hmmscan_cons[row[0]][1] = (row[17],row[18]) #presence of sigma3 hmmscan_s3 = {protein:False for protein in proteins} with open(args.sigma3, 'r') as reader: for row in reader: if not row.startswith('#'): row = row.strip().split(' ') row=list(filter(None, row)) if row[3] == 'Sigma70_r3': hmmscan_s3[row[0]] = True #Score of the general HMM positives = {protein:0 for protein in proteins} with open(args.general, 'r') as reader: for row in reader: if not row.startswith('#'): row = row.strip().split(' ') row=list(filter(None, row)) positives[row[0]] = float(row[7]) # Extract postions of sigma2 and sigma4 hmmscan = {protein:{'Sigma2': [0,0,10], 'Sigma4': [0,0,10]} for protein in proteins} with open(args.pfam, 'r') as reader: for row in reader: if not row.startswith('#'): row = row.strip().split(' ') row=list(filter(None, row)) if row[3] == 'Sigma70_r2' and float(row[11]) < hmmscan[row[0]]['Sigma2'][2]: hmmscan[row[0]]['Sigma2'] = [int(row[19]), int(row[20]), float(row[11])] elif (row[3] == 'Sigma70_r4' or row[3] == 'Sigma70_r4_2')\ and float(row[11]) < hmmscan[row[0]]['Sigma4'][2]: hmmscan[row[0]]['Sigma4'] = [int(row[19]), int(row[20]), float(row[11])] ecfs = [] writer = sys.stdout writer.write('\t'.join(['ID', 'sigma3?','sigma2?', 'sigma4?', 'Distance sigma2-sigma4 (aa)','sigma2_start','sigma2_end','sigma4_start', 'sigma4_end', 'Score general ECF HMM','Type'])+'\n') for p in proteins: writer.write(p+'\t') #Check domains for n in [hmmscan_s3[p], hmmscan_cons[p][0], hmmscan_cons[p][1]]: if n: writer.write('x\t') else: writer.write(' \t') #Distance between sigma2-sigma4 if hmmscan_cons[p][0] and hmmscan_cons[p][1]: writer.write(str(hmmscan[p]['Sigma4'][0]-(hmmscan[p]['Sigma2'][1]+1))+'\t') else: writer.write('-\t') #Positions of sigma2 and sigma4 (if available) if hmmscan_cons[p][0]: writer.write("{}\t{}\t".format(*hmmscan[p]['Sigma2'][0:2])) else: writer.write('-\t-\t') if hmmscan_cons[p][1]: writer.write("{}\t{}\t".format(*hmmscan[p]['Sigma4'][0:2])) else: writer.write('-\t-\t') #Check general model score writer.write(str(positives[p])+'\t') #make decision if hmmscan_cons[p][0] and hmmscan_cons[p][1]: if (hmmscan[p]['Sigma4'][0]-(hmmscan[p]['Sigma2'][1]+1)) >=50: writer.write('Non-ECF\n') elif positives[p] >= 60.8: writer.write('ECF\n') ecfs.append(p) else: writer.write('Non-ECF\n') elif hmmscan_cons[p][0] or hmmscan_cons[p][1]: if hmmscan_s3[p]: writer.write('Non-ECF\n') else: writer.write('ECF-like\n') else: writer.write('Non-ECF\n') #Extract only the conserved domains with open(args.conserved, 'w') as writer: for p in ecfs: start2 = hmmscan[p]['Sigma2'][0]-1 end2 = hmmscan[p]['Sigma2'][1] start4 = hmmscan[p]['Sigma4'][0]-1 end4=hmmscan[p]['Sigma4'][1] writer.write('>{}\n{}{}\n'.format(p, seqs[p][start2:end2], seqs[p][start4:end4]))