#!/usr/bin/python # drugbank.seq_setup.py # 20100723 # JAHorst import os from subprocess import call from subprocess import Popen from shutil import copyfile PDBseq = '/guitar1/home/horst/dbs/pdball.fsa' drugbank_j = '/guitar1/home/horst/dbs/drugbank/drugbank.tab' blastbin = '/guitar1/home/horst/bin/blast/bin/' drug_seq_outdir = 'drugbank.seqs' try: os.mkdir(drug_seq_outdir) except: already_there = True no_uniprot_set = [] db_pdb_uniprot = [] for line in open(drugbank_j).readlines(): drugbankID = line.split()[0] # only apply to drugs tht have smile strings = small molecules if line.split()[1] != '-' and drugbankID[-1] == 's': # grab the PDBs listed as targets, and corresponding UniProts target_info = line.split('|')[1].split() for IDs in range( len(target_info) /2 ): pdbID = target_info[ (IDs*2) ].lower() uniprotID = target_info[ (IDs*2)+1 ] db_pdb_uniprot += [[ drugbankID , pdbID , uniprotID ]] for (drugbankID , pdbID , uniprotID) in db_pdb_uniprot: outname_prefix = drug_seq_outdir +'/'+drugbankID # if only UniProt, use that if pdbID == '-' and not uniprotID == '-': # grab UniProt sequence command = 'wget http://www.uniprot.org/uniprot/%s.fasta' % uniprotID call(command.split()) # write to file named by drugbankID try: os.rename(uniprotID+'.fasta', outname_prefix +'-'+ uniprotID +'.fasta') except: # sometimes uniprot IDs are defunct, BUT re-directed files_here = os.listdir('.') for f in files_here: if f.endswith('.fasta'): os.rename( f , outname_prefix+'-'+ f) elif uniprotID == '-' and pdbID == '-': no_uniprot_set += [[drugbankID , pdbID , uniprotID]] ############### # UNRESOLVED # ############### # could check if there is only one chain # but if there is more than one, there may be no way to tell... # again could run blast on the chains & see if they are equal, then pick longest # ... but lets just see if this comes up else: # grab the PDB sequence PDBsequences = [] seqs = open(PDBseq).read().split(pdbID) for seq in seqs[1:]: chain = seq[0] sequence = ''.join(seq.split('>')[0].split('\n')[1:-1]) if sequence.strip(): PDBsequences += [[chain,sequence]] # if only one chain in PDB: grab the sequence; keep it. if len(PDBsequences) == 1: # write to file named by drugbankID writer = open(outname_prefix +'-'+ pdbID +'.fasta' ,'w') writer.write('>'+drugbankID+'\n'+ PDBsequences[0][1]) writer.close() elif len(PDBsequences) == 0: skip = True ############### # UNRESOLVED # ############### # if there is no entry for the PDB # - it may be RNA/DNA # - it may just not be in the DB else: # gather the PDB chain sequences os.mkdir(pdbID) pdbID_searchable = pdbID+'/'+pdbID+'.fasta' writer = open(pdbID_searchable ,'w') for PDBsequence in PDBsequences: writer.write('>'+ pdbID+PDBsequence[0] +'\n'+ PDBsequence[1] +'\n') writer.close() # make a ncbi-blast searchable DB command = '%s/formatdb -i %s' % (blastbin, pdbID_searchable) call(command.split()) # grab the UniProt sequence if not uniprotID == '-': command = 'wget http://www.uniprot.org/uniprot/%s.fasta' % uniprotID call(command.split()) # write to file named by drugbankID try: os.rename(uniprotID+'.fasta', pdbID +'/'+uniprotID+'.fasta') except: # sometimes uniprot IDs are defunct, BUT re-directed files_here = os.listdir('.') for f in files_here: if f.endswith('.fasta'): os.rename( f , outname_prefix+'-'+ f) uniprotID = f.split('.')[0] # search the PDB chain DB with the UniProt sequence, using blast command = "%s/blastpgp -i %s/%s.fasta -o %s/%s.psiblast_output -d %s -m 7" % (blastbin, pdbID,uniprotID, pdbID,uniprotID, pdbID_searchable) print command call(command.split()) # parse the output to find which chain is best, keep it try: best_chain = open(pdbID+'/'+uniprotID+'.psiblast_output').read().split('')[1].split('<')[0] writer = open(outname_prefix +'-'+ best_chain +'.fasta' ,'w') writer.write( '>'+drugbankID+'\n' + open(pdbID_searchable).read().split(best_chain)[1].split('>')[0] ) writer.close() except: # if there is no , the pdb & uniprot do not match # presumably this is the case when the drug disrupts two DIFFERENT proteins # --> keep both # save uniprot os.rename( pdbID+'/'+uniprotID+'.fasta' , outname_prefix+'-'+uniprotID+'.fasta' ) ############### # UNRESOLVED # ############### # save pdb... but which one? # the first is the best bet.. (PDBsequences[0][0]) writer = open(outname_prefix +'-'+ PDBsequences[0][0] +'.fasta' ,'w') writer.write( '>'+drugbankID+'\n' + open(pdbID_searchable).read().split(PDBsequences[0][0])[1].split('>')[0] ) writer.close() else: ############### # UNRESOLVED # ############### # save pdb... but which one? # the first is the best bet.. (PDBsequences[0][0]) writer = open(outname_prefix +'-'+ PDBsequences[0][0] +'.fasta' ,'w') writer.write( '>'+drugbankID+'\n' + open(pdbID_searchable).read().split(PDBsequences[0][0])[1].split('>')[0] ) writer.close() # clean up for f in os.listdir(pdbID): os.remove(pdbID+'/'+f) os.rmdir(pdbID) ############### # UNRESOLVED # ############### os.remove('formatdb.log') print '\n\n\n\nNo uniprot ID for the following:' for i in no_uniprot_set: print '\t'.join(i)