#!/usr/bin/python
# drugbank.seq_setup.py
# 20100723
# JAHorst

import os
from subprocess import call
from subprocess import Popen
from shutil import copyfile

PDBseq = '/guitar1/home/horst/dbs/pdball.fsa'
drugbank_j = '/guitar1/home/horst/dbs/drugbank/drugbank.tab'
blastbin = '/guitar1/home/horst/bin/blast/bin/'

drug_seq_outdir = 'drugbank.seqs'
try:    os.mkdir(drug_seq_outdir)
except: already_there = True


no_uniprot_set = []


db_pdb_uniprot = []
for line in open(drugbank_j).readlines():
	drugbankID = line.split()[0]
	# only apply to drugs tht have smile strings = small molecules
	if line.split()[1] != '-' and drugbankID[-1] == 's':
		# grab the PDBs listed as targets, and corresponding UniProts
		target_info = line.split('|')[1].split()
		for IDs in range( len(target_info) /2 ):
			pdbID     = target_info[ (IDs*2) ].lower()
			uniprotID = target_info[ (IDs*2)+1 ]
			db_pdb_uniprot += [[ drugbankID , pdbID , uniprotID ]]


for (drugbankID , pdbID , uniprotID) in db_pdb_uniprot:
	
	outname_prefix = drug_seq_outdir +'/'+drugbankID
	
	# if only UniProt, use that
	if pdbID == '-' and not uniprotID == '-':
		# grab UniProt sequence
		command = 'wget http://www.uniprot.org/uniprot/%s.fasta' % uniprotID
		call(command.split())
		# write to file named by drugbankID
		try:  os.rename(uniprotID+'.fasta', outname_prefix +'-'+ uniprotID +'.fasta')
		except:  # sometimes uniprot IDs are defunct, BUT re-directed
			files_here = os.listdir('.')
			for f in files_here:
				if f.endswith('.fasta'):  os.rename( f , outname_prefix+'-'+ f)
		
	elif uniprotID == '-' and pdbID == '-':
		no_uniprot_set += [[drugbankID , pdbID , uniprotID]]
		###############
		#  UNRESOLVED #
		###############
		# could check if there is only one chain
		# but if there is more than one, there may be no way to tell...
		# again could run blast on the chains & see if they are equal, then pick longest
		# ... but lets just see if this comes up
		
	else: 
		# grab the PDB sequence
		PDBsequences = []
		seqs = open(PDBseq).read().split(pdbID)
		for seq in seqs[1:]:
			chain = seq[0]
			sequence = ''.join(seq.split('>')[0].split('\n')[1:-1])
			if sequence.strip():
				PDBsequences += [[chain,sequence]]
		
		# if only one chain in PDB: grab the sequence; keep it.
		if len(PDBsequences) == 1:
			# write to file named by drugbankID
			writer = open(outname_prefix +'-'+ pdbID +'.fasta' ,'w')
			writer.write('>'+drugbankID+'\n'+ PDBsequences[0][1])
			writer.close()
			
		elif len(PDBsequences) == 0:
			skip = True
			###############
			#  UNRESOLVED #
			###############
			# if there is no entry for the PDB
			# - it may be RNA/DNA
			# - it may just not be in the DB
			
		else:
			# gather the PDB chain sequences
			os.mkdir(pdbID)
			pdbID_searchable = pdbID+'/'+pdbID+'.fasta'
			writer = open(pdbID_searchable ,'w')
			for PDBsequence in PDBsequences:
				writer.write('>'+ pdbID+PDBsequence[0] +'\n'+ PDBsequence[1] +'\n')
			writer.close()
			
			# make a ncbi-blast searchable DB
			command = '%s/formatdb -i %s' % (blastbin, pdbID_searchable)
			call(command.split())
			
			# grab the UniProt sequence
			if not uniprotID == '-':
				command = 'wget http://www.uniprot.org/uniprot/%s.fasta' % uniprotID
				call(command.split())
				# write to file named by drugbankID
				try: os.rename(uniprotID+'.fasta', pdbID +'/'+uniprotID+'.fasta')
				except:  # sometimes uniprot IDs are defunct, BUT re-directed
					files_here = os.listdir('.')
					for f in files_here:
						if f.endswith('.fasta'):  os.rename( f , outname_prefix+'-'+ f)
					uniprotID = f.split('.')[0]
				
				# search the PDB chain DB with the UniProt sequence, using blast
				command = "%s/blastpgp -i %s/%s.fasta -o %s/%s.psiblast_output -d %s -m 7" % (blastbin, pdbID,uniprotID, pdbID,uniprotID, pdbID_searchable)
				print command
				call(command.split())
				# parse the output to find which chain is best, keep it
				try:
					best_chain = open(pdbID+'/'+uniprotID+'.psiblast_output').read().split('<Hit_def>')[1].split('<')[0]
					writer = open(outname_prefix +'-'+ best_chain +'.fasta' ,'w')
					writer.write( '>'+drugbankID+'\n' + open(pdbID_searchable).read().split(best_chain)[1].split('>')[0] )
					writer.close()
				except:
					# if there is no <Hit_def>, the pdb & uniprot do not match
					# presumably this is the case when the drug disrupts two DIFFERENT proteins
					# --> keep both
					# save uniprot
					os.rename( pdbID+'/'+uniprotID+'.fasta' , outname_prefix+'-'+uniprotID+'.fasta' )
					###############
					#  UNRESOLVED #
					###############
					# save pdb... but which one?
					# the first is the best bet.. (PDBsequences[0][0])
					writer = open(outname_prefix +'-'+ PDBsequences[0][0] +'.fasta' ,'w')
					writer.write( '>'+drugbankID+'\n' + open(pdbID_searchable).read().split(PDBsequences[0][0])[1].split('>')[0] )
					writer.close()
			else:
				###############
				#  UNRESOLVED #
				###############
				# save pdb... but which one?
				# the first is the best bet.. (PDBsequences[0][0])
				writer = open(outname_prefix +'-'+ PDBsequences[0][0] +'.fasta' ,'w')
				writer.write( '>'+drugbankID+'\n' + open(pdbID_searchable).read().split(PDBsequences[0][0])[1].split('>')[0] )
				writer.close()
				
				
			# clean up
			for f in os.listdir(pdbID):
				os.remove(pdbID+'/'+f)
			os.rmdir(pdbID)

###############
#  UNRESOLVED #
###############
os.remove('formatdb.log')
print '\n\n\n\nNo uniprot ID for the following:'
for i in no_uniprot_set:
	print '\t'.join(i)