#!/usr/bin/python
# moad.dissect.py
# 20100718
# JAHorst
##################
## 1 file for each valid interaction, contains only [Smiles_String_isomeric]
## sum file with 1 line for each interaction, tabulated
###############
#source_http:=
#all corresponding PDBs:

import os

moad_file = '/Users/jeremyhorst/usr/dbs/Drugs/bindingMOAD/every.csv'
moad_output_smile_files = 'moad.smi/'
unwanted_ligands = 'ALA ARG ASN ASP CYS GLN GLU GLY HIS ILE LEU LYS MET PHE PRO SER THR TRP TYR VAL MSE'.split()

###############################
# now dissecting the file


def ensure_directory(path):
    """Create directory `path`, tolerating only the "already exists" case.

    Any other failure (permissions, missing parent, a regular file in the
    way) is re-raised instead of being silently ignored.
    """
    try:
        os.mkdir(path)
    except OSError:
        if not os.path.isdir(path):
            raise


def parse_moad_lines(lines, unwanted=None):
    """Collect the valid ligand interactions found in MOAD CSV lines.

    The row type is decided by the number of commas among the first three
    characters of the line:

    * 0 commas, non-empty first column: Enzyme Commission categorization.
    * 1 or 2 commas, non-empty third column: PDB id (representative, or
      other non-representative instance, of the EC family).
    * 3 commas: main data row (ligand, validity, measure, relation,
      magnitude, units, SMILES in columns 3..9).

    Lines starting with a double quote, and lines consisting of a single
    comma, are skipped.  Anything else is reported on stdout with the
    prefix ``wtf:``.

    Args:
        lines: iterable of text lines (an open file works).
        unwanted: ligand names to discard; defaults to the module-level
            ``unwanted_ligands``.

    Returns:
        dict mapping ligand name to a list of
        ``[smile, PDBid, EC, affinity]`` lists, in input order.  ``EC`` and
        ``PDBid`` are the most recently seen header values (empty strings
        if a data row precedes any header).
    """
    if unwanted is None:
        unwanted = unwanted_ligands
    unwanted = frozenset(unwanted)

    data = {}
    ec = ''
    pdb_id = ''
    for raw_line in lines:
        # Drop the line terminator first: otherwise a value taken from the
        # last column keeps its '\n', which corrupts the tabulated output
        # and defeats the 'n/a' comparison below.
        line = raw_line.rstrip('\r\n')
        if line.startswith('"') or line.strip() == ',':
            continue

        fields = line.split(',')
        leading_commas = line[:3].count(',')

        # Enzyme commission categorization
        if leading_commas == 0 and fields[0].strip():
            ec = fields[0]
        # representative PDB (of EC family), or other PDB
        # (non-representative instance of EC family)
        elif leading_commas in (1, 2) and len(fields) > 2 and fields[2].strip():
            pdb_id = fields[2]
        # main data
        elif leading_commas == 3:
            ligand_tokens = fields[3].split() if len(fields) > 3 else []
            if len(fields) < 10 or not ligand_tokens:
                # Too short to hold a SMILES column, or no ligand name.
                print('wtf: ' + line)
                continue
            ligand = ligand_tokens[0]
            validity = fields[4]
            measure = fields[5]
            # NOTE(review): column 6 (the relation between measure and
            # magnitude) is not used; the affinity string below always
            # uses '='.  Confirm whether that is intended.
            magnitude = fields[7]
            units = fields[8]
            smile = fields[9].strip()
            if not smile:
                # Fall back to the last column when column 9 is blank.
                smile = fields[-1].strip()

            # filter unuseful entries
            if validity != 'valid' or not smile or smile == 'n/a':
                continue
            if ligand in unwanted:
                continue

            affinity = ''
            if magnitude.strip():
                affinity = measure + '=' + magnitude + '_' + units
            # add to info tab line
            data.setdefault(ligand, []).append([smile, pdb_id, ec, affinity])
        else:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('wtf: ' + line)
    return data


def write_moad_outputs(data, table_path, smile_dir):
    """Write the summary table and one SMILES file per ligand.

    Args:
        data: mapping as returned by ``parse_moad_lines``.
        table_path: output path of the tab-separated table; one line per
            interaction (``ligand, smile, PDBid, EC, affinity``).  Each
            line is also echoed to stdout.
        smile_dir: existing directory receiving ``<ligand>.smi`` files,
            each holding the SMILES of that ligand's first interaction.
    """
    # write out as 1 line per drug
    with open(table_path, 'w') as summation_writer:
        for ligand in data:
            for entry in data[ligand]:
                row = '\t'.join([ligand] + entry)
                summation_writer.write(row + '\n')
                print(row)
            # add SMILE file
            smile_path = os.path.join(smile_dir, ligand + '.smi')
            with open(smile_path, 'w') as smile_file:
                smile_file.write(data[ligand][0][0])


def main():
    """Dissect ``moad_file`` into ``moad.tab`` and per-ligand .smi files."""
    ensure_directory(moad_output_smile_files)
    with open(moad_file) as moad_handle:
        data = parse_moad_lines(moad_handle, unwanted_ligands)
    write_moad_outputs(data, 'moad.tab', moad_output_smile_files)


if __name__ == '__main__':
    main()

#######
# notes
# Drug_Target_1_PDB_ID:
# ?maybe leave out of future MATRIX comparisons because it is already in the PDB?)
#KEGG:http://www.bioinformatics.jp/en/keggftp.html
#SMSD:http://www.ebi.ac.uk/thornton-srv/software/SMSD/documentation.php