IMP  2.4.0
The Integrative Modeling Platform
em2d/clustering_of_pdb_models.py

This example clusters pdb models of an structure, chosen from a selection file. It is assumed that all the pdb files belong to the same structure and that the order of the atoms in the pdb files is the same in all files.

After the clustering procedure, a linkage matrix is generated.

1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 from __future__ import print_function
12 import IMP
13 import IMP.algebra
14 import IMP.core
15 import IMP.atom
16 import IMP.em2d
17 import os
18 import sys
19 import csv
20 """
21  Clustering of pdb models.
22  This script clusters pdb models of an structure, chosen from a
23  selection file.
24  - It is assumed that all the pdb files belong to the same structure
25  and that the order of the atoms in the pdb files is the same in all files
26  - After the clustering procedure, a linkage matrix is generated.
27 
28 
29 """
30 
31 if sys.platform == 'win32':
32  sys.stderr.write("this example does not currently work on Windows\n")
33  sys.exit(0)
34 
35 
36 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
37  """ ge the columns of a file:
38  cols - a list of columns to extract. E.g [0,3,5]
39  If empty, all the columns are extracted
40  lines starting with the comment character are ignored """
41  columns = [[] for i in cols]
42  # get a reader for the file
43  reader = csv.reader(
44  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
45  for row in reader:
46  if(row != [] and row[0][0] != comment): # not empty or comment row
47  if(cols == []):
48  for i in range(0, len(row)):
49  columns[i].append(row[i])
50  else:
51  for i in range(0, len(cols)):
52  columns[i].append(row[cols[i]])
53  return columns
54 
55 
56 def argmin(sequence):
57  """ Argmin function: Returns the pair (min_value,min_index),
58  where min_index is the index of the minimum value
59  """
60  min_value = sequence[0]
61  min_index = 0
62  for i in range(0, len(sequence)):
63 # print "argmin - checking ",sequence[i]
64  if(sequence[i] < min_value):
65  min_value = sequence[i]
66  min_index = i
67 # print "argmin - selecting ",min_value,min_index
68  return min_value, min_index
69 
70 #***************************
71 
72 
73 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
74 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
75 # Load models
76 print("Reading models ...")
77 model = IMP.kernel.Model()
79 coords = []
80 fn_models = IMP.em2d.read_selection_file(fn_selection)
81 n_models = len(fn_models)
82 hierarchies = []
83 for fn in fn_models:
84  fn_model = IMP.em2d.get_example_path(fn)
85  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
86  hierarchies.append(h)
88  coords.append([x.get_coordinates() for x in xyz])
89 
90 print("Computing matrix of RMSD ...")
91 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
92 transformations = [[[] for i in range(0, n_models)]
93  for j in range(0, n_models)]
94 # fill rmsd and transformations
95 for i in range(0, n_models):
96  for j in range(i + 1, n_models):
97  if(i != j):
99  coords[i],
100  coords[j])
101  transformations[i][j] = t
102  transformations[j][i] = t.get_inverse()
103  temp = [t.get_transformed(v) for v in coords[i]]
104  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
105  rmsds[i][j] = rmsd
106  rmsds[j][i] = rmsd
107 
108 # cluster
109 print("Clustering (Complete linkage method)...")
110 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
111 mat2 = cluster_set.get_linkage_matrix()
112 print("Complete Linkage Matrix")
113 for m in mat2:
114  print(m)
115 
116 # Read scores from the scores file
117 em2d_scores = get_columns(fn_em2d_scores, [1])
118 em2d_scores = em2d_scores[0]
119 
120 # get biggest clusters below a certain rmsd
121 rmsd_cutoff = 1.4
122 print("clusters below cutoff", rmsd_cutoff, "Angstroms")
123 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
124 for c in clusters:
125  elems = cluster_set.get_cluster_elements(c)
126  scores_elements = []
127  for cid in elems:
128  scores_elements.append(em2d_scores[cid])
129  print("Cluster", c, ":", elems, scores_elements, end=' ')
130  # find model with best score
131  min_value, min_index = argmin(scores_elements)
132  min_elem_id = elems[min_index]
133  # The representative element is the one with the minimum em2d score
134  print("representative element", min_elem_id, min_value)
135  for i in elems:
136  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
137 
138  if(i != min_elem_id):
139  print("Writing element", i, "aligned to ", min_elem_id, ":", pdb_name)
140  T = IMP.core.Transform(transformations[i][min_elem_id])
141  ps = IMP.atom.get_leaves(hierarchies[i])
142  for p in ps:
143  T.apply(p)
144  else:
145  print("Writing representative element", min_elem_id, ":", pdb_name)
146  IMP.atom.write_pdb(hierarchies[i], pdb_name)