import numpy as np
from MDAnalysis.lib.distances import self_distance_array
from sklearn.cluster import DBSCAN
my_proteins = u.select_atoms("protein")
proteins_split_by_segment = my_proteins.split("segment") # list of atomgroups
def analysis_per_frame():
centers = [protein.center_of_mass() for protein in proteins_split_by_segment]
center_array = np.array(centers)
distance_matrix = self_distance_array(center_array, box=my_proteins.dimensions)
dbscan = DBSCAN(metric="precomputed") # we compute distances in MDAnalysis
all_cluster_labels = dbscan.fit_predict(distance_matrix)
cluster_labels, cluster_counts = np.unique(all_cluster_labels, return_counts=True)
# outliers are labeled -1
if cluster_labels[0] == -1:
# ignore outliers
cluster_labels = cluster_labels[1:]
cluster_counts = cluster_counts[1:]
return cluster_labels, cluster_counts
number_of_clusters_per_frame = []
numbers_of_proteins_in_clusters_per_frame = []
for frame in u.trajectory:
labels, counts = analysis_per_frame()
number_of_clusters_per_frame.append(len(labels))
numbers_of_proteins_in_clusters_per_frame.append(counts)
# number_of_clusters_per_frame will be a list of integers
# so that number_of_clusters_per_frame[3] will give you the number of clusters
# found in the 4th frame
# numbers_of_proteins_in_clusters_per_frame will be a list of lists of integers
# so that numbers_of_proteins_in_clusters_per_frame[3][1] will give you
# the number of proteins in the second cluster of the fourth frame