CVI - Basic usage
In this example, we integrate PyCVI into the usual clustering pipeline in order to select the best clustering.
If you wish to run the example scripts on your own computer, please first follow the instructions in the section "Running example scripts on your computer".
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from pycvi.cvi import Silhouette
from pycvi.datasets.benchmark import load_data
from pycvi.cluster import get_clustering

from pycvi_examples_utils import plot_true_selected

# ================= Data loading and pre-processing ====================
# Load the datapoints together with their associated labels
data, labels = load_data("xclara", "barton")

# Pre-process the data before clustering it
X = StandardScaler().fit_transform(data)

# The CVI used to assess each clustering; any class defined in
# pycvi.cvi could be used here instead
cvi = Silhouette()

# ============ PyCVI inside the usual clustering pipeline ==============
# Step 1: compute the CVI value of every generated clustering
# Step 2: let the CVI select the best clustering among them

# Both dictionaries are keyed by the assumed number of clusters k
clusterings = {}
cvi_values = {}
for k in range(2, 10):
    # Cluster the data assuming k clusters; any sklearn-like clustering
    # class could replace KMeans here
    predicted_labels = KMeans(n_clusters=k).fit_predict(X)

    # Turn the per-datapoint cluster labels into a list of datapoints
    # for each cluster, which is the format given to the CVI below
    candidate = get_clustering(predicted_labels)

    # Evaluate this clustering with the CVI
    cvi_value = cvi(X, candidate)

    # Keep the clustering and its CVI value for the selection step
    clusterings[k], cvi_values[k] = candidate, cvi_value
    print(f"k={k} | CVI value:{cvi_value}")

# Ask the CVI which k is best given all the computed values
k_selected = cvi.select(cvi_values)
print(f"k selected: {k_selected}")

# ========================== Summary figure ============================
# Plot the clustering derived from the loaded labels next to the
# selected one, then title the figure and save it to disk
fig = plot_true_selected(
    data, get_clustering(labels), clusterings[k_selected]
)
fig.suptitle("KMeans clustering with Silhouette score")
fig.savefig("basic_usage_KMeans_Silhouette.png")
k=2 | CVI value:0.6011074539304297
k=3 | CVI value:0.6918330317974523
k=4 | CVI value:0.5065360692943297
k=5 | CVI value:0.4035627994192592
k=6 | CVI value:0.34741111771185523
k=7 | CVI value:0.3733531217848004
k=8 | CVI value:0.3374355431746513
k=9 | CVI value:0.3355841080432418
k selected: 3