CVI - Basic usage

In this example, we integrate PyCVI into the usual clustering pipeline in order to select the best clustering.

If you wish to run the example scripts on your own computer, please first follow the instructions given in the section "Running example scripts on your computer".

 1
 2from sklearn.cluster import KMeans
 3from sklearn.preprocessing import StandardScaler
 4from pycvi.cvi import Silhouette
 5from pycvi.datasets.benchmark import load_data
 6from pycvi.cluster import get_clustering
 7
 8from pycvi_examples_utils import plot_true_selected
 9
# ===================== Preparing the input data =======================
# Fetch the "xclara" benchmark dataset along with its reference labels
data, labels = load_data("xclara", "barton")

# The CVI can be swapped for any other class defined in pycvi.cvi
cvi = Silhouette()

# Rescale the data before clustering
scaler = StandardScaler()
X = scaler.fit_transform(data)

# ============ Using PyCVI inside the clustering pipeline ==============
# Step 1: compute the CVI value of every generated clustering.
# Step 2: let the CVI select the best clustering among them.

k_range = range(2, 10)
clusterings, cvi_values = {}, {}
for k in k_range:

    # Cluster the data under the assumption that there are k clusters.
    # KMeans could be replaced by any sklearn-like clustering class.
    model = KMeans(n_clusters=k)
    labels_pred = model.fit_predict(X)

    # Turn the per-datapoint cluster labels into a list of datapoints
    # for each cluster, then keep that clustering under its k.
    clusters_pred = get_clustering(labels_pred)
    clusterings[k] = clusters_pred

    # Evaluate the clustering with the CVI, keep and report the value
    cvi_value = cvi(X, clusters_pred)
    cvi_values[k] = cvi_value
    print(f"k={k}  |  CVI value:{cvi_value}")

# Ask the CVI which k it selects given all the computed values
k_selected = cvi.select(cvi_values)
print(f"k selected: {k_selected}")

# ======================== Summary figure ==============================

# Plot the reference clustering next to the selected one, then save it
clustering_true = get_clustering(labels)
fig = plot_true_selected(data, clustering_true, clusterings[k_selected])
fig.suptitle("KMeans clustering with Silhouette score")
fig.savefig("basic_usage_KMeans_Silhouette.png")
../_images/basic_usage_KMeans_Silhouette.png
k=2  |  CVI value:0.6011074539304297
k=3  |  CVI value:0.6918330317974523
k=4  |  CVI value:0.5065360692943297
k=5  |  CVI value:0.4035627994192592
k=6  |  CVI value:0.34741111771185523
k=7  |  CVI value:0.3733531217848004
k=8  |  CVI value:0.3374355431746513
k=9  |  CVI value:0.3355841080432418
k selected: 3