CVI - Basic usage with time-series

In this example, we integrate PyCVI into the usual clustering pipeline with time-series data in order to select the best clustering. MSM is used as the distance measure and MBA as the cluster center, both of which are specially designed to handle time-series data.

If you wish to run the example scripts on your own computer, please first follow the instructions detailed in "Running example scripts on your computer".

 1
 2from aeon.clustering import TimeSeriesKMeans
 3from sklearn.preprocessing import MinMaxScaler
 4from pycvi.cvi import Dunn
 5from pycvi.datasets.benchmark import load_data
 6from pycvi.cluster import get_clustering
 7
 8from pycvi_examples_utils import plot_true_selected
 9
# -------------- Standard data handling operations ---------------------
# Load data
data, labels = load_data("Trace", "ucr")
# data is a 3D array; its axes are unpacked as (N, T, d)
N, T, d = data.shape

# Data pre-processing
# The array is first flattened to (N*T, d) so that the scaling is done
# for each variable and not time step wise, then the (N, T, d) layout
# is restored.
scaler = MinMaxScaler()
flattened = data.reshape(N * T, d)
X = scaler.fit_transform(flattened).reshape(N, T, d)

# CVI to use, could be any class defined in pycvi.cvi
cvi = Dunn()
21
# ---------- Integrating PyCVI in the clustering pipeline --------------
# ------ 1. Compute CVI values of the generated clusterings ------------
# ------ 2. Select the best clustering according to the CVI ------------

# Candidate numbers of clusters: 2, 3, ..., 9
k_range = range(2, 10)
# Both dicts are keyed by the number of clusters k
clusterings = {}
cvi_values = {}
for k in k_range:

    # Clustering model to use, could be any sklearn-like clustering
    # class. Here the clusters are generated assuming that there are
    # k clusters.
    # NOTE(review): X is laid out as (N, T, d) while aeon documents its
    # 3D input as (n_cases, n_channels, n_timepoints) -- confirm that
    # this axis order is the intended one for TimeSeriesKMeans.
    kmeans = TimeSeriesKMeans(n_clusters=k)
    predicted_labels = kmeans.fit_predict(X)

    # Convert the predicted cluster-label of each datapoint into a list
    # of datapoints for each cluster.
    clusters_k = get_clustering(predicted_labels)

    # CVI value of this clustering
    cvi_value = cvi(X, clusters_k)

    # Keep both the clustering and its CVI value for the selection step
    clusterings[k], cvi_values[k] = clusters_k, cvi_value
    print(f"k={k}  |  CVI value:{cvi_value}")

# Let the CVI select k from the values computed above
k_selected = cvi.select(cvi_values)
print(f"k selected: {k_selected}")
50
# ---------------------- Summary fig -----------------------------------

# Clustering derived from the labels returned with the dataset
clustering_true = get_clustering(labels)
# Clustering stored for the selected number of clusters
clustering_selected = clusterings[k_selected]

# Note that the original `data` (not the scaled X) is passed for plotting
fig = plot_true_selected(data, clustering_true, clustering_selected)
fig.suptitle("KMeans clustering with Dunn score")
# Saved under a relative file name
fig.savefig("basic_usage_TS_KMeans_Dunn.png")
59
../_images/basic_usage_TS_KMeans_Dunn.png
k=2  |  CVI value:0.6221769114650815
k=3  |  CVI value:0.28456695254115594
k=4  |  CVI value:0.06275505651149707
k=5  |  CVI value:0.06936780518684031
k=6  |  CVI value:0.10054989078387304
k=7  |  CVI value:0.09125443657890364
k=8  |  CVI value:0.08277205774244745
k=9  |  CVI value:0.08277205774244745
k selected: 2