CVI - Basic usage with time-series

In this example, we integrate PyCVI into the usual clustering pipeline with time-series data in order to select the best clustering. We use MSM as the distance measure and MBA as the cluster center, both of which are specially designed to handle time-series data.

If you wish to run the example scripts on your own computer, please first follow the instructions detailed in "Running example scripts on your computer".

from aeon.clustering import TimeSeriesKMeans
from sklearn.preprocessing import MinMaxScaler
from pycvi.cvi import Dunn
from pycvi.datasets.benchmark import load_data
from pycvi.cluster import get_clustering

from pycvi_examples_utils import plot_true_selected

# -------------- Standard data handling operations ---------------------
# Load the "Trace" dataset from the "ucr" benchmark collection
data, labels = load_data("Trace", "ucr")
N, T, d = data.shape

# Data pre-processing: stack all (series, time step) pairs as rows so
# that the scaler works per variable (column) and not per time step.
scaler = MinMaxScaler()
flat_data = data.reshape(N * T, d)
scaled_flat = scaler.fit_transform(flat_data)
# Restore the original (N, T, d) layout after scaling
X = scaled_flat.reshape(N, T, d)

# CVI used to score the clusterings; any class defined in pycvi.cvi
# could be used instead.
cvi = Dunn()

# ---------- Integrating PyCVI in the clustering pipeline --------------
# For each candidate number of clusters k:
#   1. generate a clustering and compute its CVI value,
# then, once all candidates have been scored:
#   2. select the best clustering according to the CVI.

clusterings = {}
cvi_values = {}
for k in range(2, 10):

    # Cluster the data assuming k clusters. Any sklearn-like clustering
    # class could replace TimeSeriesKMeans here.
    predicted_labels = TimeSeriesKMeans(n_clusters=k).fit_predict(X)

    # Turn the per-datapoint cluster labels into a list of datapoints
    # for each cluster.
    predicted_clusters = get_clustering(predicted_labels)

    # Score this clustering with the CVI
    cvi_value = cvi(X, predicted_clusters)

    # Keep both the clustering and its score, indexed by k
    clusterings[k] = predicted_clusters
    cvi_values[k] = cvi_value
    print(f"k={k}  |  CVI value:{cvi_value}")

# Let the CVI choose k from the collected scores
k_selected = cvi.select(cvi_values)
print(f"k selected: {k_selected}")

# ---------------------- Summary fig -----------------------------------

# Build the figure from the original (unscaled) data, the clustering
# derived from the true labels, and the clustering selected by the CVI.
true_clusters = get_clustering(labels)
selected_clusters = clusterings[k_selected]
fig = plot_true_selected(data, true_clusters, selected_clusters)

# Set the title before saving so that it is part of the saved image
fig.suptitle("KMeans clustering with Dunn score")
fig.savefig("basic_usage_TS_KMeans_Dunn.png")

../_images/basic_usage_TS_KMeans_Dunn.png

k=2  |  CVI value:0.6221769114650815
k=3  |  CVI value:0.28456695254115594
k=4  |  CVI value:0.06275505651149707
k=5  |  CVI value:0.06936780518684031
k=6  |  CVI value:0.10054989078387304
k=7  |  CVI value:0.09125443657890364
k=8  |  CVI value:0.08277205774244745
k=9  |  CVI value:0.08277205774244745
k selected: 2