Utility functions for the example scripts

All the example scripts showcased in this documentation produce some figures. The code that generates these figures has nothing to do with PyCVI, so we decided to define the plot functions in a separate file so that only PyCVI-related features are emphasized in each example. On this page, you can find the source code of the plot functions that are necessary in order to run the example scripts.

This file has to be copied and saved in the same directory as your example scripts. Alternatively, you can copy the utility functions directly into your scripts (and remove the line starting with "from pycvi_examples_utils import").

If you wish to run the example scripts on your own computer, please first follow the instructions detailed in "Running example scripts on your computer".

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from typing import List, Tuple, Dict, Any
from math import ceil

from pycvi.cvi import CVIs
from pycvi.cluster import get_clustering


def _get_nrows_ncols(nplots: int = None):
    """
    Adapt the figures to the total number of CVIs.

    We want to know before creating the figure how many rows and columns
    we will need. This depends on how many different k will be selected
    and potentially if we have additional plots for the true data.

    Parameters
    ----------
    nplots : int, optional
        Number of plots, by default None, resulting in `nplots=len(CVIs)
        + 2`
    """
    if nplots is None:
        nplots = len(CVIs) + 2
    n_rows = ceil(nplots / 5)
    n_cols = 5
    figsize = (4*n_cols, ceil(4*n_rows))
    return n_rows, n_cols, figsize

def _get_shape_UCR(data: np.ndarray) -> Tuple[Tuple[int], bool]:
    """
    Get the shape (N, T, d) of data and whether it is time series data.

    Parameters
    ----------
    data : np.ndarray
        The original data

    Returns
    -------
    Tuple[Tuple[int], bool]
        the shape (N, T, d) and whether it is time series data
    """
    dims = data.shape
    if len(dims) == 3:
        (N, T, d) = data.shape
        UCR = True
    else:
        (N, d) = data.shape
        T = 1
        UCR = False
    return (N, T, d), UCR

def _get_colors(name: str="Set1") -> List:
    """
    Helper function to get a list of colors

    Parameters
    ----------
    name : str, optional
        Name of the matplotlib cmap, by default "Set1".

    Returns
    -------
    List
        A list of colors. For a listed (qualitative) colormap, this is
        its `colors` attribute; for any other colormap, one RGBA tuple
        per entry of the colormap lookup table.
    """
    # `matplotlib.cm.get_cmap` was deprecated in matplotlib 3.7 and
    # removed in 3.9, whereas `pyplot.get_cmap` is still available.
    cmap = plt.get_cmap(name)
    colors = getattr(cmap, "colors", None)
    if colors is None:
        # Continuous colormaps have no `colors` attribute: sample every
        # entry of their lookup table instead.
        colors = [cmap(i) for i in range(cmap.N)]
    return colors

def plot_cluster(
    ax,
    data: np.ndarray,
    cluster: List[int],
    color,
):
    """
    Plot a given cluster on the given ax.

    Works with UCR data (plot lines), and with non-time series data with
    dimensions d = 1,2 or 3. For other values of d, nothing is plotted.

    With UCR data, only the first feature (`data[:, :, 0]`) is plotted,
    one semi-transparent line per datapoint. With non-time series data,
    a scatter plot is used. In all cases, "color" is used for every
    datapoint of the cluster.

    Note that in the d = 3 case three coordinate arrays are given to
    `ax.scatter`, which therefore expects `ax` to be a 3D axes.

    Parameters
    ----------
    ax : A matplotlib axes
        Where to plot the cluster
    data : np.ndarray
        The dataset, of shape (N, d) or (N, T, d)
    cluster : List[int]
        The indices representing the cluster (anything that can index
        the first axis of `data`, e.g. a boolean mask, also works)
    color : A matplotlib color
        The color to use to plot the cluster

    Returns
    -------
    A matplotlib axes
        The same matplotlib axes, but with the cluster plotted.
    """
    # Get the full shape and whether it is time-series data.
    (_, T, d), UCR = _get_shape_UCR(data)

    # If UCR, use plot type of plots.
    if UCR:
        # Transparency, so that overlapping lines remain readable
        alpha = 0.2
        x = np.arange(T)

        # Plot lines one by one, with the same color.
        for y in data[cluster, :, 0]:
            ax.plot(x, y, c=color, alpha=alpha)
        return ax

    # If non time series data, use scatter plots.
    # Size of the dots
    s = 2
    # `color=` (and not `c=`) is used so that a single RGB(A) tuple is
    # never mistaken for a sequence of values to be colormapped.
    if d == 1:
        y_val = data[cluster, 0]
        # 1D data is plotted along the vertical line x = 0
        x_val = np.zeros_like(y_val)
        ax.scatter(x_val, y_val, s=s, color=color)
    elif d == 2:
        x_val = data[cluster, 0]
        y_val = data[cluster, 1]
        ax.scatter(x_val, y_val, s=s, color=color)
    elif d == 3:
        x_val = data[cluster, 0]
        y_val = data[cluster, 1]
        z_val = data[cluster, 2]
        ax.scatter(x_val, y_val, z_val, s=s, color=color)

    return ax

def plot_center(
    ax,
    data: np.ndarray,
    center,
    color,
):
    """
    Plot a given cluster center on the given ax.

    Works with UCR data (plot lines), and with non-time series data with
    dimensions d = 1,2 or 3. For other values of d, nothing is plotted.

    Note that in the d = 3 case three coordinates are given to
    `ax.scatter`, which therefore expects `ax` to be a 3D axes.

    Parameters
    ----------
    ax : A matplotlib axes
        Where to plot the cluster
    data : np.ndarray
        The dataset, only used to determine the shape (N, T, d) and
        whether it is time series data.
    center :
        The center of a cluster. With non-time series data, any
        array-like with d elements is accepted (e.g. of shape `(d,)` or
        `(1, d)`). With UCR data, it is plotted as is against
        `np.arange(T)`.
    color : A matplotlib color
        The color to use to plot the center

    Returns
    -------
    A matplotlib axes
        The same matplotlib axes, but with the center plotted.
    """
    # Get the full shape and whether it is time-series data.
    (_, T, d), UCR = _get_shape_UCR(data)

    # If UCR, use plot type of plots.
    if UCR:
        # No transparency for the center
        alpha = 1
        x = np.arange(T)
        ax.plot(x, center, c=color, alpha=alpha)
        return ax

    # If non time series data, use scatter plots.
    # Size of the dots
    s = 10
    # Flatten the center so that both (d,) and (1, d) centers are
    # handled in the same way for every d.
    coords = np.ravel(center)
    # `color=` (and not `c=`) is used so that a single RGB(A) tuple is
    # never mistaken for a sequence of values to be colormapped.
    if d == 1:
        # 1D data is plotted along the vertical line x = 0
        ax.scatter(np.zeros_like(coords), coords, s=s, color=color)
    elif d == 2:
        ax.scatter(coords[0], coords[1], s=s, color=color)
    elif d == 3:
        ax.scatter(coords[0], coords[1], coords[2], s=s, color=color)

    return ax

def plot_hist_selected(
    summary_selected: Dict[int, Dict[str, Any]],
):
    """
    Bar plot of number of CVIs that selected a given number of clusters.

    Parameters
    ----------
    summary_selected : Dict[int, Dict[str, Any]]
        A dictionary whose keys are the selected numbers of clusters k
        and whose values are dictionaries. Only the "#CVI" entry of
        each value (the height of the bar) is used here.

    Returns
    -------
    A matplotlib figure
        A new figure with a single bar plot.
    """

    fig, ax = plt.subplots(
        nrows=1, ncols=1, figsize=(5, 5), tight_layout=True
    )
    n_CVIs = [
        summary["#CVI"] for summary in summary_selected.values()
    ]
    # Plot histogram
    ax.bar(list(summary_selected.keys()), n_CVIs)

    # Force x_ticks to be int and to appear also for 0 values.
    # The x limits are floats: round them inwards explicitly instead of
    # relying on `np.arange(..., dtype=int)`, which truncates the start
    # value and can then both put a tick outside of the limits and miss
    # the last k.
    k_min, k_max = ax.get_xlim()
    first_tick = int(np.ceil(k_min))
    last_tick = int(np.floor(k_max))
    ax.xaxis.set_ticks(np.arange(first_tick, last_tick + 1))

    # Labels and titles
    ax.set_xlabel("Selected number of clusters")
    ax.set_ylabel("Number of CVIs that selected this number of clusters")
    fig.suptitle("Number of CVIs that selected a given number of clusters")

    return fig

def plot_only_selected(
    data: np.ndarray,
    summary_selected: Dict[int, Dict[str, Any]],
    fig,
):
    """
    Add one plot per selected number of clusters.

    The plots are added in decreasing order of the number of CVIs that
    selected each k, starting from the third axes of the figure. The
    axes that remain unused are removed from the figure.

    Parameters
    ----------
    data : np.ndarray, shape (N, d)
        Original data, corresponding to a benchmark dataset
    summary_selected : Dict[int, Dict[str, Any]]
        A dictionary containing for each selected k ("k_selected"), all
        information on the selected clustering ("#CVI, "clustering",
        "ax_title")
    fig : A matplotlib figure
        Figure where all the plots are (including 2 about the true
        clusters). It must have at least `len(summary_selected) + 2`
        axes.

    Returns
    -------
    A matplotlib figure
        The same figure, with one clustering per selected k (after the
        first 2 plots)
    """

    colors = _get_colors()

    # -------  Plot the clustering selected by a given CVI -----------

    # Sort the summaries by number of CVIs that selected k (the sort is
    # stable, so ties keep their original order).
    ranked_summaries = sorted(
        summary_selected.values(),
        key=lambda summary: summary["#CVI"],
        reverse=True,
    )
    for i, summary in enumerate(ranked_summaries):

        # Find the ax corresponding to this k
        ax = fig.axes[i+2] # i+2 because there are 2 plots already

        # Add predefined title
        ax.set_title(str(summary["ax_title"]))

        # ------------------ Plot clusters one by one ------------------
        for i_label, cluster in enumerate(summary["clustering"]):
            color = colors[i_label % len(colors)]
            ax = plot_cluster(ax, data, cluster, color)

    # Remove empty axes
    for ax in fig.axes[len(summary_selected)+2:]:
        ax.remove()

    return fig

def plot_selected_clusters(
    data: np.ndarray,
    clusterings_selected: List[List[List[int]]],
    fig,
    titles: List[str],
):
    """
    Add one plot per CVI with their corresponding selected clustering.

    The fig should already contain 2 plots first with the true
    clusterings, and the clusterings obtained with k_true. The axes
    that remain unused after the last CVI are removed from the figure.

    Parameters
    ----------
    data : np.ndarray, shape (N, d)
        Original data, corresponding to a benchmark dataset
    clusterings_selected : List[List[List[int]]]
        A list of n_CVI clusterings. An element can be None, in which
        case only the title of the corresponding axes is set.
    fig : A matplotlib figure
        Figure where all the plots are (including 2 about the true
        clusters)
    titles : List[str]
        List of titles for each CVI

    Returns
    -------
    A matplotlib figure
        The same figure with one clustering per CVI (+2 plots first)
    """
    palette = _get_colors()
    n_selected = len(clusterings_selected)
    # The first 2 axes of the figure are already used.
    n_already_plotted = 2

    for i_CVI, clustering in enumerate(clusterings_selected):
        # The axes and the title are set even without a clustering.
        ax = fig.axes[i_CVI + n_already_plotted]
        ax.set_title(str(titles[i_CVI]))

        if clustering is not None:
            # One color per cluster, cycling through the palette.
            for i_label, cluster in enumerate(clustering):
                ax = plot_cluster(
                    ax, data, cluster, palette[i_label % len(palette)]
                )

    # Get rid of the axes that were not used.
    for unused_ax in fig.axes[n_selected + n_already_plotted:]:
        unused_ax.remove()

    return fig

def plot_true_selected(
    data: np.ndarray,
    clustering_true: List[List[int]],
    clustering_pred: List[List[int]],
    ax_titles: List[str] = None,
):
    """
    Plot the true clustering next to the selected clustering.

    Parameters
    ----------
    data : np.ndarray, shape `(N, d)`
        Original data, corresponding to a benchmark dataset
    clustering_true : List[List[int]]
        True clustering
    clustering_pred : List[List[int]]
        Predicted clustering
    ax_titles : List[str], optional
        List of titles for the two plots, by default None, in which
        case titles giving the number of clusters of each clustering
        are used.

    Returns
    -------
    A matplotlib figure
        A new figure with the true clustering (left) and the selected
        clustering (right), sharing their y axis.
    """
    # The shape itself is not needed here, but this call fails on arrays
    # that are neither 2D nor 3D, before any figure is created.
    _get_shape_UCR(data)
    palette = _get_colors()

    # One row, two axes side by side.
    fig, _ = plt.subplots(
        nrows=1, ncols=2, sharey=True, figsize=(10, 5), tight_layout=True
    )

    # Left: true clustering, right: predicted clustering.
    panels = [clustering_true, clustering_pred]
    if ax_titles is None:
        ax_titles = [
            f"True labels, k={len(clustering_true)}",
            f"Clustering selected, with k={len(clustering_pred)}",
        ]

    for i_ax, clustering in enumerate(panels):
        ax = fig.axes[i_ax]
        ax.set_title(str(ax_titles[i_ax]))

        # One color per cluster, cycling through the palette.
        for i_clus, cluster in enumerate(clustering):
            ax = plot_cluster(
                ax, data, cluster, palette[i_clus % len(palette)]
            )

    return fig

def plot_true_best(
    data: np.ndarray,
    labels: np.ndarray,
    clusterings: List[List[List[int]]],
    VI_best: float = None,
    n_plots: int = None
):
    """
    Plot the true clustering and the clustering obtained with k_true.

    Create also the whole figure that will be used to plot the
    clusterings selected by each CVI.

    Parameters
    ----------
    data : np.ndarray, shape (N, d)
        Original data, corresponding to a benchmark dataset
    labels : np.ndarray, shape (N,)
        True labels. If None, or if every datapoint has its own label,
        all datapoints are considered to be in one single true cluster.
    clusterings : List[List[List[int]]]
        The clusterings obtained with k_true
    VI_best : float, optional
        The VI between the true clustering and the clustering assuming
        the right number of clusters., by default None
    n_plots : int, optional
        Number of plots given to `_get_nrows_ncols` in order to define
        the grid of axes, by default None (`len(CVIs) + 2` plots).
        NOTE(review): the grid is sized for exactly `n_plots` axes, so
        this value presumably has to include the two initial plots;
        confirm against the callers.

    Returns
    -------
    A matplotlib figure
        The figure with 2 plots on it, and many empty axes. None if d
        > 2.
    """

    (N, _, d), _ = _get_shape_UCR(data)

    # Only d = 1 and d = 2 are supported
    if d > 2:
        return None

    colors = _get_colors()

    # ----------------------- Create figure ----------------
    nrows, ncols, figsize = _get_nrows_ncols(n_plots)
    fig, _ = plt.subplots(
        nrows=nrows, ncols=ncols, sharex=True, sharey=True,
        figsize=figsize, tight_layout=True
    )

    # ----------------------- Labels ----------------
    if labels is None:
        labels = np.zeros(N)
    # Make sure that `labels == class` is an element-wise comparison
    labels = np.asarray(labels)
    classes = np.unique(labels)
    if len(classes) == N:
        # One class per datapoint: use one single cluster instead. The
        # classes have to be recomputed, otherwise the masks below
        # would compare the new labels with the old classes.
        labels = np.zeros(N)
        classes = np.unique(labels)
    n_labels = len(classes)

    # ------------------- variables for the 2 axes ----------------
    clusters = [
        # The true clustering, as one boolean mask per class
        [labels == c for c in classes],
        # The clustering obtained with k_true
        clusterings
    ]
    if VI_best is not None:
        ax_titles = [
            f"True labels, k={n_labels}",
            f"Clustering assuming k={n_labels} | VI={VI_best:.4f}",
        ]
    else:
        ax_titles = [
            f"True labels, k={n_labels}",
            f"Clustering assuming k={n_labels}",
        ]

    # ------ True clustering and clustering assuming n_labels ----------
    for i_ax, clustering in enumerate(clusters):
        ax = fig.axes[i_ax]

        # ---------------  Plot clusters one by one --------------------
        # Iterate over the clusters that are actually there, and not
        # over range(n_labels), so that every cluster is plotted.
        for i_label, cluster in enumerate(clustering):
            color = colors[i_label % len(colors)]
            ax = plot_cluster(ax, data, cluster, color)

        # Add title
        ax.set_title(ax_titles[i_ax])

    return fig

def plot_centers(
    data: np.ndarray,
    clustering: List[List[int]],
    cluster_centers: List,
):
    """
    Plot the clustering and their cluster centers

    The i-th cluster and the i-th center use the same color. Colors are
    reused cyclically when there are more clusters than colors.

    Parameters
    ----------
    data : np.ndarray
        The data.
    clustering : List[List[int]]
        The clusters, each given as the indices of its datapoints.
    cluster_centers : List
        The cluster centers of the clusters.

    Returns
    -------
    A matplotlib figure
        A figure with 2 plots: the clustering and the cluster centers
    """
    # The shape itself is not needed here, but this call fails on arrays
    # that are neither 2D nor 3D, before any figure is created.
    _get_shape_UCR(data)
    colors = _get_colors()

    # ----------------------- Create figure ----------------
    fig, _ = plt.subplots(
        nrows=1, ncols=2, sharex=True, sharey=True, figsize=(10, 5),
        tight_layout=True
    )

    # ----------------------- Clustering ----------------
    ax = fig.axes[0]
    ax.set_title("Clustering")
    for i, cluster in enumerate(clustering):
        # Modulo: there can be more clusters than available colors
        ax = plot_cluster(ax, data, cluster, colors[i % len(colors)])

    # ----------------------- Cluster centers ----------------
    ax = fig.axes[1]
    ax.set_title("Cluster centers")
    for i, center in enumerate(cluster_centers):
        ax = plot_center(ax, data, center, colors[i % len(colors)])

    return fig

def plot_aggregator(
    data: np.ndarray,
    labels: np.ndarray,
    clustering: List[List[int]],
    votes: Dict[int, int],
    ax_title: str,
):
    """
    Plot the true, the selected clustering and the Aggregator votes.

    Create the whole figure.

    Parameters
    ----------
    data : np.ndarray, shape (N, d)
        Original data, corresponding to a benchmark dataset
    labels : np.ndarray, shape (N,)
        True labels. If None, or if every datapoint has its own label,
        all datapoints are considered to be in one single true cluster.
    clustering : List[List[int]]
        The selected clustering. Its number of clusters does not have
        to be the true number of clusters.
    votes: Dict[int, int]
        The number of CVI votes each k value got.
    ax_title : str
        Ax title for the selected clustering

    Returns
    -------
    A matplotlib figure
        The figure with 3 plots on it. None if d > 2.
    """

    (N, _, d), _ = _get_shape_UCR(data)

    # Only d = 1 and d = 2 are supported
    if d > 2:
        return None

    colors = _get_colors()

    # ----------------------- Create figure ----------------
    fig, _ = plt.subplots(
        nrows=1, ncols=3, sharex=False, sharey=False,
        figsize=(24,8), tight_layout=True
    )

    # ----------------------- Labels ----------------
    if labels is None:
        labels = np.zeros(N)
    # Make sure that `labels == class` is an element-wise comparison
    labels = np.asarray(labels)
    classes = np.unique(labels)
    if len(classes) == N:
        # One class per datapoint: use one single cluster instead. The
        # classes have to be recomputed, otherwise the masks below
        # would compare the new labels with the old classes.
        labels = np.zeros(N)
        classes = np.unique(labels)
    n_labels = len(classes)

    # ====================== Plotting clusters ======================
    # ------------------- variables for the 2 axes ----------------
    clusters = [
        # The true clustering, as one boolean mask per class
        [labels == c for c in classes],
        # The selected clustering
        clustering
    ]
    ax_titles = [
        f"True labels, k={n_labels}",
        ax_title,
    ]

    # ------------ True clustering and selected clustering -------------
    for i_ax, current_clustering in enumerate(clusters):
        ax = fig.axes[i_ax]

        # ---------------  Plot clusters one by one --------------------
        # Iterate over the clusters that are actually there: the
        # selected clustering can have more or fewer clusters than
        # n_labels.
        for i_label, cluster in enumerate(current_clustering):
            color = colors[i_label % len(colors)]
            ax = plot_cluster(ax, data, cluster, color)

        # Add title
        ax.set_title(ax_titles[i_ax])

    # ====================== Plotting votes ======================
    # Plot histogram
    ax = fig.axes[2]
    ax.bar(list(votes.keys()), list(votes.values()))

    # Force x_ticks to be int and to appear also for 0 values
    xticks = list(votes.keys())
    ax.xaxis.set_ticks(xticks, xticks)

    # Force y_ticks to be int. The y limits are floats: round them
    # outwards explicitly instead of relying on the truncation done by
    # `np.arange(..., dtype=int)` with float bounds.
    vote_min, vote_max = ax.get_ylim()
    first_tick = int(np.floor(vote_min))
    last_tick = int(np.ceil(vote_max))
    ax.yaxis.set_ticks(np.arange(first_tick, last_tick + 1))

    # Labels and titles
    ax.set_xlabel("Selected number of clusters")
    ax.set_ylabel("Number of CVIs' votes")
    ax.set_title("Number of CVIs that selected a given number of clusters")

    return fig