"""
Base class for cluster sets, i.e. clustering for a range of number of clusters
"""
from collections.abc import Sequence
import numpy as np
from phasik.classes.clustering import ClusterSet
from phasik.drawing.drawing import plot_events, plot_phases
from phasik.drawing.drawing_clusters import plot_cluster_sets, relabel_clustersets
from phasik.drawing.utils import adjust_margin, display_name
# Names exported by `from <this module> import *`.
__all__ = ["ClusterSets"]
class ClusterSets(Sequence):
    """Collection of cluster sets (partitions) of the same timepoints.

    Behaves as a read-only sequence of ``phasik.ClusterSet`` objects, one per
    value in ``ns_max``: ``len(obj)``, ``obj[i]`` and iteration all operate on
    the underlying list of cluster sets.

    Attributes
    ----------
    cluster_sets : list of phasik.ClusterSet
        The cluster sets, in the order they were given
        (also reachable through the legacy name ``clusters_sets``).
    clusters : numpy array
        Array built from the ``clusters`` attribute of every cluster set;
        expected dimensions are (len(ns_max), len(times)).
    n_clusters : numpy array of int
        Number of clusters in each cluster set (partition).
    times : list of (int or float)
        Times associated with the clustered snapshots, taken from the
        first cluster set (all sets are expected to share them).
    distance_metric : str
        Distance metric used to compute the distance between snapshots,
        e.g. 'euclidean', taken from the first cluster set. It must be one of
        the options allowed by scipy.spatial.distance.pdist for its metric
        parameter (e.g. 'chebyshev', 'cityblock', 'correlation', 'cosine',
        'euclidean', 'hamming', 'jaccard', etc.), or a metric listed in
        sklearn's pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
    n_max_type : str
        Method that was used to determine when to stop clustering when
        creating these cluster sets, e.g. clustering until a particular number
        of clusters has been reached ('maxclust'), or until every cluster is
        at least a certain distance away from each other ('distance').
    ns_max : list of int
        List of values corresponding to the n_max_type described above; the
        number of elements in this list is the number of ClusterSet computed.
    silhouettes_average : numpy array
        Average silhouette value of each cluster set.
    """

    def __init__(self, cluster_sets, n_max_type, ns_max):
        """
        Parameters
        ----------
        cluster_sets : iterable of ClusterSet
            Must contain at least one element; any iterable (including a
            generator) is accepted.
        n_max_type : str
            Method that was used to determine when to stop clustering when
            creating these cluster sets, e.g. 'maxclust' (stop at a given
            number of clusters) or 'distance' (stop when clusters are at least
            a given distance apart).
        ns_max : list of int
            List of values corresponding to the n_max_type described above;
            one value per ClusterSet.

        Raises
        ------
        IndexError
            If `cluster_sets` is empty.
        """
        # Materialise once: the input is traversed several times below and
        # indexed, which a one-shot iterator (e.g. a generator) cannot support.
        cluster_sets = list(cluster_sets)
        self._cluster_sets = cluster_sets

        # Summary arrays, one entry (row) per cluster set.
        self.clusters = np.array(
            [cluster_set.clusters for cluster_set in cluster_sets]
        )
        self.n_clusters = np.array(
            [cluster_set.n_clusters for cluster_set in cluster_sets]
        )
        self.silhouettes_average = np.array(
            [cluster_set.silhouette_average for cluster_set in cluster_sets]
        )

        # Times and metric must be the same in all sets, so read them from
        # the first one.
        first = cluster_sets[0]
        self.times = first.times
        self.distance_metric = first.distance_metric

        self.n_max_type = n_max_type
        self.ns_max = ns_max

    def __len__(self):
        """Return the number of cluster sets."""
        return len(self._cluster_sets)

    def __getitem__(self, key):
        """Return the cluster set(s) selected by index or slice `key`."""
        return self._cluster_sets[key]

    @property
    def cluster_sets(self):
        """Returns the list of ClusterSet"""
        return self._cluster_sets

    @property
    def clusters_sets(self):
        """Returns the list of ClusterSet (legacy spelling of `cluster_sets`)"""
        return self._cluster_sets

    @classmethod
    def from_distance_matrix(
        cls, distance_matrix, n_max_type, ns_clusters_max, cluster_method
    ):
        """Generates ClusterSets from a distance matrix

        One ClusterSet is built with ``ClusterSet.from_distance_matrix`` for
        each value in `ns_clusters_max`.

        Parameters
        ----------
        distance_matrix : phasik.DistanceMatrix
            Distance matrix from which to cluster
        n_max_type : str
            The method that determines when to stop clustering. For example,
            cluster set can be created by clustering until a particular number
            of clusters has been reached ('maxclust'), or until every cluster
            is at least a certain distance away from each other ('distance').
        ns_clusters_max : list of int
            List of values corresponding to the n_max_type described above;
            the number of elements in this list is the number of ClusterSet
            computed.
        cluster_method : str
            Clustering method used to cluster the temporal network snapshots.
            Examples : 'k_means', 'centroid', 'average', 'complete',
            'weighted', 'median', 'single', 'ward'

        Returns
        -------
        ClusterSets
        """
        cluster_sets = [
            ClusterSet.from_distance_matrix(
                distance_matrix, n_max_type, n_max, cluster_method
            )
            for n_max in ns_clusters_max
        ]
        return cls(cluster_sets, n_max_type, ns_clusters_max)

    def plot(
        self,
        axs=None,
        coloring="consistent",
        translation=None,
        with_silhouettes=False,
        with_n_clusters=False,
    ):
        """Plots these cluster sets as a scatter graph

        Thin wrapper that forwards all arguments to ``plot_cluster_sets``.

        Parameters
        ----------
        axs : matplotlib.Axes or collection of matplotlib.Axes, optional
            Axes on which to plot
        coloring : {'ascending', 'consistent', None}
            Method for consistent coloring. Default: "consistent".
        translation : dict, optional
            Dictionary with old labels as keys and new labels as values.
            If None (default), has no effect. For example {1: 2, 2: 3, 3: 1}.
            It is applied after the order relabeling from `coloring`.
        with_silhouettes : bool
            If True, also plot the average silhouettes on a 2nd axis.
            Defaults to False.
        with_n_clusters : bool
            If True, also plot the actual number of clusters on a 3rd axis.
            Defaults to False.

        Returns
        -------
        The value returned by ``plot_cluster_sets`` (passed through unchanged).
        """
        return plot_cluster_sets(
            self,
            axs=axs,
            coloring=coloring,
            translation=translation,
            with_silhouettes=with_silhouettes,
            with_n_clusters=with_n_clusters,
        )

    def plot_silhouette_samples(self, axs, coloring="consistent"):
        """Plot the silhouette samples of every cluster set, one per axes

        Parameters
        ----------
        axs : matplotlib.Axes or (nested) collection / array of matplotlib.Axes
            Axes on which to plot; flattened and paired in order with the
            cluster sets. Should hold at least as many items as there are
            cluster sets; pairing stops at the shorter of the two.
        coloring : {'ascending', 'consistent', None}
            If "consistent" (default), the cluster sets are first relabeled
            with ``relabel_clustersets``. Any other value plots the cluster
            sets with their current labels.

        Returns
        -------
        None
        """
        # Work on a local name instead of rebinding `self`.
        cluster_sets = self
        if coloring == "consistent":
            cluster_sets = relabel_clustersets(self)

        # np.ravel accepts numpy arrays (e.g. from plt.subplots) as well as
        # plain lists or a single Axes, unlike the ndarray-only `.flatten()`.
        flat_axes = np.ravel(axs)
        for cluster_set, ax in zip(cluster_sets._cluster_sets, flat_axes):
            cluster_set.plot_silhouette_samples(ax=ax)