From 1171c9419025e886095ff4a6c27360024a8ad3c2 Mon Sep 17 00:00:00 2001
From: jakehume
Date: Sun, 23 Nov 2025 22:50:38 +0000
Subject: [PATCH] Add temporal spectral community detection

---
 .gitnore => .gitignore                         |   6 +-
 docs/conf.py                                   |   9 +-
 docs/requirements.txt                          |   4 +-
 docs/teneto.communitydetection.rst             |   7 +
 requirements.txt                               |   1 +
 setup.py                                       |   3 +-
 teneto/communitydetection/__init__.py          |  14 +-
 teneto/communitydetection/spectral.py          | 938 ++++++++++++++++++
 test/communitydetection/__init__.py            |   1 +
 .../test_temporal_spectral.py                  | 165 +++
 10 files changed, 1134 insertions(+), 14 deletions(-)
 rename .gitnore => .gitignore (80%)
 create mode 100644 teneto/communitydetection/spectral.py
 create mode 100644 test/communitydetection/__init__.py
 create mode 100644 test/communitydetection/test_temporal_spectral.py

diff --git a/.gitnore b/.gitignore
similarity index 80%
rename from .gitnore
rename to .gitignore
index 974f2d92..092ac7f9 100644
--- a/.gitnore
+++ b/.gitignore
@@ -1,4 +1,3 @@
- 117 lines (94 sloc) 1.32 KB
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -29,5 +28,10 @@ share/python-wheels/
 # Sphinx documentation
 docs/_build/
+docs/api/
+# IDE/editor metadata
+.vscode/
+# macOS Finder artifacts
+teneto/__MACOSX/
diff --git a/docs/conf.py b/docs/conf.py
index 4f20eb64..59f86058 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,7 +18,7 @@
 # -- Project information -----------------------------------------------------
 
 project = 'teneto'
-copyright = '2018-2021, William Hedley Thompson'
+copyright = '2018-2025, William Hedley Thompson'
 author = 'William Hedley Thompson'
 
 version = "../teneto/_version.py"
@@ -45,7 +45,6 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'matplotlib.sphinxext.only_directives',
     'matplotlib.sphinxext.plot_directive',
     'sphinx.ext.autodoc',
     'sphinx.ext.autosummary',
@@ -56,7 +55,7 @@
     'sphinx.ext.viewcode',
     'sphinx.ext.githubpages',
     'sphinx.ext.napoleon',
-    'm2r',
+    'myst_parser',
     'sphinx_automodapi.automodapi',
     'sphinx.ext.autosectionlabel',
     'nbsphinx',
@@ -79,7 +78,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = 'en'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -200,5 +199,3 @@ numpydoc_show_class_members = False
 # MOCK MODULESS
 autodoc_mock_imports = ['tables', 'scikit-learn', 'joblib', 'nilearn', 'NiftiLabelsMasker']
-
-
diff --git a/docs/requirements.txt b/docs/requirements.txt
index e2b6903a..a059238d 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -5,10 +5,10 @@ pybids>=0.11.1
 #python-igraph>=0.7.1
 #louvain >= 0.6.1
 statsmodels>=0.8.0
-m2r>=0.2.1
+myst-parser>=2.0
 nbsphinx>=0.4.2
 sphinx-automodapi>=0.9
 networkx>=2.0
 python-louvain>=0.13
-matplotlib==2.2.0
+matplotlib>=3.8
 templateflow>=0.6.2
diff --git a/docs/teneto.communitydetection.rst b/docs/teneto.communitydetection.rst
index 4752ff6e..1fe4b156 100644
--- a/docs/teneto.communitydetection.rst
+++ b/docs/teneto.communitydetection.rst
@@ -1,6 +1,13 @@
 teneto.communitydetection
 =================================
 
+Spectral
+----------------------------------------
+
+.. automodule:: teneto.communitydetection.spectral
+    :members:
+    :show-inheritance:
+
 Louvain
 ----------------------------------------
diff --git a/requirements.txt b/requirements.txt
index 91aff603..d0240542 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ statsmodels>=0.8.0
 networkx>=2.0
 python-louvain>=0.13
 tables>=3.4.4
+scikit-learn>=1.0
diff --git a/setup.py b/setup.py
index 7e233a02..ca3fa6da 100755
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,8 @@
                       'pandas>=0.21',
                       'scipy>=1.4.1',
                       'numpy>=1.19.5',
-                      'templateflow>=0.6.2'],
+                      'templateflow>=0.6.2',
+                      'scikit-learn>=1.0'],
     description='Temporal network tools',
     packages=find_packages(),
     author='William Hedley Thompson',
diff --git a/teneto/communitydetection/__init__.py b/teneto/communitydetection/__init__.py
index 45166a19..0e233578 100755
--- a/teneto/communitydetection/__init__.py
+++ b/teneto/communitydetection/__init__.py
@@ -1,6 +1,12 @@
-"""Import of community detection"""
+"""Community detection convenience imports."""
 from teneto.communitydetection.louvain import *
 from teneto.communitydetection.tctc import tctc
-#from teneto.communitydetection.louvain import find_tctc
-__all__ = ['tctc', 'temporal_louvain',
-           'make_consensus_matrix', 'make_temporal_consensus', ]
+from teneto.communitydetection.spectral import temporal_spectral
+
+__all__ = [
+    "tctc",
+    "temporal_louvain",
+    "make_consensus_matrix",
+    "make_temporal_consensus",
+    "temporal_spectral",
+]
diff --git a/teneto/communitydetection/spectral.py b/teneto/communitydetection/spectral.py
new file mode 100644
index 00000000..a3c31681
--- /dev/null
+++ b/teneto/communitydetection/spectral.py
@@ -0,0 +1,938 @@
+
+"""Temporal community detection by fitting a Grassmannian geodesic to spectral embeddings."""
+from __future__ import annotations
+
+import logging
+import time
+from abc import ABC, abstractmethod
+from typing import Iterable, List, Tuple
+
+import numpy as np
+from scipy import sparse
+from scipy.linalg import cosm, sinm
+from scipy.ndimage import gaussian_filter1d
+from scipy.signal import medfilt
+from scipy.sparse import csr_matrix, issparse
+from scipy.sparse.linalg import eigsh, svds
+from sklearn.cluster import KMeans
+
+from teneto.classes import TemporalNetwork
+from teneto.utils import process_input
+
+
+__all__ = ["temporal_spectral"]
+
+logger = logging.getLogger(__name__)
+
+
+def _as_sparse_snapshots(tnet: TemporalNetwork) -> Tuple[list[csr_matrix], int, int]:
+    """Return sparse adjacency snapshots and their dimensions."""
+    arr = tnet.df_to_array(start_at="auto")
+    if arr.ndim != 3:
+        raise ValueError(f"Expected a 3D array (nodes, nodes, time); got {arr.shape}")
+
+    n_nodes, _, n_time = arr.shape
+    snapshots: list[csr_matrix] = []
+    for t in range(n_time):
+        # Copy to avoid mutating the TemporalNetwork backing array.
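+        # Symmetrize each slice: zero the diagonal, keep the upper triangle,
+        # and mirror it, so every snapshot enters the pipeline as an
+        # undirected adjacency matrix before being sparsified.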
+        mat = np.array(arr[:, :, t], copy=True)
+        np.fill_diagonal(mat, 0)
+        mat = np.triu(mat, k=1)
+        mat = mat + mat.T
+        snapshots.append(csr_matrix(mat))
+    return snapshots, n_nodes, n_time
+
+
+def _labels_list_to_matrix(labels_list: Iterable[Iterable[int]]) -> np.ndarray:
+    """Convert a list of per-time label vectors into a (node, time) array."""
+    label_series = list(labels_list)
+    if not label_series:
+        raise ValueError("No community assignments were returned.")
+
+    n_nodes = len(label_series[0])
+    labels = np.empty((n_nodes, len(label_series)), dtype=int)
+    for t_idx, lab in enumerate(label_series):
+        lab = np.asarray(lab, dtype=int)
+        if lab.shape[0] != n_nodes:
+            raise ValueError("Community label lengths must match node count at each timepoint.")
+        labels[:, t_idx] = lab
+    return labels
+
+
+def temporal_spectral(
+    tnet,
+    ke: int,
+    kc: int | list[int] | str = "auto",
+    mode: str = "simple-nsc",
+    stable_communities: bool = False,
+    smoothing_filter: str | None = None,
+    smoothing_parameter: float | None = None,
+    return_embeddings: bool = False,
+    use_intermediate_iterations: bool = False,
+    num_intermediate_iterations: int = 20,
+    which_eig: str = "smallest",
+    **mode_kwargs,
+) -> np.ndarray | Tuple[np.ndarray, List[np.ndarray]]:
+    """
+    Detect temporal communities by geodesically modeling spectral embeddings.
+
+    Parameters
+    ----------
+    tnet : array | dict | TemporalNetwork
+        Temporal network input accepted by :func:`teneto.utils.process_input`.
+    ke : int
+        Dimension of the learned spectral embeddings. Should be an upper bound
+        on the expected number of communities at any timepoint. (In classical
+        static spectral clustering, ``ke`` equals the target number of
+        communities ``kc``.)
+    kc : int | list[int] | 'auto'
+        Number of communities to detect per snapshot: a single int used at
+        every snapshot, a list of ints coindexed with the snapshots, or
+        ``'auto'`` to select the count per snapshot from a benefit curve.
+    mode : str
+        Algorithm mode, e.g. ``'simple-nsc'`` (normalized spectral clustering,
+        default), ``'simple-usc'`` (unnormalized), or ``'simple-smm'``
+        (spectral modularity maximization).
+    stable_communities : bool
+        If ``True`` and ``kc='auto'``, a single community count is enforced
+        across time by maximizing the benefit curve aggregated over all
+        snapshots.
+    smoothing_filter : str | None
+        ``'median'`` or ``'gaussian'`` to optionally smooth the benefit-vs-``k``
+        curve over time when ``kc='auto'``, nudging the selection toward more
+        stable community counts.
+    smoothing_parameter : float | None
+        Parameter of the selected smoothing filter: an odd-integer kernel size
+        for ``'median'`` or the standard deviation for ``'gaussian'``.
+    return_embeddings : bool
+        If ``True``, also return the per-time embeddings used for clustering.
+    use_intermediate_iterations : bool
+        If ``True``, keep intermediate geodesic-fitting iterations and
+        concatenate their embeddings onto the final ones.
+    num_intermediate_iterations : int
+        Number of trailing iterations retained when
+        ``use_intermediate_iterations`` is set. Because the majorize-minimize
+        fit improves its objective monotonically, the intermediate fits can
+        act as a softer regularizer than the final geodesic alone.
+    which_eig : str
+        Which end of the clustering matrix's spectrum carries the embedding:
+        ``'smallest'`` (default, e.g. Laplacian eigenvectors), ``'largest'``,
+        or ``'svd'``.
+    mode_kwargs :
+        Additional keyword arguments forwarded to the smoother subclass, e.g.
+        ``t`` (custom snapshot times), ``max_iter``, or ``benefit_fn``.
+
+    Returns
+    -------
+    communities : ndarray (nodes, time)
+        Array of community assignments, indexed by node and timepoint.
+    embeddings : list[np.ndarray], optional
+        Returned when ``return_embeddings`` is ``True``.
+
+    References
+    ----------
+    Hume, J. and Balzano, L. (2025).
+    A Spectral Framework for Tracking Communities in Evolving Networks.
+    In *Proceedings of the Third Learning on Graphs Conference (LoG 2024)*,
+    Proceedings of Machine Learning Research, vol. 269, pp. 9:1–9:34.
+    Available at https://proceedings.mlr.press/v269/hume25a.html.
+    """
+    tnet = process_input(tnet, ["C", "G", "TN"], "TN")
+    snapshots, n_nodes, n_time = _as_sparse_snapshots(tnet)
+
+    assignments, embeddings = _spectral_geodesic_smoothing(
+        sadj_list=snapshots,
+        T=n_time,
+        num_nodes=n_nodes,
+        ke=ke,
+        kc=kc,
+        stable_communities=stable_communities,
+        mode=mode,
+        smoothing_filter=smoothing_filter,
+        smoothing_parameter=smoothing_parameter,
+        return_geo_embeddings_only=False,
+        use_intermediate_iterations=use_intermediate_iterations,
+        num_intermediate_iterations=num_intermediate_iterations,
+        which_eig=which_eig,
+        **mode_kwargs,
+    )
+
+    communities = _labels_list_to_matrix(assignments)
+    if return_embeddings:
+        return communities, embeddings
+    return communities
+
+
+def _is_symmetric(matrix, tol=1e-8):
+    if issparse(matrix):
+        return (matrix != matrix.T).nnz == 0
+    return np.allclose(matrix, matrix.T, atol=tol)
+
+
+def _theta_majorizer(H, Y, X):
+    T = len(X)
+    k = H.shape[1]
+
+    r = np.empty((T, k), dtype=np.float64)
+    phi = np.empty((T, k), dtype=np.float64)
+    bias = np.empty((T, k), dtype=np.float64)
+
+    symmetric_flags = [_is_symmetric(Xi) for Xi in X]
+
+    if not H.flags['C_CONTIGUOUS']:
+        H = np.ascontiguousarray(H)
+    if not Y.flags['C_CONTIGUOUS']:
+        Y = np.ascontiguousarray(Y)
+
+    for ii in range(T):
+        Xi = X[ii]
+        symmetric = symmetric_flags[ii]
+
+        if symmetric:
+            XiT_H = Xi @ H
+            XiT_Y = Xi @ Y
+        else:
+            if issparse(Xi):
+                XiT = Xi.transpose().tocsr()
+            else:
+                XiT = Xi.T
+            XiT_H = XiT @ H
+            XiT_Y = XiT @ Y
+
+        a = np.sum(np.abs(XiT_H)**2, axis=0)
+        b = np.real(np.sum(XiT_Y.conj() * XiT_H, axis=0))
+        c = np.sum(np.abs(XiT_Y)**2, axis=0)
+
+        a_minus_c_over_2 = (a - c) / 2.0
+        two_b = 2.0 * b
+
+        np.sqrt(a_minus_c_over_2**2 + b**2, out=r[ii, :])
+        np.arctan2(two_b, a - c, out=phi[ii, :])
+        bias[ii, :] = (a + c) / 2.0
+
+    return r, phi, bias
+
+
+def _estimate_theta(H, Y, X, t, niter=5, tH=0, Theta_init=None):
+    r, phi, bias = _theta_majorizer(H, Y, X)
+
+    t = np.asarray(t, dtype=np.float64)[:, np.newaxis] - tH
+    tt = 2.0 * t
+
+    n2tr = -tt * r
+    L = tt * n2tr
+
+    if Theta_init is None:
+        Theta = np.ones(H.shape[1], dtype=np.float64)  # Shape: (k,)
+    else:
+        Theta = Theta_init.astype(np.float64).copy()  # Shape: (k,)
+
+    for _ in range(niter):
+        arg = tt * Theta
+        arg -= phi
+
+        gradf = n2tr * np.sin(arg)
+
+        denom = (arg + np.pi) % (2 * np.pi) - np.pi
+
+        with np.errstate(divide='ignore', invalid='ignore'):
+            curvf = np.divide(tt * gradf, denom, out=np.zeros_like(tt * gradf), where=denom!=0)
+            curvf += L * (denom == 0)
+
+        gradf_sum = np.sum(gradf, axis=0)
+        curvf_sum = np.sum(curvf, axis=0)
+        step = np.divide(gradf_sum, curvf_sum, out=np.zeros_like(gradf_sum), where=curvf_sum!=0)  # Shape: (k,)
+
+        Theta -= step
+
+    return Theta
+
+
+def _estimate_point_tangent(H, Y, Theta, X, t, tH=0):
+    M_AB = 0.0
+
+    for ti, Xi in zip(t, X):
+        arg = Theta * (ti - tH)  # (k,)
+        cos_arg = np.cos(arg)  # (k,)
+        sin_arg = np.sin(arg)  # (k,)
+        Ui = H * cos_arg + Y * sin_arg
+
+        if _is_symmetric(Xi):
+            G = Xi @ Ui  # (n_i, k)
+            XG = G  # for symmetric Xi, Xi @ Ui already equals Xi.T @ Ui
+        else:
+            G = Ui.T @ Xi  # (k, n_i)
+            XG = Xi @ G.T  # (d, k)
+
+        XGcos = XG * cos_arg
+        XGsin = XG * sin_arg
+
+        M_AB_i = np.concatenate((XGcos, XGsin), axis=1)  # (d, 2k)
+        M_AB += M_AB_i
+
+    return M_AB
+
+
+def _sfit_point_tangent_geodesic(data, k, max_iter, tol=1e-5, rel_tol=1e-2, return_intermediate_iterations=False, num_intermediate_iterations=20):
+    X, t = data  # X is a list of matrices, t is a list of times
+    init_start = time.time()
+
+    # Extract M1 and MT
+    M1 = X[0]
+    MT = X[-1]
+
+    # are M1 and MT symmetric?
+    M1_symmetric = _is_symmetric(M1)
+    MT_symmetric = _is_symmetric(MT)
+
+    logger.debug("M1 symmetric: %s", M1_symmetric)
+    logger.debug("MT symmetric: %s", MT_symmetric)
+
+    # rank-k truncated SVD of M1
+    if M1_symmetric:
+        if issparse(M1):
+            S1, U1 = eigsh(M1, k=k, which='LM')
+        else:
+            S1, U1 = np.linalg.eigh(M1)
+        idx = np.argsort(-S1)[:k]
+        U1 = U1[:, idx]
+        S1 = S1[idx]
+    else:
+        U1, S1, _ = svds(M1, k=k)
+        idx = np.argsort(-S1)
+        U1 = U1[:, idx]
+        S1 = S1[idx]
+    H1 = U1  # (d, k)
+
+    # ..... and for MT also
+    if MT_symmetric:
+        if issparse(MT):
+            ST, UT = eigsh(MT, k=k, which='LM')
+        else:
+            ST, UT = np.linalg.eigh(MT)
+        idx = np.argsort(-ST)[:k]
+        UT = UT[:, idx]
+        ST = ST[idx]
+    else:
+        UT, ST, _ = svds(MT, k=k)
+        idx = np.argsort(-ST)
+        UT = UT[:, idx]
+        ST = ST[idx]
+    H_T = UT  # (d, k)
+
+    H1T_H_T = H1.T @ H_T  # (k, k)
+
+    Z, S, Q_T = np.linalg.svd(H1T_H_T, full_matrices=False)
+    Q = Q_T.T  # (k, k)
+
+    if np.any((S < -1 - 1e-6) | (S > 1 + 1e-6)):  # clip S
+        logger.warning("Singular values exceeded [-1,1] by more than 1e-6; clipping.")
+    S = np.clip(S, -1, 1)
+
+    H_TQ = H_T @ Q  # (d, k)
+
+    # orthogonal component
+    H1_H1T_HTQ = H1 @ (H1.T @ H_TQ)  # (d, k)
+    Orth_comp = H_TQ - H1_H1T_HTQ  # (d, k)
+
+    # SVD of orthocomplement
+    F, D, G_T = np.linalg.svd(Orth_comp, full_matrices=False)
+
+    # direction Y = F G_T
+    Y = F @ G_T  # (d, k)
+
+    # Theta init (see paper)
+    Theta = np.arccos(S)  # (k,)
+
+    #print("init time:", time.time() - init_start)
+
+    #print("Starting iterations")
+    iter_start = time.time()
+    point_tangent_times = []
+    theta_times = []
+    conv_check_times = []
+
+    # init H = H1
+    H = H1.copy()
+
+    # set the initial values for convergence checking
+    H_old = H.copy()
+    Y_old = Y.copy()
+    Theta_old = Theta.copy()
+
+    # Storage for intermediate iterations if requested
+    if return_intermediate_iterations:
+        intermediate_H = []
+        intermediate_Y = []
+        intermediate_Theta = []
+
+    # fix tH=0, at least until there seems to be a reason to not do so
+    tH = 0
+
+    for itr in range(max_iter):
+        if itr == max_iter - 1:
+            logger.warning("Maximum point-tangent iterations reached without convergence.")
+
+        #print(f"Iteration {itr}")
+
+        # P-Update, P = [H, Y]
+        pt_time = time.time()
+        M_AB = _estimate_point_tangent(H, Y, Theta, X, t, tH)
+        try:
+            U, _, Vh = np.linalg.svd(M_AB, full_matrices=False)
+        except np.linalg.LinAlgError as err:
+            logger.error("SVD failed for M_AB (shape=%s): %s", M_AB.shape, err)
+            logger.debug("Matrix contains NaN? %s | Inf? %s", np.any(np.isnan(M_AB)), np.any(np.isinf(M_AB)))
%s", np.any(np.isnan(M_AB)), np.any(np.isinf(M_AB))) + logger.info("Retrying SVD for M_AB after adding jitter.") + M_AB += 1e-6 * np.random.randn(*M_AB.shape) + U, _, Vh = np.linalg.svd(M_AB, full_matrices=False) + C = U @ Vh # (d, 2k) + H = C[:, :k] + Y = C[:, k:] + point_tangent_times.append(time.time() - pt_time) + + # Theta-Update + theta_time = time.time() + Theta = _estimate_theta(H, Y, X, t, niter=5, tH=tH, Theta_init=Theta) + theta_times.append(time.time() - theta_time) + + # Store intermediate iterations if requested (only last num_intermediate_iterations) + if return_intermediate_iterations: + intermediate_H.append(H.copy()) + intermediate_Y.append(Y.copy()) + intermediate_Theta.append(Theta.copy()) + + # Keep only the last num_intermediate_iterations + if len(intermediate_H) > num_intermediate_iterations: + intermediate_H.pop(0) + intermediate_Y.pop(0) + intermediate_Theta.pop(0) + + # Check convergence? + conv_check_time = time.time() + delta_H = np.linalg.norm(H - H_old) + delta_Y = np.linalg.norm(Y - Y_old) + delta_Theta = np.linalg.norm(Theta - Theta_old) + + + rel_delta_H = delta_H / (np.linalg.norm(H) + 1e-15) + rel_delta_Y = delta_Y / (np.linalg.norm(Y) + 1e-15) + rel_delta_Theta = delta_Theta / (np.linalg.norm(Theta) + 1e-15) + + if (delta_H < tol and delta_Y < tol and delta_Theta < tol) or (rel_delta_H < rel_tol and rel_delta_Y < rel_tol and rel_delta_Theta < rel_tol): + logger.info("Point-tangent geodesic converged in %s iterations.", itr) + break + + # update the old values + H_old = H.copy() + Y_old = Y.copy() + Theta_old = Theta.copy() + + conv_check_times.append(time.time() - conv_check_time) + + #print("point tangent time", np.sum(np.array(point_tangent_times))) + #print("theta time", np.sum(np.array(theta_times))) + #print("conv check time", np.sum(np.array(conv_check_times))) + + if return_intermediate_iterations: + return H, Y, Theta, intermediate_H, intermediate_Y, intermediate_Theta + return H, Y, Theta + + + +class _SpectralGeodesicSmoother(ABC): + def __init__(self, *args, d, T, sadj_list=None, precomputed_embeddings=None, ke='auto', kc_list='auto', stable_communities=False, which_eig='smallest', t=None, benefit_fn=None, smoothing_filter=None, smoothing_parameter=None, max_iter=1000, use_intermediate_iterations=False, num_intermediate_iterations=20): + if len(args) > 0: + raise ValueError("This class does not accept positional arguments") + + if precomputed_embeddings is not None and sadj_list is not None: + logger.warning("Both precomputed_embeddings and sadj_list are provided. 
+
+        self.d = d
+        self.max_iter = max_iter
+        self.sadj_list = sadj_list
+        self.precomputed_embeddings = precomputed_embeddings
+
+        self.ke = ke if ke != 'auto' else self.choose_ke()
+        if isinstance(kc_list, int):
+            kc_list = [kc_list] * T
+        self.kc_list = kc_list
+        self.stable_communities = stable_communities
+        self.which_eig = which_eig
+
+        if t is not None:
+            self.t = t
+        elif sadj_list is not None:
+            self.t = np.linspace(0.0, 1.0, len(sadj_list))
+        elif precomputed_embeddings is not None:
+            self.t = np.linspace(0.0, 1.0, len(precomputed_embeddings))
+        else:
+            raise ValueError("Either sadj_list or precomputed_embeddings must be provided")
+
+        self.benefit_fn = benefit_fn
+        self.smoothing_filter = smoothing_filter
+        self.smoothing_parameter = smoothing_parameter
+        self.T = T
+        self.use_intermediate_iterations = use_intermediate_iterations
+        self.num_intermediate_iterations = num_intermediate_iterations
+
+        if self.stable_communities and kc_list != 'auto' and len(set(kc_list)) > 1:
+            raise ValueError("stable_communities=True requires a single kc value or 'auto'.")
+
+    def benefit_fn_broadcast(self, sadj_list, labels_list):
+        return [self.benefit_fn(sadj, labels) for sadj, labels in zip(sadj_list, labels_list)]
+
+    @abstractmethod
+    def make_clustering_matrix(self, sadj):
+        pass
+
+    def choose_ke(self):
+        raise NotImplementedError("must implement choose_ke")
+
+    def make_clustering_matrices(self):
+        self.clustering_matrices = [self.make_clustering_matrix(sadj) for sadj in self.sadj_list]
+
+    def make_modeled_clustering_matrices(self):
+        # Subclasses may override this, but the default shift below usually
+        # suffices: ||R||_F * I +/- R stays positive semidefinite (the
+        # Frobenius norm bounds the spectral radius) while mapping the
+        # eigenvectors selected by which_eig onto the dominant subspace
+        # that the geodesic fit tracks.
+        assert self.clustering_matrices is not None, "You must first run make_clustering_matrices"
+        self.modeled_clustering_matrices = []
+        for R in self.clustering_matrices:
+            frobenius_norm = sparse.linalg.norm(R, 'fro')
+            n, m = R.shape
+            I = sparse.eye(n, m, format=R.format)
+            if self.which_eig == 'smallest':
+                self.modeled_clustering_matrices.append(frobenius_norm * I - R)
+            elif self.which_eig == 'largest':
+                self.modeled_clustering_matrices.append(frobenius_norm * I + R)
+            elif self.which_eig == 'svd':
+                self.modeled_clustering_matrices.append(R)
+            else:
+                raise ValueError("Invalid value for which_eig")
+
+    # This is never overridden by subclasses.
+    def get_geodesic_embeddings(self):
+        # This method assumes self.Xs is already prepared.
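+        #
+        # The fit parameterizes a curve through the embeddings as
+        #     U(t) = H @ cos(diag(Theta) * t) + Y @ sin(diag(Theta) * t),
+        # where [H, Y] has orthonormal columns and Theta holds the rotation
+        # angles traversed per unit time. _sfit_point_tangent_geodesic
+        # alternates a Procrustes-style update of [H, Y] with a Newton update
+        # of Theta; the majorize-minimize scheme improves the objective
+        # monotonically.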
+
+        # Get geodesic parameters, optionally with intermediate iterations
+        if self.use_intermediate_iterations:
+            H, Y, Theta, intermediate_H, intermediate_Y, intermediate_Theta = _sfit_point_tangent_geodesic(
+                (self.Xs, self.t), k=self.ke, max_iter=self.max_iter,
+                return_intermediate_iterations=True,
+                num_intermediate_iterations=self.num_intermediate_iterations
+            )
+            self.intermediate_H = intermediate_H
+            self.intermediate_Y = intermediate_Y
+            self.intermediate_Theta = intermediate_Theta
+        else:
+            H, Y, Theta = _sfit_point_tangent_geodesic((self.Xs, self.t), k=self.ke, max_iter=self.max_iter)
+
+        # Compute main geodesic embeddings
+        self.Us = [H @ cosm(np.diag(Theta) * self.t[i]) + Y @ sinm(np.diag(Theta) * self.t[i]) for i in range(self.T)]
+
+        # Optionally compute intermediate geodesic embeddings for positional encodings
+        if self.use_intermediate_iterations:
+            self.intermediate_Us = []
+            for H_inter, Y_inter, Theta_inter in zip(intermediate_H, intermediate_Y, intermediate_Theta):
+                Us_inter = [H_inter @ cosm(np.diag(Theta_inter) * self.t[i]) + Y_inter @ sinm(np.diag(Theta_inter) * self.t[i]) for i in range(self.T)]
+                self.intermediate_Us.append(Us_inter)
+
+            # Concatenate intermediate embeddings to main embeddings for enriched positional encodings
+            self.enriched_Us = []
+            for i in range(self.T):
+                # Start with main embedding
+                enriched_embedding = [self.Us[i]]
+                # Add intermediate embeddings for this timestep
+                for inter_Us in self.intermediate_Us:
+                    enriched_embedding.append(inter_Us[i])
+                # Concatenate along feature dimension
+                self.enriched_Us.append(np.concatenate(enriched_embedding, axis=1))
+        else:
+            self.enriched_Us = self.Us
+
+    # This is sometimes overridden by subclasses.
+    def clustering_Euclidean(self):
+        # Use enriched embeddings if available, otherwise use standard embeddings
+        embeddings_to_use = self.enriched_Us if hasattr(self, 'enriched_Us') else self.Us
+        T = len(embeddings_to_use)
+
+        if all(k == self.kc_list[0] for k in self.kc_list):  # Constant kc_list
+            k_val = self.kc_list[0]
+            kmeans = KMeans(n_clusters=k_val, n_init=10)
+            labels = [kmeans.fit_predict(U_i) for U_i in embeddings_to_use]
+            return labels
+        elif all(isinstance(k, int) for k in self.kc_list):  # Non-constant, provided kc_list
+            labels = []
+            for U_i, k_val in zip(embeddings_to_use, self.kc_list):
+                kmeans = KMeans(n_clusters=k_val, n_init=10)
+                labels.append(kmeans.fit_predict(U_i))
+            return labels
+        else:  # Auto kc_list
+            logger.info("Auto community-count selection uses a placeholder kmin=2; a heuristic for guessing a higher minimum may be worth adding later.")
+            kmin, kmax = 2, self.ke
+            k_vals = range(kmin, kmax + 1)
+            benefit_vs_time_and_k = np.zeros((len(k_vals), T)) - np.inf
+            labels_by_k = {k: [] for k in k_vals}
+
+            for i, U_i in enumerate(embeddings_to_use):
+                for j, k_val in enumerate(k_vals):
+                    kmeans = KMeans(n_clusters=k_val, n_init=10)
+                    labels_i = kmeans.fit_predict(U_i)
+                    labels_by_k[k_val].append(labels_i)
+                    benefit = self.benefit_fn_broadcast([self.sadj_list[i]], [labels_i])[0]
+                    benefit_vs_time_and_k[j, i] = benefit
+
+            if self.smoothing_filter == 'median':
+                kernel_size = self.smoothing_parameter  # must be an odd integer for medfilt
+                smoothed_benefit = np.array([medfilt(row, kernel_size) for row in benefit_vs_time_and_k])
+            elif self.smoothing_filter == 'gaussian':
+                sigma = self.smoothing_parameter
+                smoothed_benefit = np.array([gaussian_filter1d(row, sigma) for row in benefit_vs_time_and_k])
+            else:
+                smoothed_benefit = benefit_vs_time_and_k
+
+            if self.stable_communities:
+                aggregated_benefit = smoothed_benefit.mean(axis=1)
+                best_k = k_vals[int(np.argmax(aggregated_benefit))]
+                best_labels_all = labels_by_k[best_k]
+            else:
+                k_maximizing_benefit = [k_vals[idx] for idx in np.argmax(smoothed_benefit, axis=0)]
+                best_labels_all = [labels_by_k[k][i] for i, k in enumerate(k_maximizing_benefit)]
+
+            return best_labels_all
+
+    # This is never overridden by subclasses.
+    def run_dcd(self):
+        self.run_geo_embeddings()
+        assignments = self.clustering_Euclidean()
+        return assignments
+
+    def run_geo_embeddings(self):
+        time_start = time.time()
+
+        # If custom embeddings are provided, use them directly for fitting.
+        if self.precomputed_embeddings is not None:
+            logger.info("Using pre-computed embeddings for geodesic fitting.")
+            self.Xs = self.precomputed_embeddings
+        # Otherwise, run the standard matrix processing pipeline.
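+        # (The pipeline: make_clustering_matrix builds one matrix per
+        # snapshot, and make_modeled_clustering_matrices shifts each by
+        # ||R||_F * I so that the end of the spectrum selected by which_eig
+        # becomes the dominant subspace tracked by the geodesic fit.)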
+        else:
+            if self.sadj_list is None:
+                raise ValueError("sadj_list must be provided if precomputed_embeddings is not used.")
+            self.make_clustering_matrices()
+            self.make_modeled_clustering_matrices()
+            self.Xs = self.modeled_clustering_matrices
+
+        # Now, call the fitting method, which assumes self.Xs is ready
+        self.get_geodesic_embeddings()
+        logger.debug("Time to get geodesic embeddings: %.3fs", time.time() - time_start)
+
+
+def _spectral_geodesic_smoothing(sadj_list=None, T=None, num_nodes=None, ke=None, precomputed_embeddings=None, kc='auto', stable_communities=False, mode='simple-nsc', smoothing_filter=None, smoothing_parameter=None, return_geo_embeddings_only=False, use_intermediate_iterations=False, num_intermediate_iterations=20, **mode_kwargs):
+    if T is None:
+        if sadj_list is not None:
+            T = len(sadj_list)
+        elif precomputed_embeddings is not None:
+            T = len(precomputed_embeddings)
+        else:
+            raise ValueError("sadj_list or precomputed_embeddings must be provided")
+
+    if isinstance(kc, int):
+        kc = [kc] * T
+    elif not isinstance(kc, list) and kc != 'auto':
+        raise ValueError("kc must be a list, a natural number, or the default 'auto'")
+
+    algorithms = {
+        'simple': ['nsc', 'smm', 'bhc', 'usc'],
+        'signed': ['srsc', 'gmsc', 'spmsc'],
+        'overlapping': ['osc', 'csc'],
+        'directed': ['ddsc', 'bsc', 'rwsc'],
+        'multiview': ['gmsc', 'pmlc'],
+        'cocommunity': ['scc'],
+        'hierarchical': ['hsc'],
+    }
+
+    simple_class_lookup = {
+        'nsc': _NSC,
+        'smm': _SMM,
+        'bhc': _BHC,
+        'usc': _USC,
+    }
+    simple_modes = {
+        f"simple-{alg}": simple_class_lookup[alg]
+        for alg in algorithms['simple']
+    }
+    disabled_modes = {
+        f"{category}-{alg}"
+        for category, algs in algorithms.items()
+        if category != 'simple'
+        for alg in algs
+    }
+
+    if mode in simple_modes:
+        smoother_class = simple_modes[mode]
+    elif mode in disabled_modes:
+        raise NotImplementedError("Non-'simple-*' temporal spectral modes are currently disabled.")
+    else:
+        valid_modes = ', '.join(sorted(simple_modes.keys()))
+        raise ValueError(f"Invalid simple mode '{mode}'. Valid simple modes are: {valid_modes}")
+
+    smoother = smoother_class(T=T, d=num_nodes, sadj_list=sadj_list, precomputed_embeddings=precomputed_embeddings, ke=ke, kc_list=kc,
+                              stable_communities=stable_communities,
+                              smoothing_filter=smoothing_filter,
+                              smoothing_parameter=smoothing_parameter,
+                              use_intermediate_iterations=use_intermediate_iterations,
+                              num_intermediate_iterations=num_intermediate_iterations,
+                              **mode_kwargs)
+
+    if return_geo_embeddings_only:
+        smoother.run_geo_embeddings()
+        # Return enriched embeddings if available, otherwise standard embeddings
+        return smoother.enriched_Us if hasattr(smoother, 'enriched_Us') else smoother.Us
+
+    assignments = smoother.run_dcd()
+    # Return enriched embeddings if available, otherwise standard embeddings
+    embeddings_to_return = smoother.enriched_Us if hasattr(smoother, 'enriched_Us') else smoother.Us
+    return assignments, embeddings_to_return
+
+
+## Begin subclass implementations
+class _Simple(_SpectralGeodesicSmoother):
+    @staticmethod
+    def calculate_modularity(adj_matrix: csr_matrix, communities: list) -> float:
+        """
+        Calculate the modularity of a network given its adjacency matrix and community assignments.
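+
+        Uses Newman's definition
+            Q = (1 / (2m)) * sum_ij (A_ij - k_i * k_j / (2m)) * delta(c_i, c_j),
+        where m is the number of edges, k_i is the degree of node i, and
+        delta compares community labels. This reference implementation loops
+        over all node pairs; see calculate_modularity_vectorized for the
+        fast version.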
+
+        :param adj_matrix: Sparse adjacency matrix of the network (scipy.sparse.csr_matrix)
+        :param communities: List of community assignments for each node
+        :return: Modularity value
+        """
+        if not isinstance(adj_matrix, csr_matrix):
+            raise ValueError("adj_matrix must be a scipy.sparse.csr_matrix")
+
+        if len(communities) != adj_matrix.shape[0]:
+            raise ValueError("Number of community assignments must match number of nodes")
+
+        n_edges = adj_matrix.sum() / 2
+        n_nodes = adj_matrix.shape[0]
+
+        modularity = 0
+        for i in range(n_nodes):
+            for j in range(n_nodes):
+                if communities[i] == communities[j]:
+                    a_ij = adj_matrix[i, j]
+                    k_i = adj_matrix.getrow(i).sum()
+                    k_j = adj_matrix.getrow(j).sum()
+                    expected = (k_i * k_j) / (2 * n_edges)
+                    modularity += a_ij - expected
+
+        modularity /= (2 * n_edges)
+        return modularity
+
+    @staticmethod
+    def calculate_modularity_vectorized(adj_matrix: csr_matrix, communities: list) -> float:
+        """
+        Calculate the modularity of a network given its adjacency matrix and community assignments
+        using a fully vectorized approach.
+
+        :param adj_matrix: Sparse adjacency matrix of the network (scipy.sparse.csr_matrix)
+        :param communities: List of community assignments for each node
+        :return: Modularity value
+        """
+        if not isinstance(adj_matrix, csr_matrix):
+            raise ValueError("adj_matrix must be a scipy.sparse.csr_matrix")
+
+        n_nodes = adj_matrix.shape[0]
+
+        if len(communities) != n_nodes:
+            raise ValueError("Number of community assignments must match number of nodes")
+
+        m = adj_matrix.sum() / 2
+        if m == 0:
+            raise ValueError("The network has no edges.")
+
+        communities = np.array(communities)
+        unique_communities, inverse_indices = np.unique(communities, return_inverse=True)
+        n_communities = unique_communities.size
+
+        degrees = np.array(adj_matrix.sum(axis=1)).flatten()
+
+        degree_sum_per_community = np.bincount(inverse_indices, weights=degrees)
+
+        community_matrix = csr_matrix(
+            (np.ones(n_nodes), (inverse_indices, np.arange(n_nodes))),
+            shape=(n_communities, n_nodes)
+        )
+
+        connections_within = community_matrix.dot(adj_matrix).dot(community_matrix.transpose())
+
+        internal_edges_per_community = connections_within.diagonal() / 2
+
+        modularity = (internal_edges_per_community.sum() / m) - np.sum((degree_sum_per_community / (2 * m)) ** 2)
+
+        return modularity
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.benefit_fn = self.calculate_modularity_vectorized
+
+
+class _Signed(_SpectralGeodesicSmoother):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def power_mean_laplacian(self, A_pos, A_neg, p, epsilon=1e-6):
+        pass
+
+    def create_clustering_matrix(self, A_pos, A_neg, p, epsilon=1e-6):
+        pass
+
+
+class _Directed(_SpectralGeodesicSmoother):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+
+class _Overlapping(_SpectralGeodesicSmoother):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+
+class _Multiview(_SpectralGeodesicSmoother):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+
+class _Cocommunity(_SpectralGeodesicSmoother):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+
+class _Hierarchical(_SpectralGeodesicSmoother):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+## End subclass implementations
+
+## Begin subsubclass implementations
+
+class _NSC(_Simple):
+    def make_clustering_matrix(self, sadj):
+        # We override the full make_modeled_clustering_matrices method below
+        # using normalized_signless_laplacian (recall that an SVD of the
+        # normalized signless Laplacian is equivalent to spectral clustering
+        # with the normalized graph Laplacian).
+        pass
+    @staticmethod
+    def normalized_signless_laplacian(A):
+        degrees = A.sum(axis=1).A1
+
+        with np.errstate(divide='ignore', invalid='ignore'):
+            D_inv_sqrt = 1.0 / np.sqrt(degrees)
+        D_inv_sqrt[np.isinf(D_inv_sqrt) | np.isnan(D_inv_sqrt)] = 0
+        D_inv_sqrt_matrix = sparse.diags(D_inv_sqrt)
+        D_plus_A = sparse.diags(degrees) + A
+        L_signless = D_inv_sqrt_matrix @ D_plus_A @ D_inv_sqrt_matrix
+        return L_signless
+
+    def make_modeled_clustering_matrices(self):
+        self.modeled_clustering_matrices = [_NSC.normalized_signless_laplacian(sadj) for sadj in self.sadj_list]
+
+
+class _SMM(_Simple):
+    def __init__(self, *args, **kwargs):
+        kwargs['which_eig'] = 'largest'
+        super().__init__(*args, **kwargs)
+
+    def make_clustering_matrix(self, A):
+        degrees = A.sum(axis=1).A1
+        m = A.sum() / 2
+        expected = sparse.csr_matrix((np.outer(degrees, degrees) / (2 * m)).astype(A.dtype))
+        B = A - expected
+        return B
+
+
+class _BHC(_Simple):
+    def make_clustering_matrix(self, sadj):
+        # Bethe-Hessian parameter: square root of the average degree.
+        r = np.sqrt(sadj.mean() * sadj.shape[0])
+
+        n = sadj.shape[0]
+        d = sadj.sum(axis=1).A1
+        H = (r**2 - 1) * sparse.eye(n) - r * sadj + sparse.diags(d)
+        return H
+
+
+class _USC(_Simple):
+    """Unnormalized Spectral Clustering using the unnormalized graph Laplacian L = D - A."""
+
+    def make_clustering_matrix(self, sadj):
+        # We override the full make_modeled_clustering_matrices method below
+        # using unnormalized_laplacian.
+        pass
+
+    @staticmethod
+    def unnormalized_laplacian(A):
+        """
+        Compute the unnormalized graph Laplacian L = D - A.
+
+        Args:
+            A: Adjacency matrix (sparse)
+
+        Returns:
+            L: Unnormalized Laplacian matrix (sparse)
+        """
+        degrees = A.sum(axis=1).A1
+        D = sparse.diags(degrees)
+        L = D - A
+        return L
+
+    def make_modeled_clustering_matrices(self):
+        self.modeled_clustering_matrices = [_USC.unnormalized_laplacian(sadj) for sadj in self.sadj_list]
+
+
+def _signed_power_mean_laplacian(A_pos, A_neg, p, epsilon=1e-6):
+    pass
+
+
+class _SRSC(_Signed):
+    def make_clustering_matrix(self, sadj_plus, sadj_minus):
+        pass
+
+
+class _GMSC(_Signed):
+    def make_clustering_matrix(self, sadj_plus, sadj_minus):
+        pass
+
+
+class _SPMSC(_Signed):
+    def make_clustering_matrix(self, sadj_plus, sadj_minus):
+        pass
+
+
+class _OSC(_Overlapping):
+    def make_clustering_matrix(self, sadj):
+        pass
+
+
+class _CSC(_Overlapping):
+    def make_clustering_matrix(self, sadj):
+        pass
+
+
+class _DDSC(_Directed):
+    def make_clustering_matrix(self, sadj):
+        pass
+
+
+class _BSC(_Directed):
+    def make_clustering_matrix(self, sadj):
+        pass
+
+
+class _RWSC(_Directed):
+    def make_clustering_matrix(self, sadj):
+        pass
+
+
+class _PMLC(_Multiview):
+    def make_clustering_matrix(self, sadj):
+        pass
+
+
+class _SCC(_Cocommunity):
+    def make_clustering_matrix(self, sadj):
+        pass
+
+
+class _HSC(_Hierarchical):
+    def make_clustering_matrix(self, sadj):
+        pass
+
+## End subsubclass implementations
diff --git a/test/communitydetection/__init__.py b/test/communitydetection/__init__.py
new file mode 100644
index 00000000..ceccbfc4
--- /dev/null
+++ b/test/communitydetection/__init__.py
@@ -0,0 +1 @@
+"""Test helpers for community detection."""
diff --git a/test/communitydetection/test_temporal_spectral.py b/test/communitydetection/test_temporal_spectral.py
new file mode 100644
index 00000000..91460117
--- /dev/null
+++ b/test/communitydetection/test_temporal_spectral.py
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+from typing import List, Optional, Sequence, Tuple
+
+import numpy as np
+from numpy.random import SeedSequence
+from scipy.sparse import csr_matrix, issparse
+from sklearn.metrics import (
+    adjusted_mutual_info_score,
+    adjusted_rand_score,
+    normalized_mutual_info_score,
+)
+
+from teneto.classes import TemporalNetwork
+from teneto.communitydetection import temporal_louvain, temporal_spectral
+
+
+def _adj_list_to_temporalnetwork_raw(adj_list: Sequence, nettype: str = "bu", forcesparse: bool = False) -> TemporalNetwork:
+    dense = []
+    for A in adj_list:
+        if issparse(A):
+            A = A.toarray()
+        else:
+            A = np.asarray(A)
+        if A.ndim != 2 or A.shape[0] != A.shape[1]:
+            raise ValueError("Each snapshot must be a square 2D array.")
+        dense.append(A.astype(float, copy=False))
+    arr = np.stack(dense, axis=2)
+    tnet = TemporalNetwork(N=arr.shape[0], T=arr.shape[2], nettype=nettype, timetype="discrete")
+    tnet.network_from_array(arr, forcesparse=forcesparse)
+    return tnet
+
+
+def sbm_dynamic_model_2(
+    N: int = 150,
+    k: int = 3,
+    pin: Sequence[float] = (0.3, 0.3, 0.2),
+    pout: float = 0.15,
+    p_switch: float = 0.15,
+    T: int = 10,
+    Totalsims: int = 2,
+    base_seed: Optional[int] = None,
+    community_probs: Optional[np.ndarray] = None,
+    verbose: bool = False,
+    return_teneto: bool = False,
+    teneto_nettype: str = "bu",
+) -> Tuple[List[List[csr_matrix]], np.ndarray, Optional[List[TemporalNetwork]]]:
+    ss = SeedSequence(base_seed)
+    child_seeds = ss.spawn(Totalsims)
+
+    adjacency_all: List[List[csr_matrix]] = [[] for _ in range(Totalsims)]
+    true_labels_all = np.zeros((Totalsims, T, N), dtype=int)
+    teneto_list: List[TemporalNetwork] = []
+
+    for sims in range(Totalsims):
+        rng = np.random.default_rng(child_seeds[sims])
+        G = rng.binomial(1, pout, (N, N))
+        G = np.diag(np.diag(G)) + np.tril(G, -1) + np.tril(G, -1).T
+        if community_probs is None:
+            labels = rng.integers(1, k + 1, size=N)
+        else:
+            labels = rng.choice(np.arange(1, k + 1), size=N, p=community_probs)
+        true_labels_all[sims, 0, :] = labels
+        clusters = {i: np.where(labels == i)[0] for i in range(1, k + 1)}
+
+        for i in range(1, k + 1):
+            Gk = rng.binomial(1, pin[i - 1], (len(clusters[i]), len(clusters[i])))
+            Gk = np.diag(np.diag(Gk)) + np.tril(Gk, -1) + np.tril(Gk, -1).T
+            G[np.ix_(clusters[i], clusters[i])] = Gk
+
+        for t in range(T):
+            if verbose:
+                print("making graph for time step ", t)
+            if t > 0:
+                G = rng.binomial(1, pout, (N, N))
+                G = np.diag(np.diag(G)) + np.tril(G, -1) + np.tril(G, -1).T
+                for i in range(1, k + 1):
+                    clusters[i] = np.where(labels == i)[0]
+                    if clusters[i].size > 0:
+                        permute_indices = rng.permutation(len(clusters[i]))
+                        changing_members = permute_indices[: int(np.ceil(len(clusters[i]) * p_switch))]
+                        if changing_members.size > 0:
+                            z = rng.binomial(1, p_switch, len(changing_members))
+                            labels_temp = rng.integers(1, k + 1, size=np.sum(z))
+                            labels[changing_members[z == 1]] = labels_temp
+
+                for i in range(1, k + 1):
+                    clusters[i] = np.where(labels == i)[0]
+                    Gk = rng.binomial(1, pin[i - 1], (len(clusters[i]), len(clusters[i])))
+                    Gk = np.diag(np.diag(Gk)) + np.tril(Gk, -1) + np.tril(Gk, -1).T
+                    G[np.ix_(clusters[i], clusters[i])] = Gk
+
+            adjacency_all[sims].append(csr_matrix(G))
+            true_labels_all[sims, t, :] = labels
+
+        if return_teneto:
+            teneto_list.append(_adj_list_to_temporalnetwork_raw(adjacency_all[sims], nettype=teneto_nettype, forcesparse=True))
+
+    if return_teneto:
+        return adjacency_all, true_labels_all, teneto_list
+    return adjacency_all, true_labels_all
+
+
+def test_temporal_spectral_recovers_sbm_partitions():
+    T = 50
+    k = 2
+    _, true_labels, tnet_list = sbm_dynamic_model_2(
+        N=200,
+        k=k,
+        pin=(0.3, 0.3),
+        pout=0.1,
+        p_switch=0.05,
+        T=T,
+        Totalsims=1,
+        base_seed=7,
+        return_teneto=True,
+    )
+
+    labels = temporal_spectral(
+        tnet_list[0],
+        ke=k,
+        kc=k,
+        mode="simple-nsc",
+        stable_communities=True,
+        smoothing_filter="median",
+        smoothing_parameter=T // 10 + 1,
+        which_eig="smallest",
+    )
+
+    ari_scores = []
+    nmi_scores = []
+    ami_scores = []
+    for t in range(labels.shape[1]):
+        gt = true_labels[0, t, :]
+        est = labels[:, t]
+        ari_scores.append(adjusted_rand_score(gt, est))
+        nmi_scores.append(normalized_mutual_info_score(gt, est))
+        ami_scores.append(adjusted_mutual_info_score(gt, est))
+
+    ari_scores = np.array(ari_scores)
+    nmi_scores = np.array(nmi_scores)
+    ami_scores = np.array(ami_scores)
+
+    assert np.all(ari_scores >= 0.94)
+    assert np.all(nmi_scores >= 0.88)
+    assert np.all(ami_scores >= 0.88)
+
+
+def test_temporal_spectral_aligns_with_temporal_louvain():
+    _, _, tnet_list = sbm_dynamic_model_2(
+        N=20,
+        k=2,
+        pin=(0.9, 0.9),
+        pout=0.05,
+        p_switch=0.1,
+        T=4,
+        Totalsims=1,
+        base_seed=11,
+        return_teneto=True,
+    )
+
+    labels_spectral = temporal_spectral(
+        tnet_list[0], ke=2, kc=2, mode="simple-nsc", stable_communities=True
+    )
+    labels_louvain = temporal_louvain(tnet_list[0], intersliceweight=0.1, n_iter=5)
+
+    ari_scores = []
+    for t in range(labels_spectral.shape[1]):
+        ari_scores.append(adjusted_rand_score(labels_spectral[:, t], labels_louvain[:, t]))
+
+    ari_scores = np.array(ari_scores)
+    assert np.all(ari_scores >= 0.75)
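
Usage sketch (not part of the diff above): a minimal end-to-end call of the new
temporal_spectral API on a toy two-community temporal network, relying only on
packages this patch already depends on. The parameter choices are illustrative.

    import numpy as np
    from teneto.communitydetection import temporal_spectral

    rng = np.random.default_rng(0)
    N, T, k = 40, 8, 2
    planted = np.repeat([0, 1], N // 2)      # two planted groups of 20 nodes
    arr = np.zeros((N, N, T))
    for t in range(T):
        same = planted[:, None] == planted[None, :]
        p = np.where(same, 0.35, 0.05)       # denser within groups
        G = np.triu(rng.binomial(1, p), 1)
        arr[:, :, t] = G + G.T               # undirected, no self-loops

    communities = temporal_spectral(arr, ke=k, kc=k, mode="simple-nsc")
    print(communities.shape)                 # (nodes, time) == (40, 8)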