Source code for pyLOM.MANIFOLD.wrapper

#!/usr/bin/env python
#
# Manifold learning methodologies.
#
# Last revision: 11/02/2025
import numpy as np
from scipy.sparse.csgraph import shortest_path
from scipy.linalg import eigh

from ..utils.gpu import cp
from ..vmmath    import euclidean_d
from ..utils     import cr_nvtx as cr, raiseError, pprint



[docs]
@cr('MANIFOLD.isomap')
def isomap(X:np.ndarray, dims:int, n_size:int, comp:int = 1 ,verbose:bool = True):
    """
    Computes Isomap embedding using the algorithm of Tenenbaum, de Silva, and Langford (2000).
    
    Parameters:
    X : ndarray
        NxM Data matrix with N points in the mesh for M simulations
    dims: int
        Embedding dimensionality to use 
    n_size : int
        Neighborhood size (number of neighbors for 'k' method)
    comp: int
        Component to embed, if more than 1 (defaults to 1, the largest)
    verbose: bool
        Display information (default is True)
    
    Returns:
    Y : ndarray
        Contains coordinates for d-dimensional embeddings in Y.
    R : list
        Residual variances for the embedding in Y.
    E : ndarray
        Edge matrix for neighborhood graph.
    """    
    # Compute pairwise distances in a condensed form and convert to a square form
    D = cp.asnumpy(euclidean_d(X)) if type(X) is cp.ndarray else euclidean_d(X)

    # Step 0: Initialization and Parameters
    N = D.shape[0]
    if D.shape[1] != N:
        raise ValueError("D must be a square matrix")

    K = n_size
    if not isinstance(K, int) or K <= 0:
        raiseError("Number of neighbors for 'k' method must be a positive integer")
    
    INF = 1000 * np.max(D) * N  # Effectively infinite distance
        
    # Step 1: Construct neighborhood graph
    if verbose:
        pprint(0,"Constructing neighborhood graph...",flush=True)
        
    # Sort distances and keep only K-nearest neighbors
    sorted_indices = np.argsort(D, axis=1)
    for i in range(N):
        D[i, sorted_indices[i, K+1:]] = INF  # Only keep the K nearest neighbors

    D = np.minimum(D, D.T)  # Ensure distance matrix is symmetric

    E = (D != INF).astype(int)  # Edge matrix for neighborhood graph

    # Step 2: Compute shortest paths using Floyd-Warshall algorithm
    if verbose:
        pprint(0,"Computing shortest paths...",flush=True)

    G = shortest_path(D, method='FW', directed=False, unweighted=False)

    # Step 3: Identify connected components
    firsts = np.min((G == INF).astype(int) * np.arange(N), axis=1)
    comps, indices = np.unique(firsts, return_inverse=True)
    size_comps = np.array([np.sum(indices == i) for i in range(len(comps))])
    
    sorted_indices = np.argsort(size_comps)[::-1]
    comps = comps[sorted_indices]
    size_comps = size_comps[sorted_indices]

    if comp > len(comps):
        comp = 1  # Default to largest component if specified component is out of range

    if verbose:
        pprint(0,f"Number of connected components in graph: {len(comps)}",flush=True)
        pprint(0,f"Embedding component {comp} with {size_comps[comp-1]} points.",flush=True)

    # Select the largest connected component
    index = np.where(firsts == comps[comp-1])[0]
    G = G[np.ix_(index, index)]
    N = len(index)

    # Step 4: Construct low-dimensional embeddings using Classical MDS
    H = np.eye(N) - np.ones((N, N)) / N
    B = -0.5 * H @ (G**2) @ H  # Centering the matrix
    
    eigenvalues, eigenvectors = eigh(B, subset_by_index=[N - dims, N - 1])
    
    # Sorting eigenvalues and corresponding eigenvectors in descending order
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    # Compute embeddings for each specified dimension
    if dims <= N:
        coords = eigenvectors[:, :dims] * np.sqrt(eigenvalues[:dims])
        Y = coords.T
        
        # Compute residual variance
        L2_D = np.linalg.norm(coords[:, None] - coords[None, :], axis=2).flatten()
        r2 = 1 - np.corrcoef(L2_D, G.flatten())[0, 1]**2
        R = r2

        if verbose:
            pprint(0,f"Isomap on {N} points with dimensionality {dims} --> residual variance = {R}",flush=True)
    else:
        raiseError('Selected number of dimensions is higher than the number of samples')
        return None, None, None

    return Y, R, E




[docs]
@cr('MANIFOLD.mds')
def mds(X:np.ndarray, dims:int, verbose:bool = True):
    """
    Computes the MDS embedding using a custom approach with squared distances and eigen-decomposition.
    
    Parameters:
    X : ndarray
        NxM Data matrix with N points in the mesh for M simulations
    dims : int
        Embedding dimensionality to use (p in your MATLAB code)
    
    Returns:
    Y : ndarray
        Contains coordinates for d-dimensional embeddings in Y.
    """
    # Step 1: Compute the pairwise Euclidean distance matrix and square it
    D = cp.asnumpy(euclidean_d(X)) if type(X) is cp.ndarray else euclidean_d(X)
    D = D * D

    # Step 2: Apply the custom centering formula to get matrix B
    n = D.shape[0]
    C = np.eye(n) - np.ones((n,n))/n
    B = -0.5*np.matmul(np.matmul(C,D),C)

    # Step 3: Compute eigen-decomposition
    # eigenvalues, eigenvectors = eigh((B + B.T) / 2)
    eigenvalues, eigenvectors = eigh(B)

    # Step 4: Sort eigenvalues and corresponding eigenvectors in descending order
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    eigenvalues = eigenvalues[:dims]
    eigenvectors = eigenvectors[:,:dims]

    # Step 5: Keep only positive eigenvalues beyond a roundoff threshold
    threshold = np.max(np.abs(eigenvalues)) * np.finfo(eigenvalues.dtype).eps**(3 / 4)
    keep = eigenvalues > threshold
    if not np.any(keep):
        Y = np.zeros((n, 1))
    else:
        # Compute the embedding with kept eigenvalues and corresponding eigenvectors
        Y = eigenvectors[:, keep] * np.sqrt(eigenvalues[keep])

    return Y.T