Source code for svgbit.core.cluster

from __future__ import annotations

import pandas as pd
from scipy.cluster import hierarchy as sch


def cluster(
    hotspot_df: pd.DataFrame,
    AI_series: pd.Series,
    n_svgs: int = 1000,
    n_svg_clusters: int = 8,
    threshold: float = 0.3,
    uncertain_threshold: float = 0.2,
) -> tuple[pd.Series, pd.DataFrame]:
    """
    Cluster SVGs using hotspot matrix and label every spot.

    The ``n_svgs`` genes with the highest AI value are clustered
    hierarchically (Jaccard distance between their hotspot columns, Ward
    linkage) into at most ``n_svg_clusters`` groups. For every spot the
    mean hotspot value of each gene cluster is computed and the clusters
    are ranked from highest to lowest mean.

    Parameters
    ==========
    hotspot_df : pd.DataFrame
        A hotspot DataFrame generated by svgbit. Rows are indexed by spot
        and columns by gene (the code selects genes with
        ``hotspot_df[genes]``).

    AI_series : pd.Series
        A Series for AI value, indexed by gene.

    n_svgs : int, default 1000
        Number of SVGs to find clusters.

    n_svg_clusters : int, default 8
        Number of SVG clusters to find.

    threshold : float, default 0.3
        A cluster counts towards a spot when its mean hotspot value is
        strictly greater than this value; spots with more than one such
        cluster are labelled ``"<n>_multi_types"``.

    uncertain_threshold : float, default 0.2
        A spot whose highest cluster mean is below this value is labelled
        ``"uncertain"`` (unless it qualifies as a multi-type spot).

    Returns
    =======
    gene_result : pd.Series
        Cluster label (1-based) for every selected gene, sorted by label.

    type_df : pd.DataFrame
        One row per spot. Column ``spot_type`` is ``"singlet"``,
        ``"uncertain"`` or ``"<n>_multi_types"``; columns ``type_1`` ..
        ``type_<n_svg_clusters>`` hold the cluster labels ordered by
        decreasing mean hotspot value for that spot.
    """
    # Imported here so the function does not rely on ``distance`` happening
    # to be an attribute of ``scipy.cluster.hierarchy``.
    from scipy.spatial.distance import pdist

    # Positional slice: keep the n_svgs genes with the highest AI value.
    selected_genes = (
        AI_series.sort_values(ascending=False).iloc[:n_svgs].index
    )
    hotspot_set = hotspot_df[selected_genes]

    # Genes are the observations, hence the transpose.
    gene_distmat = pdist(hotspot_set.T, metric="jaccard")
    Z_gene = sch.linkage(gene_distmat, method="ward")
    gene_result = pd.Series(
        sch.fcluster(Z_gene, t=n_svg_clusters, criterion="maxclust"),
        index=selected_genes,
    ).sort_values()

    # Per-spot mean hotspot value of each gene cluster. ``maxclust`` may
    # yield fewer clusters than requested; such labels select no genes.
    mean_df = pd.DataFrame(index=hotspot_df.index)
    for label in range(1, n_svg_clusters + 1):
        genes = gene_result[gene_result == label].index
        mean_df[label] = hotspot_df[genes].T.mean()

    columns = ["spot_type"] + [
        f"type_{rank}" for rank in range(1, n_svg_clusters + 1)
    ]

    # Collect plain rows and build the frame once; concatenating inside the
    # loop copies the accumulated frame on every iteration.
    rows = []
    for _, cluster_means in mean_df.iterrows():
        ranked = cluster_means.sort_values(ascending=False)
        n_multi = int((ranked > threshold).sum())

        spot_type = "singlet"
        if ranked.iloc[0] < uncertain_threshold:
            spot_type = "uncertain"
        # Multi-type takes precedence over the labels above.
        if n_multi > 1:
            spot_type = f"{n_multi}_multi_types"

        rows.append([spot_type, *ranked.index.tolist()])

    type_df = pd.DataFrame(
        rows,
        index=list(mean_df.index),
        columns=columns,
        dtype=object,
    )
    return gene_result, type_df