from __future__ import annotations
import pandas as pd
from scipy.cluster import hierarchy as sch
def cluster(
    hotspot_df: pd.DataFrame,
    AI_series: pd.Series,
    n_svgs: int = 1000,
    n_svg_clusters: int = 8,
    threshold: float = 0.3,
    *,
    uncertain_threshold: float = 0.2,
) -> tuple[pd.Series, pd.DataFrame]:
    """
    Cluster SVGs using hotspot matrix and annotate every spot.

    The ``n_svgs`` genes with the highest AI value are clustered
    hierarchically (Jaccard distance, Ward linkage) into at most
    ``n_svg_clusters`` groups. For every spot the mean hotspot value of each
    gene cluster is computed, and the spot is labelled from those means.

    Parameters
    ==========
    hotspot_df : pd.DataFrame
        A hotspot DataFrame generated by svgbit. It is indexed by spot and
        has one column per gene.

    AI_series : pd.Series
        A Series for AI value, indexed by gene.

    n_svgs : int, default 1000
        Number of SVGs to find clusters.

    n_svg_clusters : int, default 8
        Number of SVG clusters to find.

    threshold : float, default 0.3
        Min mean hotspot value for a cluster to count as present in a spot.
        A spot with more than one such cluster is labelled
        ``"<n>_multi_types"``.

    uncertain_threshold : float, default 0.2
        A spot whose highest cluster mean is below this value is labelled
        ``"uncertain"`` (unless it qualifies as a multi-type spot).

    Returns
    =======
    gene_result : pd.Series
        Cluster label of every selected gene, indexed by gene and sorted by
        label.

    type_df : pd.DataFrame
        One row per spot. ``spot_type`` is ``"singlet"``, ``"uncertain"`` or
        ``"<n>_multi_types"``; ``type_1`` .. ``type_<n_svg_clusters>`` hold
        the cluster ids ordered by descending mean hotspot value in the spot.
    """
    selected_genes = AI_series.sort_values(ascending=False)[:n_svgs].index
    hotspot_set = hotspot_df[selected_genes]

    # NOTE(review): SciPy documents Ward linkage as correctly defined only
    # for Euclidean distances; Jaccard is used here, as in the original
    # method. Confirm this is intentional.
    gene_distmat = sch.distance.pdist(hotspot_set.T, metric="jaccard")
    Z_gene = sch.linkage(gene_distmat, method="ward")
    gene_result = pd.Series(
        sch.fcluster(Z_gene, t=n_svg_clusters, criterion="maxclust"),
        index=selected_genes,
    ).sort_values()

    cluster_ids = range(1, n_svg_clusters + 1)

    # Mean hotspot value of each gene cluster in every spot. "maxclust" may
    # yield fewer clusters than requested; such columns end up all-NaN.
    mean_df = pd.DataFrame(index=hotspot_df.index)
    for cluster_id in cluster_ids:
        genes = gene_result[gene_result == cluster_id].index
        mean_df[cluster_id] = hotspot_df[genes].mean(axis=1)

    columns = ["spot_type"] + [f"type_{i}" for i in cluster_ids]

    # Collect plain rows and build the frame once instead of concatenating a
    # one-row frame per spot.
    rows = []
    for _, spot_means in mean_df.iterrows():
        ranked = spot_means.sort_values(ascending=False)
        n_multi = int((ranked > threshold).sum())
        # A multi-type call takes precedence over "uncertain".
        if n_multi > 1:
            spot_type = f"{n_multi}_multi_types"
        elif ranked.iloc[0] < uncertain_threshold:
            spot_type = "uncertain"
        else:
            spot_type = "singlet"
        rows.append([spot_type, *ranked.index])

    # object dtype keeps the string label and the cluster-id columns uniform.
    type_df = pd.DataFrame(
        rows,
        index=mean_df.index,
        columns=columns,
        dtype=object,
    )
    return gene_result, type_df