Skip to content

Commit 1f0fd61

Browse files
committed
scNiche v1
1 parent 40432d1 commit 1f0fd61

24 files changed

+6857
-0
lines changed

images/workflow.jpg

371 KB
Loading

requirements.txt

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
squidpy==1.2.3
2+
anndata==0.10.1
3+
scanpy==1.9.1
4+
pandas==1.5.0
5+
matplotlib==3.6.2
6+
seaborn==0.13.0
7+
scikit-misc==0.2.0

scniche/__init__.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import sys
2+
from importlib.metadata import version
3+
4+
from . import datasets
5+
from . import plot as pl
6+
from . import preprocess as pp
7+
from . import trainer as tr
8+
from . import analysis as al
9+
10+
# has to be done at the end, after everything has been imported
11+
sys.modules.update({f"{__name__}.{m}": globals()[m] for m in ["tr", "pp", "pl", "al"]})
12+
13+
__version__ = version("scniche")
14+
15+
__all__ = ["__version__", "datasets", "tr", "pp", "pl", "al"]
16+

scniche/analysis/__init__.py

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from ._utils import *
2+
3+
__all__ = [
4+
"enrichment",
5+
"spatial_link",
6+
"calculate_average_exp_multi",
7+
"calculate_composition_multi",
8+
"average_exp",
9+
"cal_composition",
10+
]

scniche/analysis/_utils.py

+260
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
from anndata import AnnData
2+
from typing import Optional
3+
from scipy.stats import mannwhitneyu
4+
from tqdm import tqdm
5+
import pandas as pd
6+
import numpy as np
7+
import squidpy as sq
8+
from concurrent.futures import ThreadPoolExecutor
9+
10+
11+
def enrichment(adata: AnnData, id_key: str, val_key: str, library_key: Optional[str] = None):
12+
print(f"Calculating the enrichment of each cluster ({id_key}) in group ({val_key})...")
13+
obs = adata.obs.copy()
14+
id_list = sorted(list(set(obs[id_key])))
15+
val_list = sorted(list(set(obs[val_key])))
16+
if library_key is None:
17+
library_key = 'library'
18+
if library_key in obs.columns:
19+
library_key = library_key + '_new'
20+
obs[library_key] = 1
21+
library_list = sorted(list(set(obs[library_key])))
22+
23+
df = obs.groupby([library_key, id_key, val_key]).size().unstack().fillna(0)
24+
df_list = []
25+
for i in library_list:
26+
df_tmp = df.loc[(i,)]
27+
# avoid false positive (for example, (2 / 4 = 0.5) > (100 / 400 = 0.25))
28+
# min niche size
29+
MIN_NUM = 20
30+
df_tmp.loc[:, df_tmp.sum() < MIN_NUM] = 0
31+
32+
df_list_tmp = df_tmp.div(df_tmp.sum(axis=0), axis=1)
33+
df_list.append(df_list_tmp)
34+
35+
fc = []
36+
pval = []
37+
for idx in id_list:
38+
fc_tmp = []
39+
pval_tmp = []
40+
41+
pbar = tqdm(val_list)
42+
for val in pbar:
43+
# prop
44+
observed = [df.loc[idx, val] for df in df_list]
45+
expected = [df.drop(val, axis=1).loc[idx, ].mean() for df in df_list]
46+
47+
# filter NA, some niches don't exist in every library
48+
observed_new = [x for x in observed if not np.isnan(x)]
49+
expected_new = [x for x in expected if not np.isnan(x)]
50+
51+
# calculate enrichment
52+
_, p_value = mannwhitneyu(observed_new, expected_new, alternative='greater')
53+
# add a small value (1e-6) to avoind inf / -inf
54+
observed_mean = np.mean(observed_new)
55+
expected_mean = np.mean(expected_new)
56+
if observed_mean == 0:
57+
observed_mean = observed_mean + 1e-6
58+
if expected_mean == 0:
59+
expected_mean = expected_mean + 1e-6
60+
61+
fold_change = np.log2(observed_mean / expected_mean)
62+
pval_tmp.append(p_value)
63+
fc_tmp.append(fold_change)
64+
65+
pbar.set_description(f"Cluster: {idx}")
66+
67+
pval.append(pval_tmp)
68+
fc.append(fc_tmp)
69+
70+
pval = pd.DataFrame(pval)
71+
fc = pd.DataFrame(fc)
72+
pval.columns = fc.columns = val_list
73+
pval.index = fc.index = id_list
74+
75+
adata.uns[f"{val_key}_{id_key}_fc"] = fc
76+
adata.uns[f"{val_key}_{id_key}_pval"] = pval
77+
adata.uns[f"{val_key}_{id_key}_proportion"] = df_list
78+
79+
80+
def _remove_intra_cluster_links(
81+
adata: AnnData,
82+
cluster_key: str,
83+
connectivity_key: str = 'spatial_connectivities',
84+
distances_key: str = 'spatial_distances',
85+
copy: bool = False):
86+
conns = adata.obsp[connectivity_key].copy() if copy else adata.obsp[connectivity_key]
87+
dists = adata.obsp[distances_key].copy() if copy else adata.obsp[distances_key]
88+
89+
for matrix in [conns, dists]:
90+
target_clusters = np.array(adata.obs[cluster_key][matrix.indices])
91+
source_clusters = np.array(
92+
adata.obs[cluster_key][np.repeat(np.arange(matrix.indptr.shape[0] - 1), np.diff(matrix.indptr))]
93+
)
94+
95+
inter_cluster_mask = (source_clusters != target_clusters).astype(int)
96+
97+
matrix.data *= inter_cluster_mask
98+
matrix.eliminate_zeros()
99+
100+
if copy:
101+
return conns, dists
102+
103+
104+
def _observed_n_clusters_links(adj, labels):
105+
labels_unique = labels.cat.categories
106+
obs = np.zeros((len(labels_unique), len(labels_unique)))
107+
for i, l1 in enumerate(labels_unique):
108+
total_cluster_links = adj[labels == l1]
109+
110+
for j, l2 in enumerate(labels_unique):
111+
other_cluster_links = total_cluster_links[:, labels == l2]
112+
113+
obs[i, j] = np.sum(other_cluster_links)
114+
115+
obs = pd.DataFrame(obs, columns=labels_unique, index=labels_unique)
116+
return obs
117+
118+
119+
def spatial_link(adata: AnnData, cluster_key: str, only_inter: bool = True, normalize: bool = False,
120+
connectivity_key: str = 'spatial_connectivities', distances_key: str = 'spatial_distances',):
121+
adata_select = adata.copy()
122+
123+
# spatial graph
124+
sq.gr.spatial_neighbors(adata_select, delaunay=True)
125+
126+
if only_inter:
127+
_remove_intra_cluster_links(adata_select, cluster_key=cluster_key,
128+
connectivity_key=connectivity_key, distances_key=distances_key)
129+
130+
adj = adata_select.obsp[connectivity_key]
131+
label = adata_select.obs[cluster_key]
132+
observed = _observed_n_clusters_links(adj, label)
133+
134+
if normalize:
135+
for i in list(set(label)):
136+
for j in list(set(label)):
137+
if observed.loc[i, j] == 0:
138+
continue
139+
observed.loc[i, j] = observed.loc[i, j] / adata_select[adata_select.obs[cluster_key] == i].shape[0]
140+
141+
adata.uns[f"{cluster_key}_spatial_link"] = observed
142+
143+
144+
def _calculate_composition_ratio(df, library_key, niche_key, celltype_key, cutoff, selected):
145+
df_select = df[df[library_key] == selected]
146+
niche_ratios = df_select[niche_key].value_counts(normalize=True)
147+
niche_to_keep = niche_ratios[niche_ratios >= cutoff].index
148+
149+
result = []
150+
for niche in niche_to_keep:
151+
df_slice = df_select[df_select[niche_key] == niche]
152+
celltype_ratios = df_slice[celltype_key].value_counts(normalize=True)
153+
niche_ratio = niche_ratios[niche]
154+
row = {
155+
library_key: selected,
156+
niche_key: niche,
157+
'Niche_ratio': niche_ratio,
158+
**{f'{c}_ratio': ratio for c, ratio in celltype_ratios.items()}
159+
}
160+
result.append(row)
161+
162+
return pd.DataFrame(result)
163+
164+
165+
def _calculate_average_exp(df, library_key, niche_key, celltype_key, gene_list, cutoff, selected_celltype, selected):
166+
df_select = df[df[library_key] == selected]
167+
niche_ratios = df_select[niche_key].value_counts(normalize=True)
168+
niche_to_keep = niche_ratios[niche_ratios >= cutoff].index
169+
170+
result = []
171+
for niche in niche_to_keep:
172+
df_slice = df_select[(df_select[niche_key] == niche) & (df_select[celltype_key].isin(selected_celltype))]
173+
avg_values = {gene: df_slice[gene].mean() for gene in gene_list}
174+
niche_ratio = niche_ratios[niche]
175+
row = {
176+
library_key: selected,
177+
niche_key: niche,
178+
'Niche_ratio': niche_ratio,
179+
**avg_values
180+
}
181+
result.append(row)
182+
183+
return pd.DataFrame(result)
184+
185+
186+
def calculate_composition_multi(adata: AnnData, library_key: str, niche_key: str, celltype_key: str, cutoff: float = 0.05):
187+
obs = adata.obs.copy()
188+
obs[celltype_key] = obs[celltype_key].astype('str')
189+
190+
with ThreadPoolExecutor() as executor:
191+
results = [
192+
executor.submit(
193+
_calculate_composition_ratio, obs, library_key, niche_key, celltype_key, cutoff, sample
194+
) for sample in obs[library_key].unique()
195+
]
196+
197+
dfs = [r.result() for r in results]
198+
199+
df = pd.concat(dfs, ignore_index=True)
200+
201+
adata.uns["composition_multi"] = df
202+
203+
204+
def calculate_average_exp_multi(adata: AnnData, layer_key: str, library_key: str, niche_key: str, celltype_key: str,
205+
gene_list: Optional[list] = None, selected_celltype: Optional[list] = None, cutoff: float = 0.05):
206+
adata_use = adata.copy()
207+
if layer_key not in adata_use.layers.keys():
208+
adata_use.layers[layer_key] = adata_use.X
209+
exp = adata_use.to_df(layer=layer_key)
210+
if gene_list is None:
211+
gene_list = list(exp.columns)
212+
213+
obs = adata_use.obs.copy()
214+
obs = pd.concat([obs, exp], axis=1)
215+
216+
if selected_celltype is None:
217+
selected_celltype = list(set(obs[celltype_key]))
218+
219+
with ThreadPoolExecutor() as executor:
220+
results = [
221+
executor.submit(
222+
_calculate_average_exp, obs, library_key, niche_key, celltype_key, gene_list, cutoff, selected_celltype, sample
223+
) for sample in obs[library_key].unique()
224+
]
225+
226+
dfs = [r.result() for r in results]
227+
228+
df = pd.concat(dfs, ignore_index=True)
229+
230+
adata.uns["expression_multi"] = df
231+
232+
233+
def average_exp(adata: AnnData, layer_key: str, id_key: str, val_key: str, select_idx: Optional[list] = None,
234+
select_val: Optional[list] = None):
235+
adata_use = adata.copy()
236+
if select_idx is not None:
237+
adata_use = adata_use[adata_use.obs[id_key].isin(select_idx)].copy()
238+
239+
if select_val is not None:
240+
adata_use = adata_use[adata_use.obs[val_key].isin(select_val)].copy()
241+
242+
if layer_key not in adata_use.layers.keys():
243+
adata_use.layers[layer_key] = adata_use.X
244+
245+
df = adata_use.to_df(layer=layer_key)
246+
average_df = df.groupby(adata_use.obs[id_key]).mean()
247+
248+
return average_df
249+
250+
251+
def cal_composition(adata: AnnData, id_key: str, val_key: str, ):
252+
obs = adata.obs.copy()
253+
df = obs.groupby([val_key, id_key]).size().unstack().fillna(0)
254+
df = df.div(df.sum(axis=1), axis=0)
255+
return df
256+
257+
258+
259+
260+

scniche/datasets/__init__.py

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from ._dataset import *
2+
3+
4+
__all__ = [
5+
"simulated_data",
6+
"mouse_spleen_codex",
7+
"human_utuc_imc",
8+
"human_tnbc_mibi_tof",
9+
"mouse_liver_seq_scope",
10+
]

scniche/datasets/_dataset.py

+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import scanpy as sc
2+
from pathlib import Path
3+
4+
5+
def simulated_data():
6+
"""
7+
Simulated dataset generated by scCbe (Nat Commun, 2024).
8+
The human PBMCs scRNA-seq dataset from Kang et al. (Nat Biotechnol, 2018) contains eight control and eight IFN-β treated samples
9+
:return:
10+
"""
11+
HERE = Path(__file__).parent
12+
filename = HERE / "simulated_data.h5ad.gz"
13+
adata = sc.read_h5ad(filename)
14+
return adata
15+
16+
17+
def mouse_spleen_codex():
18+
"""
19+
Processed mouse spleen CODEX dataset from Goltsev et al. (Cell, 2018), containing 1 wild-type spleen sample
20+
(BALBc-1) with the compartment labels.
21+
22+
This downloads 25.83 MB of data upon the first call of the function and stores it in `./scniche_data/BALBc-1.h5ad`.
23+
:return: AnnData
24+
"""
25+
url = "https://figshare.com/ndownloader/files/46669087"
26+
datasetdir = './scniche_data/BALBc-1.h5ad'
27+
adata = sc.read(datasetdir, backup_url=url)
28+
return adata
29+
30+
31+
def human_utuc_imc():
32+
"""
33+
Processed human upper tract urothelial carcinoma (UTUC) IMC dataset from Ohara et al. (Nat Commun, 2024),
34+
containing 16 images with manually annotated topological domain labels.
35+
36+
This downloads 70.62 MB of data upon the first call of the function and stores it in `./scniche_data/UTUC.h5ad`.
37+
:return: AnnData
38+
"""
39+
url = "https://figshare.com/ndownloader/files/46669108"
40+
datasetdir = './scniche_data/UTUC.h5ad'
41+
adata = sc.read(datasetdir, backup_url=url)
42+
return adata
43+
44+
45+
def human_tnbc_mibi_tof():
46+
"""
47+
Processed human triple-negative breast cancer (TNBC) MIBI-TOF dataset from Keren et al. (Cell, 2018),
48+
containing 173,205 cells from 19 mixed subtype samples and 15 compartmentalized subtype samples.
49+
The cell niches have been identified by scNiche.
50+
51+
This downloads 106.84 MB of data upon the first call of the function and stores it in `./scniche_data/TNBC.h5ad`.
52+
:return: AnnData
53+
"""
54+
url = "https://figshare.com/ndownloader/files/47153020"
55+
datasetdir = './scniche_data/TNBC.h5ad'
56+
adata = sc.read(datasetdir, backup_url=url)
57+
58+
return adata
59+
60+
61+
def mouse_liver_seq_scope():
62+
"""
63+
Processed mouse liver Seq-Scope dataset from Cho et al. (Cell, 2021), containing 37,505 cells from 6 normal donors
64+
and 4 early-onset liver failure induced by excessive mTORC1 signaling (Tsc1Δhep/Depdc5Δhep or TD model) donors.
65+
The cell niches have been identified by scNiche.
66+
67+
This downloads 57.15 MB of data upon the first call of the function and stores it in `./scniche_data/Liver.h5ad`.
68+
:return: AnnData
69+
"""
70+
url = "https://figshare.com/ndownloader/files/47153125"
71+
datasetdir = './scniche_data/Liver.h5ad'
72+
adata = sc.read(datasetdir, backup_url=url)
73+
74+
return adata
75+
5.14 MB
Binary file not shown.

scniche/plot/__init__.py

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from ._utils import *
2+
from . import palettes
3+
4+
5+
__all__ = [
6+
"palettes",
7+
"stacked_barplot",
8+
"enrichment_heatmap",
9+
"multi_boxplot",
10+
"multi_lineplot",
11+
"multi_linrplot_group",
12+
]

0 commit comments

Comments
 (0)