theislab · mumichae · Apr 13, 2022 · Sep 27, 2021 · Sep 27, 2021 · Sep 27, 2021
diff --git a/scIB/metrics/graph_connectivity.py b/scIB/metrics/graph_connectivity.py
@@ -3,25 +3,29 @@
 from scipy.sparse.csgraph import connected_components
 
 
-def graph_connectivity(adata_post, label_key):
+def graph_connectivity(adata, label_key):
     """"
-    Metric that quantifies how connected the subgraph corresponding to each batch cluster is.
+    Quantify how connected the neighborhood subgraph of each cell identity label is.
+    Per label: #cells_in_largest_connected_component / #cells_with_that_label
+    Final score: mean of the per-label fractions
+
+    :param adata: adata with computed neighborhood graph ('neighbors' in adata.uns, 'connectivities' in adata.obsp)
+    :param label_key: name of the categorical column in adata.obs containing the cell identity labels
     """
-    if 'neighbors' not in adata_post.uns:
-        raise KeyError('Please compute the neighborhood graph before running this '
-                       'function!')
+    if 'neighbors' not in adata.uns:
+        raise KeyError(
+            'Please compute the neighborhood graph before running this function!'
+        )
 
     clust_res = []
 
-    for ct in adata_post.obs[label_key].cat.categories:
-        adata_post_sub = adata_post[adata_post.obs[label_key].isin([ct]),]
-        _,labs = connected_components(
-            adata_post_sub.obsp['connectivities'],
+    for label in adata.obs[label_key].cat.categories:
+        adata_sub = adata[adata.obs[label_key].isin([label])]
+        _, labels = connected_components(
+            adata_sub.obsp['connectivities'],
             connection='strong'
         )
-        tab = pd.value_counts(labs)
-        clust_res.append(tab[0]/sum(tab))
+        tab = pd.value_counts(labels)
+        clust_res.append(tab.max() / sum(tab))
 
     return np.mean(clust_res)
-
-
diff --git a/scIB/metrics/highly_variable_genes.py b/scIB/metrics/highly_variable_genes.py
@@ -28,32 +28,20 @@ def hvg_overlap(adata_pre, adata_post, batch, n_hvg=500, verbose=False):
     adata_post_list = splitBatches(adata_post, batch)
     overlap = []
 
-    if ('hvg_before' in adata_pre.uns_keys()) and (set(hvg_post) == set(adata_pre.var_names)):
-        print('Using precomputed hvgs per batch')
-        hvg_pre_list = adata_pre.uns['hvg_before']
-    else:
-        hvg_pre_list = precompute_hvg_batch(adata_pre, batch, hvg_post)
-
-        for i in range(len(adata_post_list)):  # range(len(adata_pre_list)):
-            sc.pp.filter_genes(adata_post_list[i], min_cells=1)  # remove genes unexpressed (otherwise hvg might break)
-
-            # ov = list(set(adata_pre_list[i].var_names).intersection(set(hvg_pre_list[i])))
-            # adata_pre_list[i] = adata_pre_list[i][:,ov]
-            # adata_post_list[i] = adata_post_list[i][:,ov]
-            batch_var = adata_post_list[i].obs[batch][0]
-
-            n_hvg_tmp = len(hvg_pre_list[batch_var])
-            # adata_pre.uns['n_hvg'][hvg_post]#np.minimum(n_hvg, int(0.5*adata_post_list[i].n_vars))
-            if verbose:
-                print(n_hvg_tmp)
-            # if n_hvg_tmp<n_hvg:
-            #    print(adata_post_list[i].obs[batch][0]+' has less than the specified number of genes')
-            #    print('Number of genes: '+str(adata_post_list[i].n_vars))
-            # hvg_pre = sc.pp.highly_variable_genes(adata_pre_list[i], flavor='cell_ranger', n_top_genes=n_hvg_tmp, inplace=False)
-            tmp_pre = hvg_pre_list[batch_var]  # adata_pre_list[i].var.index[hvg_pre['highly_variable']]
-            hvg_post = sc.pp.highly_variable_genes(adata_post_list[i], flavor='cell_ranger', n_top_genes=n_hvg_tmp,
-                                                   inplace=False)
-            tmp_post = adata_post_list[i].var.index[hvg_post['highly_variable']]
-            n_hvg_real = np.minimum(len(tmp_pre), len(tmp_post))
-            overlap.append((len(set(tmp_pre).intersection(set(tmp_post)))) / n_hvg_real)
+    hvg_pre_list = precompute_hvg_batch(adata_pre, batch, hvg_post)
+
+    for i in range(len(adata_post_list)):  # range(len(adata_pre_list)):
+        sc.pp.filter_genes(adata_post_list[i], min_cells=1)  # remove genes unexpressed (otherwise hvg might break)
+        batch_var = adata_post_list[i].obs[batch][0]
+        n_hvg_tmp = len(hvg_pre_list[batch_var])
+
+        if verbose:
+            print(n_hvg_tmp)
+
+        tmp_pre = hvg_pre_list[batch_var]
+        hvg_post = sc.pp.highly_variable_genes(adata_post_list[i], flavor='cell_ranger', n_top_genes=n_hvg_tmp,
+                                               inplace=False)
+        tmp_post = adata_post_list[i].var.index[hvg_post['highly_variable']]
+        n_hvg_real = np.minimum(len(tmp_pre), len(tmp_post))
+        overlap.append((len(set(tmp_pre).intersection(set(tmp_post)))) / n_hvg_real)
     return np.mean(overlap)
diff --git a/tests/metrics/test_graph_connectivity.py b/tests/metrics/test_graph_connectivity.py
@@ -0,0 +1,7 @@
+from tests.common import *
+
+
+def test_graph_connectivity(adata_neighbors):
+    score = scIB.me.graph_connectivity(adata_neighbors, label_key='celltype')  # mean largest-component fraction per label
+    LOGGER.info(f"score: {score}")
+    assert abs(score - 0.9670013350457753) < 1e-6  # tolerance instead of exact float equality (robust across platforms)