Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update highly_variable_genes.py #269

Merged
merged 9 commits into from
Apr 13, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 43 additions & 28 deletions scib/metrics/highly_variable_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@


def precompute_hvg_batch(adata, batch, features, n_hvg=500, save_hvg=False):
"""
Compute HVGs per batch

:param adata: anndata object
:param batch: key in adata.obs
:param features: features to subset to
:param n_hvg: maximum number of HVGs to compute
:save_hvg: whether to add hvg per batch information to adata object
:return:
dictionary of batch to HVG list
"""
adata_list = split_batches(adata, batch, hvg=features)
hvg_dir = {}
for i in adata_list:
Expand All @@ -23,37 +34,41 @@ def precompute_hvg_batch(adata, batch, features, n_hvg=500, save_hvg=False):


def hvg_overlap(adata_pre, adata_post, batch, n_hvg=500, verbose=False):
"""
Metric that computes the average percentage of overlapping
highly variable genes per batch pre post integration.

:param adata_pre: Unintegrated anndata object
:param adata_post: Integrated anndata object
:param batch: Batch variable
:param n_hvg: Number of hvgs to compute per batch
:return:
Average percentage of overlapping highly variable genes
"""
hvg_post = adata_post.var_names

adata_post_list = split_batches(adata_post, batch)
overlap = []

if ('hvg_before' in adata_pre.uns_keys()) and (set(hvg_post) == set(adata_pre.var_names)):
print('Using precomputed hvgs per batch')
hvg_pre_list = adata_pre.uns['hvg_before']
else:
hvg_pre_list = precompute_hvg_batch(adata_pre, batch, hvg_post)

for i in range(len(adata_post_list)): # range(len(adata_pre_list)):
sc.pp.filter_genes(adata_post_list[i], min_cells=1) # remove genes unexpressed (otherwise hvg might break)

# ov = list(set(adata_pre_list[i].var_names).intersection(set(hvg_pre_list[i])))
# adata_pre_list[i] = adata_pre_list[i][:,ov]
# adata_post_list[i] = adata_post_list[i][:,ov]
batch_var = adata_post_list[i].obs[batch][0]

n_hvg_tmp = len(hvg_pre_list[batch_var])
# adata_pre.uns['n_hvg'][hvg_post]#np.minimum(n_hvg, int(0.5*adata_post_list[i].n_vars))
if verbose:
print(n_hvg_tmp)
# if n_hvg_tmp<n_hvg:
# print(adata_post_list[i].obs[batch][0]+' has less than the specified number of genes')
# print('Number of genes: '+str(adata_post_list[i].n_vars))
# hvg_pre = sc.pp.highly_variable_genes(adata_pre_list[i], flavor='cell_ranger', n_top_genes=n_hvg_tmp, inplace=False)
tmp_pre = hvg_pre_list[batch_var] # adata_pre_list[i].var.index[hvg_pre['highly_variable']]
hvg_post = sc.pp.highly_variable_genes(adata_post_list[i], flavor='cell_ranger', n_top_genes=n_hvg_tmp,
inplace=False)
tmp_post = adata_post_list[i].var.index[hvg_post['highly_variable']]
n_hvg_real = np.minimum(len(tmp_pre), len(tmp_post))
overlap.append((len(set(tmp_pre).intersection(set(tmp_post)))) / n_hvg_real)
hvg_pre_list = precompute_hvg_batch(adata_pre, batch, hvg_post)

for ad_post in adata_post_list: # range(len(adata_pre_list)):
# remove genes unexpressed (otherwise hvg might break)
sc.pp.filter_genes(ad_post, min_cells=1)
batch_var = ad_post.obs[batch][0]
n_hvg_tmp = len(hvg_pre_list[batch_var])

if verbose:
print(n_hvg_tmp)

tmp_pre = hvg_pre_list[batch_var]
hvg_post = sc.pp.highly_variable_genes(
ad_post,
flavor='cell_ranger',
n_top_genes=n_hvg_tmp,
inplace=False
)
tmp_post = ad_post.var.index[hvg_post['highly_variable']]
n_hvg_real = np.minimum(len(tmp_pre), len(tmp_post))
overlap.append((len(set(tmp_pre).intersection(set(tmp_post)))) / n_hvg_real)
return np.mean(overlap)