De delta eps #1043

Merged · 15 commits · May 6, 2021

Changes from 3 commits
97 changes: 93 additions & 4 deletions scvi/utils/_differential.py
@@ -6,6 +6,8 @@
import numpy as np
import pandas as pd
import torch
from scipy.sparse import issparse
from sklearn.mixture import GaussianMixture

from scvi._compat import Literal

@@ -48,6 +50,7 @@ def get_bayes_factors(
change_fn: Optional[Union[str, Callable]] = None,
m1_domain_fn: Optional[Callable] = None,
delta: Optional[float] = 0.5,
eps: Optional[float] = 0.0,
cred_interval_lvls: Optional[Union[List[float], np.ndarray]] = None,
) -> Dict[str, np.ndarray]:
r"""
@@ -165,7 +168,6 @@
# "Differential expression requires a Posterior object created with all indices."
# )

-    eps = 1e-8  # used for numerical stability
# Normalized means sampling for both populations
scales_batches_1 = self.scale_sampler(
selection=idx1,
@@ -236,6 +238,18 @@
m_permutation=m_permutation,
)

# Adding pseudocounts to the scales
if eps is None:
logger.debug("Estimating pseudocounts offet from the data")
where_zero_a = densify(np.max(self.adata[idx1].X, 0)) == 0
where_zero_b = densify(np.max(self.adata[idx2].X, 0)) == 0
eps = estimate_pseudocounts_offset(
scales_a=scales_1,
scales_b=scales_2,
where_zero_a=where_zero_a,
where_zero_b=where_zero_b,
)
logger.debug("Using epsilon ~ {}".format(eps))
# Core of function: hypotheses testing based on the posterior samples we obtained above
if mode == "vanilla":
logger.debug("Differential expression using vanilla mode")
@@ -254,7 +268,7 @@

# step 1: Construct the change function
def lfc(x, y):
-    return np.log2(x) - np.log2(y)
+    return np.log2(x + eps) - np.log2(y + eps)

if change_fn == "log-fold" or change_fn is None:
change_fn = lfc
@@ -263,10 +277,15 @@ def lfc(x, y):

# step2: Construct the DE area function
if m1_domain_fn is None:
-    delta = delta if delta is not None else 0.5

    def m1_domain_fn(samples):
-        return np.abs(samples) >= delta
+        delta_ = (
+            delta
+            if delta is not None
+            else estimate_delta(lfc_means=samples.mean(0))
+        )
+        logger.debug("Using delta ~ {:.2f}".format(delta_))
+        return np.abs(samples) >= delta_

change_fn_specs = inspect.getfullargspec(change_fn)
domain_fn_specs = inspect.getfullargspec(m1_domain_fn)
@@ -403,6 +422,70 @@ def scale_sampler(
return dict(scale=px_scales, batch=batch_ids)


def estimate_delta(lfc_means: np.ndarray, coef: float = 0.6, min_thres: float = 0.3):
"""Computes a threshold LFC value based on means of LFCs.

Parameters
----------
lfc_means
    LFC means for each gene; expected to be 1-d.
coef
    Tunable hyperparameter used to derive the threshold from the estimated modes; defaults to 0.6.
min_thres
    Minimum returned threshold value; defaults to 0.3.
"""
logger.debug("Estimating delta from effect size samples")
assert lfc_means.ndim == 1
gmm = GaussianMixture(n_components=3)
gmm.fit(lfc_means[:, None])
vals = np.sort(gmm.means_.squeeze())
res = coef * np.abs(vals[[0, -1]]).mean()
res = np.maximum(min_thres, res)
return res
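
For intuition, here is a minimal sketch of how `estimate_delta` behaves on simulated effect sizes (it mirrors the new test added below; the exact value varies with the random draw):

```python
import numpy as np

from scvi.utils._differential import estimate_delta

# Simulated per-gene LFC means: null genes near 0, DE genes near +/-3.
lfc_means = np.concatenate(
    [np.random.randn(100), 3 + np.random.randn(100), -3 + np.random.randn(100)]
)
# The two extreme GMM modes sit near -3 and 3, so this returns roughly 0.6 * 3 = 1.8.
delta = estimate_delta(lfc_means)
```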


def estimate_pseudocounts_offset(
    scales_a: np.ndarray,
    scales_b: np.ndarray,
    where_zero_a: np.ndarray,
    where_zero_b: np.ndarray,
):
"""Determines pseudocount offset to shrink
LFCs asssociated with non-expressed genes to zero.
PierreBoyeau marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
scales_a
    Scales in the first population.
scales_b
    Scales in the second population.
where_zero_a
    Boolean mask of genes with no observed counts in the first population.
where_zero_b
    Boolean mask of genes with no observed counts in the second population.
"""

max_scales_a = np.max(scales_a, 0)
max_scales_b = np.max(scales_b, 0)
assert max_scales_a.shape == where_zero_a.shape
assert max_scales_b.shape == where_zero_b.shape
assert where_zero_a.shape == where_zero_b.shape

if where_zero_a.sum() >= 1:
artefact_scales_a = max_scales_a[where_zero_a]
eps_a = np.percentile(artefact_scales_a, q=90)
[Review comment — Member] Maybe we should make this percentile an argument with a default value.

[Reply — Contributor Author] Do you think this should be a parameter in get_bayes_factors? I replaced this default value in estimate_pseudocounts_offset but not in get_bayes_factors yet.

else:
eps_a = 1e-10

if where_zero_b.sum() >= 1:
artefact_scales_b = max_scales_b[where_zero_b]
eps_b = np.percentile(artefact_scales_b, q=90)
else:
eps_b = 1e-10
res = np.maximum(eps_a, eps_b)
return res
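
Following up on the review thread above, a hypothetical variant that exposes the percentile as a keyword argument might look like this sketch (illustrative only; the `percentile` parameter and the `_v2` name are not part of this PR):

```python
import numpy as np


def estimate_pseudocounts_offset_v2(
    scales_a: np.ndarray,
    scales_b: np.ndarray,
    where_zero_a: np.ndarray,
    where_zero_b: np.ndarray,
    percentile: float = 90.0,
):
    """Same logic as above, with a tunable percentile for the artefact scales."""
    max_scales_a = np.max(scales_a, 0)
    max_scales_b = np.max(scales_b, 0)
    eps_a = (
        np.percentile(max_scales_a[where_zero_a], q=percentile)
        if where_zero_a.sum() >= 1
        else 1e-10
    )
    eps_b = (
        np.percentile(max_scales_b[where_zero_b], q=percentile)
        if where_zero_b.sum() >= 1
        else 1e-10
    )
    return np.maximum(eps_a, eps_b)
```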


def pairs_sampler(
arr1: Union[List[float], np.ndarray, torch.Tensor],
arr2: Union[List[float], np.ndarray, torch.Tensor],
@@ -577,3 +660,9 @@ def save_cluster_xlsx(
for i, x in enumerate(cluster_names):
de_results[i].to_excel(writer, sheet_name=str(x))
writer.close()


def densify(arr):
    """Return ``arr`` densified and squeezed to 1-d if it is sparse, otherwise unchanged."""
    if issparse(arr):
        return np.asarray(arr.todense()).squeeze()
    return arr
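
A quick usage sketch for `densify` (assuming a SciPy sparse input):

```python
import numpy as np
from scipy.sparse import csr_matrix

row = csr_matrix(np.array([[0.0, 1.0, 2.0]]))
densify(row)  # -> array([0., 1., 2.]), converted to dense and squeezed to 1-d
densify(np.array([0.0, 1.0, 2.0]))  # dense input is returned unchanged
```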
40 changes: 40 additions & 0 deletions tests/core/test_differential.py
@@ -6,6 +6,40 @@
from scvi.data import synthetic_iid
from scvi.model import SCVI
from scvi.utils import DifferentialComputation
from scvi.utils._differential import estimate_delta, estimate_pseudocounts_offset


def test_features():
a = np.random.randn(100)
b = 3 + np.random.randn(100)
c = -3 + np.random.randn(100)
alls = np.concatenate([a, b, c])
delta = estimate_delta(alls)
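# The GMM modes sit near -3, 0, and 3, so the estimate is roughly 0.6 * 3 = 1.8.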
assert delta >= 0.4 * 3
assert delta <= 6

scales_a = np.random.rand(100, 50)
where_zero_a = np.zeros(50, dtype=bool)
where_zero_a[:10] = True
scales_a[:, :10] = 1e-6

scales_b = np.random.rand(100, 50)
where_zero_b = np.zeros(50, dtype=bool)
where_zero_b[-10:] = True
scales_b[:, -10:] = 1e-7
offset = estimate_pseudocounts_offset(
scales_a=scales_a,
scales_b=scales_b,
where_zero_a=where_zero_a,
where_zero_b=where_zero_b,
)
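# Scales for the zero-count genes are ~1e-6 and ~1e-7, so the offset should be at most 1e-6.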
assert offset <= 1e-6


def test_differential_computation(save_path):
@@ -23,6 +57,12 @@ def test_differential_computation(save_path):

dc.get_bayes_factors(cell_idx1, cell_idx2, mode="vanilla", use_permutation=True)
dc.get_bayes_factors(cell_idx1, cell_idx2, mode="change", use_permutation=False)
dc.get_bayes_factors(
cell_idx1, cell_idx2, mode="change", use_permutation=False, delta=None
)
dc.get_bayes_factors(
cell_idx1, cell_idx2, mode="change", use_permutation=False, delta=None, eps=None
)
dc.get_bayes_factors(cell_idx1, cell_idx2, mode="change", cred_interval_lvls=[0.75])

delta = 0.5