Merge pull request #1041 from YosefLab/DE_string_queries

De string queries
scverse · Apr 27, 2021 · 75eeb81 · 75eeb81
2 parents 19b9c4f + 26c04ee
commit 75eeb81
Show file tree

Hide file tree

Showing 9 changed files with 111 additions and 24 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -36,13 +36,13 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
     "sphinx.ext.autodoc",
+    "sphinx.ext.intersphinx",
     "sphinx.ext.viewcode",
     "nbsphinx",
     "nbsphinx_link",
     "sphinx.ext.mathjax",
     "sphinx.ext.napoleon",
     "sphinx_autodoc_typehints",  # needs to be after napoleon
-    "sphinx.ext.intersphinx",
     "sphinx.ext.autosummary",
     "scanpydoc.elegant_typehints",
     "scanpydoc.definition_list_typed_field",
@@ -80,8 +80,8 @@
     anndata=("https://anndata.readthedocs.io/en/stable/", None),
     ipython=("https://ipython.readthedocs.io/en/stable/", None),
     matplotlib=("https://matplotlib.org/", None),
-    numpy=("https://docs.scipy.org/doc/numpy/", None),
-    pandas=("https://pandas.pydata.org/pandas-docs/stable/", None),
+    numpy=("https://numpy.org/doc/stable/", None),
+    pandas=("https://pandas.pydata.org/docs/", None),
     python=("https://docs.python.org/3", None),
     scipy=("https://docs.scipy.org/doc/scipy/reference/", None),
     sklearn=("https://scikit-learn.org/stable/", None),

diff --git a/docs/release_notes/v0.11.0.rst b/docs/release_notes/v0.11.0.rst
@@ -0,0 +1,27 @@
+New in 0.11.0 (2021-04-28)
+--------------------------
+
+Changes
+~~~~~~~
+- Includes new optional variance parameterization for the `Encoder` module (`#1037`_).
+- Provides new way to select subpopulations for DE using Pandas queries (`#1041`_).
+- Optional pseudocounts and automatic effect-size threshold for DE in the change mode (`#1043`_).
+
+
+Contributors
+~~~~~~~~~~~~
+- `@adamgayoso`_
+- `@romain-lopez`_
+- `@PierreBoyeau`_
+
+
+.. _`@adamgayoso`: https://github.com/adamgayoso
+.. _`@romain-lopez`: https://github.com/romain-lopez
+.. _`@PierreBoyeau`: https://github.com/PierreBoyeau
+
+
+.. _`#1037`: https://github.com/YosefLab/scvi-tools/pull/1037
+.. _`#1041`: https://github.com/YosefLab/scvi-tools/pull/1041
+.. _`#1043`: https://github.com/YosefLab/scvi-tools/pull/1043
+
+
diff --git a/scvi/_docs.py b/scvi/_docs.py
@@ -13,11 +13,16 @@
     If `None`, compare each group in `group1` to the union of the rest of the groups
     in `groupby`. If a group identifier, compare with respect to this group.
 idx1
-    Boolean mask or indices for `group1`. `idx1` and `idx2` can be used as an alternative
-    to the AnnData keys. If `idx1` is not `None`, this option overrides `group1`
+    `idx1` and `idx2` can be used as an alternative to the AnnData keys.
+    Custom identifier for `group1` that can be of three sorts: (1) a boolean mask,
+    (2) indices, or (3) a string. If it is a string, it selects the cells that
+    satisfy conditions on `adata.obs`, as described in :meth:`pandas.DataFrame.query`.
+    If `idx1` is not `None`, this option overrides `group1`
     and `group2`.
 idx2
-    Boolean mask or indices for `group2`. By default, includes all cells not specified in
+    Custom identifier for `group2` that has the same
+    properties as `idx1`.
+    By default, includes all cells not specified in
     `idx1`.
 mode
     Method for differential expression. See user guide for full explanation.

diff --git a/scvi/model/_peakvi.py b/scvi/model/_peakvi.py
@@ -359,8 +359,8 @@ def differential_accessibility(
         groupby: Optional[str] = None,
         group1: Optional[Iterable[str]] = None,
         group2: Optional[str] = None,
-        idx1: Optional[Union[Sequence[int], Sequence[bool]]] = None,
-        idx2: Optional[Union[Sequence[int], Sequence[bool]]] = None,
+        idx1: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
+        idx2: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
         mode: Literal["vanilla", "change"] = "change",
         delta: float = 0.05,
         batch_size: Optional[int] = None,

diff --git a/scvi/model/_totalvi.py b/scvi/model/_totalvi.py
@@ -654,8 +654,8 @@ def differential_expression(
         groupby: Optional[str] = None,
         group1: Optional[Iterable[str]] = None,
         group2: Optional[str] = None,
-        idx1: Optional[Union[Sequence[int], Sequence[bool]]] = None,
-        idx2: Optional[Union[Sequence[int], Sequence[bool]]] = None,
+        idx1: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
+        idx2: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
         mode: Literal["vanilla", "change"] = "change",
         delta: float = 0.25,
         batch_size: Optional[int] = None,

diff --git a/scvi/model/base/_base_model.py b/scvi/model/base/_base_model.py
@@ -254,7 +254,7 @@ def save(
         save_anndata
             If True, also saves the anndata
         anndata_write_kwargs
-            Kwargs for :func:`~anndata.AnnData.write`
+            Kwargs for :meth:`~anndata.AnnData.write`
         """
         # get all the user attributes
         user_attributes = self._get_user_attributes()

diff --git a/scvi/model/base/_rnamixin.py b/scvi/model/base/_rnamixin.py
@@ -162,8 +162,8 @@ def differential_expression(
         groupby: Optional[str] = None,
         group1: Optional[Iterable[str]] = None,
         group2: Optional[str] = None,
-        idx1: Optional[Union[Sequence[int], Sequence[bool]]] = None,
-        idx2: Optional[Union[Sequence[int], Sequence[bool]]] = None,
+        idx1: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
+        idx2: Optional[Union[Sequence[int], Sequence[bool], str]] = None,
         mode: Literal["vanilla", "change"] = "change",
         delta: float = 0.25,
         batch_size: Optional[int] = None,

diff --git a/scvi/model/base/_utils.py b/scvi/model/base/_utils.py
@@ -3,12 +3,12 @@
 import pickle
 import warnings
 from collections.abc import Iterable as IterableClass
-from typing import Optional
+from typing import List, Optional, Union
 
 import numpy as np
 import pandas as pd
 import torch
-from anndata import read
+from anndata import AnnData, read
 
 from scvi._compat import Literal
 from scvi.utils import DifferentialComputation, track
@@ -86,6 +86,54 @@ def _validate_var_names(adata, source_var_names):
         )
 
 
+def _prepare_obs(
+    idx1: Union[List[bool], np.ndarray, str],
+    idx2: Optional[Union[List[bool], np.ndarray, str]],
+    adata: AnnData,
+):
+    """
+    Construct an array used for masking.
+
+    Given population identifiers `idx1` and, optionally, `idx2`,
+    this function creates an array `obs_col` with one entry per observation
+    of `adata` that labels both populations: entries selected by `idx1` are
+    set to ``"one"``, entries selected by `idx2` are set to ``"two"`` and all
+    remaining entries keep the placeholder ``"None"``.
+    `idx2` is applied after `idx1`, so observations selected by both end up
+    labelled ``"two"``.
+
+    Parameters
+    ----------
+    idx1
+        Can be of three types. First, it can correspond to a boolean mask with
+        one entry per observation of `adata`. It can also correspond to a list
+        of integer indices. Last, it can be a string query on `adata.obs`
+        columns, evaluated with :meth:`pandas.DataFrame.query`.
+    idx2
+        Same as `idx1`, or `None` if no second population is specified.
+    adata
+        AnnData object whose `obs` is queried and whose number of observations
+        sets the length of `obs_col`.
+
+    Returns
+    -------
+    A tuple `(obs_col, group1, group2)` where `obs_col` is the label array
+    described above, `group1` is ``["one"]`` and `group2` is ``"two"``
+    (or `None` when `idx2` is `None`).
+
+    Raises
+    ------
+    ValueError
+        If `idx1` or a provided `idx2` selects no observation.
+    """
+
+    def ravel_idx(my_idx, obs_df):
+        # A string is a query on obs; turn the matching rows into a boolean mask.
+        if isinstance(my_idx, str):
+            return obs_df.index.isin(obs_df.query(my_idx).index)
+        return np.asarray(my_idx).ravel()
+
+    def n_selected(idx):
+        # A boolean mask selects one observation per True entry,
+        # an index array selects one observation per element.
+        return int(np.count_nonzero(idx)) if idx.dtype == bool else idx.size
+
+    obs_df = adata.obs
+    g1_key = "one"
+    group1 = [g1_key]
+    group2 = None if idx2 is None else "two"
+
+    idx1 = ravel_idx(idx1, obs_df)
+    if idx2 is not None:
+        idx2 = ravel_idx(idx2, obs_df)
+
+    # Validate before indexing: an empty list becomes a float array, which
+    # NumPy refuses to use as an index, and `obs_col[None]` would silently
+    # add an axis instead of selecting observations.
+    if n_selected(idx1) == 0 or (idx2 is not None and n_selected(idx2) == 0):
+        raise ValueError("One of idx1 or idx2 has size zero.")
+
+    obs_col = np.array(["None"] * adata.shape[0], dtype=str)
+    obs_col[idx1] = g1_key
+    if idx2 is not None:
+        obs_col[idx2] = group2
+    return obs_col, group1, group2
+
+
 def _de_core(
     adata,
     model_fn,
@@ -120,18 +168,10 @@ def _de_core(
     # make a temp obs key using indices
     temp_key = None
     if idx1 is not None:
-        idx1 = np.asarray(idx1).ravel()
-        g1_key = "one"
-        obs_col = np.array(["None"] * adata.shape[0], dtype=str)
-        obs_col[idx1] = g1_key
-        group2 = None if idx2 is None else "two"
-        if idx2 is not None:
-            idx2 = np.asarray(idx2).ravel()
-            obs_col[idx2] = group2
+        obs_col, group1, group2 = _prepare_obs(idx1, idx2, adata)
         temp_key = "_scvi_temp_de"
         adata.obs[temp_key] = obs_col
         groupby = temp_key
-        group1 = [g1_key]
 
     df_results = []
     dc = DifferentialComputation(model_fn, adata)

diff --git a/tests/core/test_differential.py b/tests/core/test_differential.py
@@ -5,6 +5,7 @@
 
 from scvi.data import synthetic_iid
 from scvi.model import SCVI
+from scvi.model.base._utils import _prepare_obs
 from scvi.utils import DifferentialComputation
 
 
@@ -50,6 +51,20 @@ def m1_domain_fn_test(samples):
         adata[adata.obs["labels"] == "label_1"], groupby="batch"
     )
 
+    # Test query features
+    obs_col, group1, _, = _prepare_obs(
+        idx1="(labels == 'label_1') & (batch == 'batch_1')", idx2=None, adata=adata
+    )
+    assert (obs_col == group1).sum() == adata.obs.loc[
+        lambda x: (x.labels == "label_1") & (x.batch == "batch_1")
+    ].shape[0]
+    model.differential_expression(
+        idx1="labels == 'label_1'",
+    )
+    model.differential_expression(
+        idx1="labels == 'label_1'", idx2="(labels == 'label_2') & (batch == 'batch_1')"
+    )
+
     # test that ints as group work
     a = synthetic_iid()
     a.obs["test"] = [0] * 200 + [1] * 200