scverse · justjhong · Sep 23, 2021 · Sep 2, 2021 · Sep 2, 2021 · Sep 2, 2021
diff --git a/docs/api/developer.rst b/docs/api/developer.rst
@@ -90,6 +90,7 @@ Existing module classes with respective generative and inference procedures.
    module.TOTALVAE
    module.VAE
    module.VAEC
+   module.AmortizedLDAPyroModule
 
 
 External module

diff --git a/docs/api/user.rst b/docs/api/user.rst
@@ -30,6 +30,7 @@ Model
    model.SCVI
    model.TOTALVI
    model.MULTIVI
+   model.AmortizedLDA
 
 
 

diff --git a/docs/references.rst b/docs/references.rst
@@ -60,3 +60,7 @@ References
 .. [Lopez18] Romain Lopez, Jeffrey Regier, Michael Cole, Michael I. Jordan, Nir Yosef (2018),
    *Deep generative modeling for single-cell transcriptomics*,
    `Nature Methods <https://www.nature.com/articles/s41592-018-0229-2.epdf?author_access_token=5sMbnZl1iBFitATlpKkddtRgN0jAjWel9jnR3ZoTv0P1-tTjoP-mBfrGiMqpQx63aBtxToJssRfpqQ482otMbBw2GIGGeinWV4cULBLPg4L4DpCg92dEtoMaB1crCRDG7DgtNrM_1j17VfvHfoy1cQ%3D%3D>`__.
+
+.. [Blei03] David M. Blei, Andrew Y. Ng, Michael I. Jordan (2003),
+   *Latent Dirichlet Allocation*,
+   `Journal of Machine Learning Research <https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf>`__.
diff --git a/scvi/dataloaders/_anntorchdataset.py b/scvi/dataloaders/_anntorchdataset.py
@@ -5,7 +5,6 @@
 import h5py
 import numpy as np
 import pandas as pd
-import torch
 from anndata._core.sparse_dataset import SparseDataset
 from torch.utils.data import Dataset
 
@@ -93,7 +92,7 @@ def setup_getitem(self):
 
         self.attributes_and_types = keys_to_type
 
-    def __getitem__(self, idx: List[int]) -> Dict[str, torch.Tensor]:
+    def __getitem__(self, idx: List[int]) -> Dict[str, np.ndarray]:
         """Get tensors in dictionary from anndata at idx."""
         data_numpy = {}
         for key, dtype in self.attributes_and_types.items():

diff --git a/scvi/external/stereoscope/_model.py b/scvi/external/stereoscope/_model.py
@@ -1,5 +1,4 @@
 import logging
-import warnings
 from typing import Optional, Tuple, Union
 
 import numpy as np
@@ -220,10 +219,7 @@ def get_proportions(self, keep_noise=False) -> pd.DataFrame:
         keep_noise
             whether to account for the noise term as a standalone cell type in the proportion estimate.
         """
-        if self.is_trained_ is False:
-            warnings.warn(
-                "Trying to query inferred values from an untrained model. Please train the model first."
-            )
+        self._check_if_trained()
 
         column_names = self.cell_type_mapping
         if keep_noise:
@@ -249,10 +245,7 @@ def get_scale_for_ct(
         -------
         gene_expression
         """
-        if self.is_trained_ is False:
-            warnings.warn(
-                "Trying to query inferred values from an untrained model. Please train the model first."
-            )
+        self._check_if_trained()
         ind_y = np.array([np.where(ct == self.cell_type_mapping)[0][0] for ct in y])
         if ind_y.shape != y.shape:
             raise ValueError(

diff --git a/scvi/model/__init__.py b/scvi/model/__init__.py
@@ -1,3 +1,4 @@
+from ._amortizedlda import AmortizedLDA
 from ._autozi import AUTOZI
 from ._condscvi import CondSCVI
 from ._destvi import DestVI
@@ -18,4 +19,5 @@
     "CondSCVI",
     "DestVI",
     "MULTIVI",
+    "AmortizedLDA",
 ]
diff --git a/scvi/model/_amortizedlda.py b/scvi/model/_amortizedlda.py
@@ -0,0 +1,262 @@
+import collections.abc
+import logging
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import pandas as pd
+import pyro
+import torch
+from anndata import AnnData
+
+from scvi._constants import _CONSTANTS
+from scvi.module import AmortizedLDAPyroModule
+
+from .base import BaseModelClass, PyroSviTrainMixin
+
+logger = logging.getLogger(__name__)
+
+
+def _validate_prior(
+    prior: Optional[Union[float, Sequence[float]]],
+    prior_name: str,
+    expected_len: int,
+    len_name: str,
+) -> None:
+    """
+    Check that a prior is `None`, a float or a Sequence of the expected length.
+
+    Parameters
+    ----------
+    prior
+        Prior value to validate.
+    prior_name
+        Name of the argument, used in the error message.
+    expected_len
+        Length required when `prior` is a Sequence.
+    len_name
+        Name of the quantity `expected_len` stands for, used in the error message.
+
+    Raises
+    ------
+    ValueError
+        If `prior` is neither `None`, a float nor a Sequence of length `expected_len`.
+    """
+    if prior is None or isinstance(prior, float):
+        return
+    if not isinstance(prior, collections.abc.Sequence) or len(prior) != expected_len:
+        raise ValueError(
+            f"{prior_name}, {prior}, must be None, "
+            f"a float or a Sequence of length {len_name}."
+        )
+
+
+class AmortizedLDA(PyroSviTrainMixin, BaseModelClass):
+    """
+    Amortized Latent Dirichlet Allocation [Blei03]_.
+
+    Parameters
+    ----------
+    adata
+        AnnData object that has been registered via :func:`~scvi.data.setup_anndata`.
+    n_topics
+        Number of topics to model.
+    n_hidden
+        Number of nodes in the hidden layer of the encoder.
+    cell_topic_prior
+        Prior of cell topic distribution. A float or a Sequence of length `n_topics`.
+        If `None`, defaults to `1 / n_topics`.
+    topic_gene_prior
+        Prior of topic gene distribution. A float or a Sequence of length `n_vars`.
+        If `None`, defaults to `1 / n_topics`.
+
+    Examples
+    --------
+    >>> adata = anndata.read_h5ad(path_to_anndata)
+    >>> scvi.data.setup_anndata(adata)
+    >>> model = scvi.model.AmortizedLDA(adata)
+    >>> model.train()
+    >>> gene_by_topic = model.get_gene_by_topic()
+    >>> adata.obsm["X_LDA"] = model.get_latent_representation()
+    """
+
+    def __init__(
+        self,
+        adata: AnnData,
+        n_topics: int = 20,
+        n_hidden: int = 128,
+        cell_topic_prior: Optional[Union[float, Sequence[float]]] = None,
+        topic_gene_prior: Optional[Union[float, Sequence[float]]] = None,
+    ):
+        # in case any other model was created before that shares the same parameter names.
+        pyro.clear_param_store()
+
+        super().__init__(adata)
+
+        n_input = self.summary_stats["n_vars"]
+
+        _validate_prior(cell_topic_prior, "cell_topic_prior", n_topics, "n_topics")
+        _validate_prior(topic_gene_prior, "topic_gene_prior", n_input, "n_input")
+
+        self.module = AmortizedLDAPyroModule(
+            n_input=n_input,
+            n_topics=n_topics,
+            n_hidden=n_hidden,
+            cell_topic_prior=cell_topic_prior,
+            topic_gene_prior=topic_gene_prior,
+        )
+
+        self.init_params_ = self._get_init_params(locals())
+
+    def get_gene_by_topic(self, give_mean: bool = True) -> pd.DataFrame:
+        """
+        Gets the gene by topic matrix.
+
+        Parameters
+        ----------
+        give_mean
+            Give mean of distribution if True or sample from it.
+
+        Returns
+        -------
+        A `n_vars x n_topics` Pandas DataFrame containing the gene by topic matrix.
+        """
+        self._check_if_trained(warn=False)
+
+        topic_by_gene = self.module.topic_by_gene(give_mean=give_mean)
+
+        # transpose so that rows are genes and columns are topics.
+        return pd.DataFrame(
+            data=topic_by_gene.numpy().T,
+            index=self.adata.var_names,
+            columns=[f"topic_{i}" for i in range(topic_by_gene.shape[0])],
+        )
+
+    def get_latent_representation(
+        self,
+        adata: Optional[AnnData] = None,
+        indices: Optional[Sequence[int]] = None,
+        batch_size: Optional[int] = None,
+        give_mean: bool = True,
+    ) -> pd.DataFrame:
+        """
+        Converts a count matrix to an inferred topic distribution.
+
+        Parameters
+        ----------
+        adata
+            AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
+            AnnData object used to initialize the model.
+        indices
+            Indices of cells in adata to use. If `None`, all cells are used.
+        batch_size
+            Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
+        give_mean
+            Give mean of distribution or sample from it.
+
+        Returns
+        -------
+        A `n_obs x n_topics` Pandas DataFrame containing the normalized estimate
+        of the topic distribution for each observation. When `indices` is given,
+        only the selected observations are returned.
+        """
+        self._check_if_trained(warn=False)
+        adata = self._validate_anndata(adata)
+
+        dl = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
+
+        transformed_xs = []
+        for tensors in dl:
+            x = tensors[_CONSTANTS.X_KEY]
+            transformed_xs.append(
+                self.module.get_topic_distribution(x, give_mean=give_mean)
+            )
+        transformed_x = torch.cat(transformed_xs).numpy()
+
+        # label rows with the cells that were actually loaded; using all of
+        # `adata.obs_names` mismatches the number of rows when `indices` is a subset.
+        # assumes cells are yielded in `dl.indices` order (no shuffling) -- confirm.
+        obs_names = adata.obs_names if indices is None else adata.obs_names[dl.indices]
+
+        return pd.DataFrame(
+            data=transformed_x,
+            index=obs_names,
+            columns=[f"topic_{i}" for i in range(transformed_x.shape[1])],
+        )
+
+    def get_elbo(
+        self,
+        adata: Optional[AnnData] = None,
+        indices: Optional[Sequence[int]] = None,
+        batch_size: Optional[int] = None,
+    ) -> float:
+        """
+        Return the ELBO for the data.
+
+        The ELBO is a lower bound on the log likelihood of the data used for optimization
+        of VAEs. Note, this is not the negative ELBO, higher is better.
+
+        Parameters
+        ----------
+        adata
+            AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
+            AnnData object used to initialize the model.
+        indices
+            Indices of cells in adata to use. If `None`, all cells are used.
+        batch_size
+            Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
+
+        Returns
+        -------
+        The positive ELBO.
+        """
+        self._check_if_trained(warn=False)
+        adata = self._validate_anndata(adata)
+
+        dl = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
+
+        elbos = []
+        for tensors in dl:
+            x = tensors[_CONSTANTS.X_KEY]
+            library = x.sum(dim=1)
+            elbos.append(self.module.get_elbo(x, library, len(dl.indices)))
+        # average of the per-minibatch values returned by the module.
+        return np.mean(elbos)
+
+    def get_perplexity(
+        self,
+        adata: Optional[AnnData] = None,
+        indices: Optional[Sequence[int]] = None,
+        batch_size: Optional[int] = None,
+    ) -> float:
+        """
+        Computes approximate perplexity for `adata`.
+
+        Perplexity is defined as exp(-1 * log-likelihood per count).
+
+        Parameters
+        ----------
+        adata
+            AnnData object with equivalent structure to initial AnnData. If `None`, defaults to the
+            AnnData object used to initialize the model.
+        indices
+            Indices of cells in adata to use. If `None`, all cells are used.
+        batch_size
+            Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
+
+        Returns
+        -------
+        Perplexity.
+        """
+        self._check_if_trained(warn=False)
+        adata = self._validate_anndata(adata)
+
+        dl = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
+        total_counts = sum(tensors[_CONSTANTS.X_KEY].sum().item() for tensors in dl)
+
+        # NOTE(review): the docstring defines perplexity with a minus sign, but the value
+        # of `get_elbo` (documented as the positive ELBO) is used here without negation --
+        # confirm the sign convention of `module.get_elbo`.
+        return np.exp(
+            self.get_elbo(adata=adata, indices=indices, batch_size=batch_size)
+            / total_counts
+        )
diff --git a/scvi/model/_destvi.py b/scvi/model/_destvi.py
@@ -1,5 +1,4 @@
 import logging
-import warnings
 from collections import OrderedDict
 from typing import Dict, Optional, Sequence, Union
 
@@ -161,10 +160,7 @@ def get_proportions(
         batch_size
             Minibatch size for data loading into model. Only used if amortization. Defaults to `scvi.settings.batch_size`.
         """
-        if self.is_trained_ is False:
-            warnings.warn(
-                "Trying to query inferred values from an untrained model. Please train the model first."
-            )
+        self._check_if_trained()
 
         column_names = self.cell_type_mapping
         index_names = self.adata.obs.index
@@ -216,10 +212,7 @@ def get_gamma(
         return_numpy
             if activated, will return a numpy array of shape is n_spots x n_latent x n_labels.
         """
-        if self.is_trained_ is False:
-            warnings.warn(
-                "Trying to query inferred values from an untrained model. Please train the model first."
-            )
+        self._check_if_trained()
 
         column_names = np.arange(self.module.n_latent)
         index_names = self.adata.obs.index
@@ -276,10 +269,7 @@ def get_scale_for_ct(
         -------
         Pandas dataframe of gene_expression
         """
-        if self.is_trained_ is False:
-            warnings.warn(
-                "Trying to query inferred values from an untrained model. Please train the model first."
-            )
+        self._check_if_trained()
 
         if label not in self.cell_type_mapping:
             raise ValueError("Unknown cell type")

diff --git a/scvi/model/_scanvi.py b/scvi/model/_scanvi.py
@@ -176,8 +176,9 @@ def from_scvi_model(
         scanvi_kwargs
             kwargs for scanVI model
         """
-        if scvi_model.is_trained_ is False:
-            warnings.warn("Passed in scvi model hasn't been trained yet.")
+        scvi_model._check_if_trained(
+            message="Passed in scvi model hasn't been trained yet."
+        )
 
         scanvi_kwargs = dict(scanvi_kwargs)
         init_params = scvi_model.init_params_

diff --git a/scvi/model/_totalvi.py b/scvi/model/_totalvi.py
@@ -295,8 +295,7 @@ def get_latent_library_size(
         batch_size
             Minibatch size for data loading into model. Defaults to `scvi.settings.batch_size`.
         """
-        if not self.is_trained_:
-            raise RuntimeError("Please train the model first.")
+        self._check_if_trained(warn=False)
 
         adata = self._validate_anndata(adata)
         post = self._make_data_loader(