pymc-devs · twiecki · Mar 31, 2023 · Mar 31, 2023 · Mar 31, 2023 · Mar 31, 2023
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
@@ -15,6 +15,7 @@
 
 import hashlib
 import json
+from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Dict, Union
 
@@ -24,22 +25,20 @@
 import pymc as pm
 
 
-class ModelBuilder(pm.Model):
+class ModelBuilder(ABC):
     """
     ModelBuilder can be used to provide an easy-to-use API (similar to scikit-learn) for models
     and help with deployment.
-
-    Extends the pymc.Model class.
     """
 
     _model_type = "BaseClass"
     version = "None"
 
     def __init__(
         self,
-        model_config: Dict,
-        sampler_config: Dict,
-        data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
+        data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]],
+        model_config: Dict = None,
+        sampler_config: Dict = None,
     ):
         """
         Initializes model configuration and sampler configuration for the model
@@ -48,10 +47,10 @@ def __init__(
         ----------
         model_config : Dictionary
             dictionary of parameters that initialise model configuration. Generated by the user defined create_sample_input method.
-        sampler_config : Dictionary
-            dictionary of parameters that initialise sampler configuration. Generated by the user defined create_sample_input method.
         data : Dictionary
             It is the data we need to train the model on.
+        sampler_config : Dictionary
+            dictionary of parameters that initialise sampler configuration. Generated by the user defined create_sample_input method.
         Examples
         --------
         >>> class LinearModel(ModelBuilder):
@@ -60,20 +59,30 @@ def __init__(
         """
 
         super().__init__()
+        if sampler_config is None:
+            sampler_config = {}
+        if model_config is None:
+            model_config = {}
         self.model_config = model_config  # parameters for priors etc.
-        self.sample_config = sampler_config  # parameters for sampling
-        self.idata = None  # inference data object
+        self.sampler_config = sampler_config  # parameters for sampling
         self.data = data
-        self.build()
+        self.idata = (
+            None  # inference data object placeholder, idata is generated during build execution
+        )
 
-    def build(self):
+    def build(self) -> None:
         """
         Builds the defined model.
         """
 
-        with self:
-            self.build_model(self.model_config, self.data)
+        self.build_model(
+            model_instance=self,
+            data=self.data,
+            model_config=self.model_config,
+            sampler_config=self.sampler_config,
+        )
 
+    @abstractmethod
     def _data_setter(
         self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]], x_only: bool = True
     ):
@@ -100,8 +109,9 @@ def _data_setter(
 
         raise NotImplementedError
 
-    @classmethod
-    def create_sample_input(cls):
+    @staticmethod
+    @abstractmethod
+    def create_sample_input():
         """
         Needs to be implemented by the user in the inherited class.
         Returns examples for data, model_config, sampler_config.
@@ -135,7 +145,7 @@ def create_sample_input(cls):
 
         raise NotImplementedError
 
-    def save(self, fname):
+    def save(self, fname: str) -> None:
         """
         Saves inference data of the model.
 
@@ -159,8 +169,9 @@ def save(self, fname):
         self.idata.to_netcdf(file)
 
     @classmethod
-    def load(cls, fname):
+    def load(cls, fname: str):
         """
+        Creates a ModelBuilder instance from a file.
         Loads inference data for the model.
 
         Parameters
@@ -170,7 +181,7 @@ def load(cls, fname):
 
         Returns
         -------
-        Returns the inference data that is loaded from local system.
+        Returns an instance of ModelBuilder.
 
         Raises
         ------
@@ -187,22 +198,25 @@ def load(cls, fname):
 
         filepath = Path(str(fname))
         idata = az.from_netcdf(filepath)
-        self = cls(
-            json.loads(idata.attrs["model_config"]),
-            json.loads(idata.attrs["sampler_config"]),
-            idata.fit_data.to_dataframe(),
+        model_builder = cls(
+            model_config=json.loads(idata.attrs["model_config"]),
+            sampler_config=json.loads(idata.attrs["sampler_config"]),
+            data=idata.fit_data.to_dataframe(),
         )
-        self.idata = idata
-        if self.id != idata.attrs["id"]:
+        model_builder.idata = idata
+        model_builder.build()
+        if model_builder.id != idata.attrs["id"]:
             raise ValueError(
-                f"The file '{fname}' does not contain an inference data of the same model or configuration as '{self._model_type}'"
+                f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'"
             )
 
-        return self
+        return model_builder
 
-    def fit(self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None):
+    def fit(
+        self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None
+    ) -> az.InferenceData:
         """
-        As the name suggests fit can be used to fit a model using the data that is passed as a parameter.
+        Fit a model using the data passed as a parameter.
         Sets attrs to inference data of the model.
 
         Parameter
@@ -225,35 +239,36 @@ def fit(self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None
 
         if data is not None:
             self.data = data
-            self._data_setter(data)
-
-        if self.basic_RVs == []:
             self.build()
+            self._data_setter(data)
 
-        with self:
-            self.idata = pm.sample(**self.sample_config)
+        with self.model:
+            self.idata = pm.sample(**self.sampler_config)
             self.idata.extend(pm.sample_prior_predictive())
             self.idata.extend(pm.sample_posterior_predictive(self.idata))
 
         self.idata.attrs["id"] = self.id
         self.idata.attrs["model_type"] = self._model_type
         self.idata.attrs["version"] = self.version
-        self.idata.attrs["sampler_config"] = json.dumps(self.sample_config)
+        self.idata.attrs["sampler_config"] = json.dumps(self.sampler_config)
         self.idata.attrs["model_config"] = json.dumps(self.model_config)
         self.idata.add_groups(fit_data=self.data.to_xarray())
         return self.idata
 
     def predict(
         self,
         data_prediction: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
-    ):
+        extend_idata: bool = True,
+    ) -> dict:
         """
         Uses model to predict on unseen data and return point prediction of all the samples
 
         Parameters
         ---------
         data_prediction : Dictionary of string and either of numpy array, pandas dataframe or pandas Series
             It is the data we need to make prediction on using the model.
+        extend_idata : bool
+            Whether the predictions should be added to the inference data object. Defaults to True.
 
         Returns
         -------
@@ -275,7 +290,8 @@ def predict(
 
         with self.model:  # sample with new input data
             post_pred = pm.sample_posterior_predictive(self.idata)
-
+            if extend_idata:
+                self.idata.extend(post_pred)
         # reshape output
         post_pred = self._extract_samples(post_pred)
         for key in post_pred:
@@ -286,16 +302,17 @@ def predict(
     def predict_posterior(
         self,
         data_prediction: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
-    ):
+        extend_idata: bool = True,
+    ) -> Dict[str, np.ndarray]:
         """
         Uses model to predict samples on unseen data.
 
         Parameters
         ---------
         data_prediction : Dictionary of string and either of numpy array, pandas dataframe or pandas Series
             It is the data we need to make prediction on using the model.
-        point_estimate : bool
-            Adds point like estimate used as mean passed as
+        extend_idata : bool
+            Whether the predictions should be added to the inference data object. Defaults to True.
 
         Returns
         -------
@@ -317,6 +334,8 @@ def predict_posterior(
 
         with self.model:  # sample with new input data
             post_pred = pm.sample_posterior_predictive(self.idata)
+            if extend_idata:
+                self.idata.extend(post_pred)
 
         # reshape output
         post_pred = self._extract_samples(post_pred)
@@ -357,5 +376,4 @@ def id(self) -> str:
         hasher.update(str(self.model_config.values()).encode())
         hasher.update(self.version.encode())
         hasher.update(self._model_type.encode())
-        # hasher.update(str(self.sample_config.values()).encode())
         return hasher.hexdigest()[:16]