Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Model Builder refactoring #131

Merged
merged 6 commits into from
Mar 31, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 59 additions & 41 deletions pymc_experimental/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import hashlib
import json
from abc import abstractmethod
from pathlib import Path
from typing import Dict, Union

Expand All @@ -24,22 +25,20 @@
import pymc as pm


class ModelBuilder(pm.Model):
class ModelBuilder:
"""
ModelBuilder can be used to provide an easy-to-use API (similar to scikit-learn) for models
and help with deployment.

Extends the pymc.Model class.
"""

_model_type = "BaseClass"
version = "None"

def __init__(
self,
model_config: Dict,
sampler_config: Dict,
data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]],
model_config: Dict = None,
sampler_config: Dict = None,
):
"""
Initializes model configuration and sampler configuration for the model
Expand All @@ -48,10 +47,10 @@ def __init__(
----------
model_config : Dictionary
dictionary of parameters that initialise model configuration. Generated by the user defined create_sample_input method.
sampler_config : Dictionary
dictionary of parameters that initialise sampler configuration. Generated by the user defined create_sample_input method.
data : Dictionary
It is the data we need to train the model on.
sampler_config : Dictionary
dictionary of parameters that initialise sampler configuration. Generated by the user defined create_sample_input method.
Examples
--------
>>> class LinearModel(ModelBuilder):
Expand All @@ -60,20 +59,30 @@ def __init__(
"""

super().__init__()
michaelraczycki marked this conversation as resolved.
Show resolved Hide resolved
if sampler_config is None:
sampler_config = {}
if model_config is None:
model_config = {}
self.model_config = model_config # parameters for priors etc.
self.sample_config = sampler_config # parameters for sampling
self.idata = None # inference data object
self.sampler_config = sampler_config # parameters for sampling
self.data = data
self.build()
self.idata = (
None # inference data object placeholder, idata is generated during build execution
)

def build(self):
def build(self) -> None:
michaelraczycki marked this conversation as resolved.
Show resolved Hide resolved
"""
Builds the defined model.
"""

with self:
self.build_model(self.model_config, self.data)
self.build_model(
model_instance=self,
michaelraczycki marked this conversation as resolved.
Show resolved Hide resolved
data=self.data,
model_config=self.model_config,
sampler_config=self.sampler_config,
)

@abstractmethod
def _data_setter(
self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]], x_only: bool = True
):
Expand All @@ -100,8 +109,9 @@ def _data_setter(

raise NotImplementedError

@classmethod
def create_sample_input(cls):
@staticmethod
@abstractmethod
def create_sample_input():
"""
Needs to be implemented by the user in the inherited class.
Returns examples for data, model_config, sampler_config.
Expand Down Expand Up @@ -135,7 +145,7 @@ def create_sample_input(cls):

raise NotImplementedError

def save(self, fname):
def save(self, fname: str) -> None:
"""
Saves inference data of the model.

Expand All @@ -159,8 +169,9 @@ def save(self, fname):
self.idata.to_netcdf(file)

@classmethod
def load(cls, fname):
def load(cls, fname: str):
"""
Creates a ModelBuilder instance from a file,
Loads inference data for the model.

Parameters
Expand All @@ -170,7 +181,7 @@ def load(cls, fname):

Returns
-------
Returns the inference data that is loaded from local system.
Returns an instance of ModelBuilder.

Raises
------
Expand All @@ -187,22 +198,25 @@ def load(cls, fname):

filepath = Path(str(fname))
idata = az.from_netcdf(filepath)
self = cls(
json.loads(idata.attrs["model_config"]),
json.loads(idata.attrs["sampler_config"]),
idata.fit_data.to_dataframe(),
model_builder = cls(
model_config=json.loads(idata.attrs["model_config"]),
sampler_config=json.loads(idata.attrs["sampler_config"]),
data=idata.fit_data.to_dataframe(),
)
self.idata = idata
if self.id != idata.attrs["id"]:
model_builder.idata = idata
model_builder.build()
if model_builder.id != idata.attrs["id"]:
raise ValueError(
f"The file '{fname}' does not contain an inference data of the same model or configuration as '{self._model_type}'"
f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'"
)

return self
return model_builder

def fit(self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None):
def fit(
self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None
) -> az.InferenceData:
"""
As the name suggests fit can be used to fit a model using the data that is passed as a parameter.
Fit a model using the data passed as a parameter.
Sets attrs to inference data of the model.

Parameters
Expand All @@ -225,35 +239,36 @@ def fit(self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None

if data is not None:
self.data = data
self._data_setter(data)

if self.basic_RVs == []:
self.build()
self._data_setter(data)

with self:
self.idata = pm.sample(**self.sample_config)
with self.model:
self.idata = pm.sample(**self.sampler_config)
self.idata.extend(pm.sample_prior_predictive())
self.idata.extend(pm.sample_posterior_predictive(self.idata))

self.idata.attrs["id"] = self.id
self.idata.attrs["model_type"] = self._model_type
self.idata.attrs["version"] = self.version
self.idata.attrs["sampler_config"] = json.dumps(self.sample_config)
self.idata.attrs["sampler_config"] = json.dumps(self.sampler_config)
self.idata.attrs["model_config"] = json.dumps(self.model_config)
self.idata.add_groups(fit_data=self.data.to_xarray())
return self.idata

def predict(
self,
data_prediction: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
):
extend_idata: bool = True,
) -> dict:
"""
Uses the model to predict on unseen data and returns point predictions for all the samples.

Parameters
---------
data_prediction : Dictionary of string and either of numpy array, pandas dataframe or pandas Series
It is the data we need to make prediction on using the model.
extend_idata : Boolean determining whether the predictions should be added to the inference data object.
Defaults to True.

Returns
-------
Expand All @@ -275,7 +290,8 @@ def predict(

with self.model: # sample with new input data
post_pred = pm.sample_posterior_predictive(self.idata)

if extend_idata:
self.idata.extend(post_pred)
# reshape output
post_pred = self._extract_samples(post_pred)
for key in post_pred:
Expand All @@ -286,16 +302,17 @@ def predict(
def predict_posterior(
self,
data_prediction: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
):
extend_idata: bool = True,
) -> Dict[str, np.array]:
"""
Uses the model to predict samples on unseen data.

Parameters
---------
data_prediction : Dictionary of string and either of numpy array, pandas dataframe or pandas Series
It is the data we need to make prediction on using the model.
point_estimate : bool
Adds point like estimate used as mean passed as
extend_idata : Boolean determining whether the predictions should be added to the inference data object.
Defaults to True.

Returns
-------
Expand All @@ -317,6 +334,8 @@ def predict_posterior(

with self.model: # sample with new input data
post_pred = pm.sample_posterior_predictive(self.idata)
if extend_idata:
self.idata.extend(post_pred)

# reshape output
post_pred = self._extract_samples(post_pred)
Expand Down Expand Up @@ -357,5 +376,4 @@ def id(self) -> str:
hasher.update(str(self.model_config.values()).encode())
hasher.update(self.version.encode())
hasher.update(self._model_type.encode())
# hasher.update(str(self.sample_config.values()).encode())
return hasher.hexdigest()[:16]
Loading