
ModelBuilder docs out of date #246

Closed
theorashid opened this issue Oct 3, 2023 · 11 comments · Fixed by pymc-devs/pymc-examples#582
theorashid (Contributor) commented Oct 3, 2023

Copying straight from the example (maybe the docs are just out of date; if so, how do I fix them?)

from typing import Union, Dict
import numpy as np
import pandas as pd
import pymc as pm
from pymc_experimental.model_builder import ModelBuilder


class LinearModel(ModelBuilder):
    # Give the model a name
    _model_type = "LinearModel"

    # And a version
    version = "0.1"

    def build_model(self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray], **kwargs):
        """
        build_model creates the PyMC model.

        Parameters:
        X: pd.DataFrame
            Input features for the model (this example expects an "input" column).
        y: Union[pd.Series, np.ndarray]
            Target data we want our model fit on.
        """
        # Check the type of X and y and adjust access accordingly
        X_values = X["input"].values
        y_values = y.values if isinstance(y, pd.Series) else y
        self._generate_and_preprocess_model_data(X_values, y_values)

        with pm.Model(coords=self.model_coords) as self.model:

            # Create mutable data containers
            x_data = pm.MutableData("x_data", X_values)
            y_data = pm.MutableData("y_data", y_values)

            # prior parameters
            a_mu_prior = self.model_config.get("a_mu_prior", 0.0)
            a_sigma_prior = self.model_config.get("a_sigma_prior", 1.0)
            b_mu_prior = self.model_config.get("b_mu_prior", 0.0)
            b_sigma_prior = self.model_config.get("b_sigma_prior", 1.0)
            eps_prior = self.model_config.get("eps_prior", 1.0)

            # priors
            a = pm.Normal("a", mu=a_mu_prior, sigma=a_sigma_prior)
            b = pm.Normal("b", mu=b_mu_prior, sigma=b_sigma_prior)
            eps = pm.HalfNormal("eps", eps_prior)

            obs = pm.Normal("y", mu=a + b * x_data, sigma=eps, shape=x_data.shape, observed=y_data)

    def _data_setter(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray] = None
    ):
        if isinstance(X, pd.DataFrame):
            x_values = X["input"].values
        else:
            # Assuming "input" is the first column
            x_values = X[:, 0]

        with self.model:
            pm.set_data({"x_data": x_values})
            if y is not None:
                pm.set_data({"y_data": y.values if isinstance(y, pd.Series) else y})

    @property
    def default_model_config(self) -> Dict:
        """
        default_model_config is a property that returns a dictionary with all the prior values we want to build the model with.
        It supports more complex data structures like lists, dictionaries, etc.
        It will be passed to the class instance on initialization, in case the user doesn't provide any model_config of their own.
        """
        model_config: Dict = {
            "a_mu_prior": 0.0,
            "a_sigma_prior": 1.0,
            "b_mu_prior": 0.0,
            "b_sigma_prior": 1.0,
            "eps_prior": 1.0,
        }
        return model_config

    @property
    def default_sampler_config(self) -> Dict:
        """
        default_sampler_config is a property that returns a dictionary with the most important sampler parameters.
        It will be used in case the user doesn't provide any sampler_config of their own.
        """
        sampler_config: Dict = {
            "draws": 1_000,
            "tune": 1_000,
            "chains": 3,
            "target_accept": 0.95,
        }
        return sampler_config

    @property
    def output_var(self):
        return "y"

    @property
    def _serializable_model_config(self) -> Dict[str, Union[int, float, Dict]]:
        """
        _serializable_model_config is a property that returns a dictionary with all the model parameters that we want to save.
        As some of the data structures are not JSON serializable, we need to convert them to JSON serializable objects.
        Some models will need this; others can just return the model_config as-is.
        """
        return self.model_config

    def _save_input_params(self, idata) -> None:
        """
        Saves any additional model parameters (other than the dataset) to the idata object.

        These parameters are stored within `idata.attrs` using keys that correspond to the parameter names.
        If you don't need to store any extra parameters, you can leave this method unimplemented.

        Example:
            For saving customer IDs provided as a 'customer_ids' input to the model:
            self.customer_ids = customer_ids.values  # This is done outside of this method, preferably when initializing the model object.
            idata.attrs["customer_ids"] = json.dumps(self.customer_ids.tolist())  # Convert numpy array to a JSON-serializable list.
        """
        pass

    def _generate_and_preprocess_model_data(
        self, X: Union[pd.DataFrame, pd.Series], y: Union[pd.Series, np.ndarray]
    ) -> None:
        """
        Depending on the model, we might need to preprocess the data before fitting it.
        All required preprocessing and conditional assignments should be defined here.
        """
        # In our case we're not using coords, but if we were, we would define them here
        # (or later in this method, if extracting them from the data).
        self.model_coords = None
        # As we don't do any preprocessing, we just assign the data given by the user.
        # Note that this is a very basic model; usually we would need to do some
        # preprocessing, or generate the coords from the data.
        self.X = X
        self.y = y

Then running

LinearModel()

  52 def __init__(
     53     self,
     54     model_config: Dict = None,
     55     sampler_config: Dict = None,
     56 ):
     57     """
     58     Initializes model configuration and sampler configuration for the model
     59 
   (...)
     72     >>> model = MyModel(model_config, sampler_config)
     73     """
     74     sampler_config = (
---> 75         self.get_default_sampler_config() if sampler_config is None else sampler_config
     76     )
     77     self.sampler_config = sampler_config
     78     model_config = self.get_default_model_config() if model_config is None else model_config

TypeError: ModelBuilder.get_default_sampler_config() missing 1 required positional argument: 'self'

It runs fine if I pass in some dictionaries:

LinearModel(model_config={}, sampler_config={})
<__main__.LinearModel at 0x16110ccd0>

I tried to edit the class with

+ def get_default_sampler_config(self) -> Dict:
-  def default_sampler_config(self) -> Dict:
+ def get_default_model_config(self) -> Dict:
-  def default_model_config(self) -> Dict:

but that just changed the error to

     52 def __init__(
     53     self,
     54     model_config: Dict = None,
     55     sampler_config: Dict = None,
     56 ):
     57     """
     58     Initializes model configuration and sampler configuration for the model
     59 
   (...)
     72     >>> model = MyModel(model_config, sampler_config)
     73     """
     74     sampler_config = (
---> 75         self.get_default_sampler_config() if sampler_config is None else sampler_config
     76     )
     77     self.sampler_config = sampler_config
     78     model_config = self.get_default_model_config() if model_config is None else model_config

TypeError: 'dict' object is not callable

Any ideas? Happy to do a fix PR if it's quick and easy

Also, if the docs are old, then looking at the PR history (with the merging of the BayesianEstimator and ModelBuilder classes), it would be great to have an example of a pymc model in a pipeline. For the project I'm working on, I currently have a pipeline inside _generate_and_preprocess_model_data(), but it would be cool to have that outside the model.

pdb5627 (Contributor) commented Oct 4, 2023

Those methods changed names because they are now static methods instead of properties. See PR #235:

+ @staticmethod
+ def get_default_sampler_config() -> Dict:
-  @property
-  def default_sampler_config(self) -> Dict:
+ @staticmethod
+ def get_default_model_config() -> Dict:
-  @property
-  def default_model_config(self) -> Dict:
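To see why the rename matters in isolation, here is a minimal self-contained sketch (not the real ModelBuilder; the class name and config values are placeholders) mimicking the base-class __init__ shown in the traceback, which calls self.get_default_sampler_config() and so requires the subclass to provide static methods under those exact names:

```python
from typing import Dict


class MyModel:
    # Placeholder mimicking ModelBuilder.__init__ after PR #235: the base
    # class calls get_default_*_config(), so subclasses must define them
    # as static methods (not properties).
    def __init__(self, model_config: Dict = None, sampler_config: Dict = None):
        self.sampler_config = (
            self.get_default_sampler_config() if sampler_config is None else sampler_config
        )
        self.model_config = (
            self.get_default_model_config() if model_config is None else model_config
        )

    @staticmethod
    def get_default_model_config() -> Dict:
        # Illustrative values only
        return {"a_mu_prior": 0.0, "a_sigma_prior": 1.0}

    @staticmethod
    def get_default_sampler_config() -> Dict:
        return {"draws": 1_000, "tune": 1_000, "chains": 3}


model = MyModel()  # no TypeError: the defaults are picked up
```

With the old property-based definitions, the base class would either fail to find the method or try to call the returned dict, which is exactly the pair of TypeErrors in the report above.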

There's a test for the experimental LinearModel class that uses a scikit-learn pipeline. Adapting it to be a standalone script:

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from pymc_experimental.linearmodel import LinearModel

set_config(transform_output="pandas")

toy_X = pd.DataFrame({"input": np.linspace(start=0, stop=1, num=100)})
y = 5 * toy_X["input"] + 3
y = y + np.random.normal(0, 1, size=len(toy_X))
toy_y = pd.Series(y, name="output")

model_config = {
    "intercept": {"loc": 0, "scale": 2},
    "slope": {"loc": 0, "scale": 2},
    "obs_error": 1,
    "default_output_var": "y_hat",
}
model = Pipeline(
    [
        ("input_scaling", StandardScaler()),
        (
            "linear_model",
            TransformedTargetRegressor(LinearModel(model_config), transformer=StandardScaler()),
        ),
    ]
)
model.fit(toy_X, toy_y)

X_pred = pd.DataFrame({"input": np.random.uniform(low=0, high=1, size=100)})
model.predict(X_pred)

If you want to get the posterior predictive samples transformed rather than just the expected value of the posterior prediction, then you need to extend TransformedTargetRegressor. I use this:

import sklearn.compose

class TransformedTargetPYMCRegressor(sklearn.compose.TransformedTargetRegressor):
    """Add predict_posterior to sklearn.compose.TransformedTargetRegressor"""

    def predict_posterior(self, X, **predict_params):
        """Predict using the base regressor, applying inverse.
        The regressor is used to predict and the `inverse_func` or
        `inverse_transform` is applied before returning the prediction.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.
        **predict_params : dict of str -> object
            Parameters passed to the `predict_posterior` method of the underlying
            regressor.
        Returns
        -------
        y_hat : ndarray of shape (n_samples,)
            Predicted values.
        """
        # check_is_fitted(self)
        pred = self.regressor_.predict_posterior(X, **predict_params)
        # TODO: This only works if the output is reshaped to 2D. If draws & chains are separate dimensions, will fail.
        if pred.ndim == 1:
            pred_trans = self.transformer_.inverse_transform(pred.reshape(-1, 1))
        else:
            pred_trans = self.transformer_.inverse_transform(pred)
        if self._training_dim == 1 and pred_trans.ndim == 2 and pred_trans.shape[1] == 1:
            pred_trans = pred_trans.squeeze(axis=1)

        return pred_trans

    def predict_proba(self, X, **predict_params):
        return self.predict_posterior(X, **predict_params)

Then you can use model.predict_proba(X) to get the predictive samples.

twiecki (Member) commented Oct 4, 2023

Can someone do a PR that fixes the docs?

twiecki changed the title from "ModelBuilder does not work out the box" to "ModelBuilder docs out of date" on Oct 4, 2023
pdb5627 (Contributor) commented Oct 5, 2023

@twiecki I can prepare a PR. This will be my first contribution to the docs. I guess I just need to follow the instructions in Contributing.md.

twiecki (Member) commented Oct 5, 2023

@pdb5627 Yes, exactly. Let me know if you have any questions.

theorashid (Contributor, Author) commented Oct 5, 2023

The ModelBuilder class was actually not as flexible as I'd hoped. I can't think of use cases outside of linear regression or BART where I just feed X and y into a model.

It would be good to see either:

  1. A more general class where I can feed several covariates into the model (or an example of how to use this class to do that)
  2. An example of a production-ready pymc model using an object-oriented approach rather than a notebook, although I appreciate this goes against the Bayesian workflow of model checking and tweaking, and accessing a model in a pipeline as pipe["model"]._regressor.idata is not ideal. @twiecki, there must be something from pymc-labs here? In fact, a pymc-labs blog post would be great. Or equally, if you know of repos where they do this, that would be great too.

twiecki (Member) commented Oct 5, 2023

Technically, you don't have to provide a y (it can just default to None) and you can pass everything into X.

  • A more general class where I can feed several covariates into the model (or an example of how to use this class to do that)

That's the idea of X: you can pass in whatever covariates you like.
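For example (a hypothetical sketch; the column names are invented for illustration), several covariates can be packed into one X DataFrame and unpacked inside a build_model override:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# Hypothetical multi-covariate design matrix; column names are invented.
X = pd.DataFrame(
    {
        "age": rng.normal(40.0, 10.0, size=100),
        "income": rng.normal(50_000.0, 10_000.0, size=100),
        "region": rng.integers(0, 4, size=100),
    }
)

# Inside a build_model(X, y) override you would pull out whatever you need:
age = X["age"].to_numpy()
region_idx = X["region"].to_numpy()  # e.g. an index into a hierarchical prior
```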

  • An example of a production-ready pymc model using object-oriented approach rather than a notebook, although I appreciate this goes against the Bayesian workflow of model checking and tweaking, and accessing a model in a pipeline as pipe["model"]._regressor.idata is not ideal. @twiecki, there must be something from pymc-labs here? In fact, a pymc-labs blog post would be just fine

Yeah, we want to write one but progress on this front has been slow. Do let me know if this is something you'd like to collaborate on, definitely looking for partners who can help with this.

theorashid (Contributor, Author) commented Oct 5, 2023

Technically, you don't have to provide a y (it can be just default None) and pass everything into X.
That's the idea of X where you can pass whatever covariates.

True, but what if X isn't square and there's some complex dim structure? I guess you could pad it with NaN, but that seems overly complex.

Yeah, we want to write one but progress on this front has been slow. Do let me know if this is something you'd like to collaborate on, definitely looking for partners who can help with this.

I'm going to try and use PyMC for my next project, if I can port their existing code from numpyro easily. I'll try and do it using some style of ModelBuilder class and I'll keep you in the loop in case it's something we can collaborate on, generalise and eventually write up/contribute.

twiecki (Member) commented Oct 5, 2023

True, but what if X isn't square and there's some complex dim structure? I guess you could pad it with NaN, but that seems overly complex.

Perhaps we should still keep the original API and have the sklearn one in an inherited class like originally proposed...

theorashid (Contributor, Author) commented Oct 5, 2023

Oh yeah, if there was originally a class which just converted pymc models into classes with all the configs attached and a generalised .fit(), that would be great. I'm sure loads of users would like that.

twiecki (Member) commented Oct 5, 2023

@theorashid Yeah, that's what we had, and you'd input a dict, so no shape problems.
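To make the shape point concrete, here is a hedged sketch (the keys are invented, and this is not the actual original API) of a dict-based interface where each input carries its own shape, so nothing needs NaN padding:

```python
import numpy as np

# Hypothetical dict-based data interface: each entry can have its own
# shape, so non-rectangular inputs need no NaN padding.
data = {
    "x": np.linspace(0.0, 1.0, 100),        # 100 observations
    "group_idx": np.repeat([0, 1, 2], 20),  # 60 observations, different length
    "group_means": np.zeros(3),             # per-group parameter, length 3
}
```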

theorashid (Contributor, Author) commented

I don't know the original, but it would be good to allow xr.Dataset inputs as well as np/pd. This will help with complex coords and keeping track of names.
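A sketch of what an xarray-based input could look like (hypothetical; assumes xarray is installed, and the variable, dimension, and coordinate names are invented): named dims and coords travel with the data, so a model builder could read them directly instead of being told them separately.

```python
import numpy as np
import xarray as xr

# Hypothetical dataset with named dimensions and coordinate labels.
ds = xr.Dataset(
    data_vars={"y": (("time", "region"), np.zeros((12, 3)))},
    coords={"time": np.arange(12), "region": ["north", "south", "east"]},
)
# The object itself knows its dimension sizes and labels,
# e.g. ds.sizes and ds.coords["region"].
```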
