From c55dbc7ff5632df5f5ca4972c7296c98282ce445 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Mon, 7 Jul 2025 08:23:47 +0100
Subject: [PATCH 1/3] Add GaussianMLP

This reverts commit 62d8fdf38eaae595cdaa593bab165ad303cd2734.
---
 autoemulate/emulators/nn/mlp.py | 125 +++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 3 deletions(-)

diff --git a/autoemulate/emulators/nn/mlp.py b/autoemulate/emulators/nn/mlp.py
index a1fa9d5fe..608833524 100644
--- a/autoemulate/emulators/nn/mlp.py
+++ b/autoemulate/emulators/nn/mlp.py
@@ -1,11 +1,13 @@
-from torch import nn
+import torch
+from torch import nn, optim
 
 from autoemulate.core.device import TorchDeviceMixin
-from autoemulate.core.types import DeviceLike, TensorLike
+from autoemulate.core.types import DeviceLike, GaussianLike, TensorLike
 from autoemulate.data.utils import set_random_seed
 from autoemulate.transforms.standardize import StandardizeTransform
+from autoemulate.transforms.utils import make_positive_definite
 
-from ..base import DropoutTorchBackend
+from ..base import DropoutTorchBackend, GaussianEmulator
 
 
 class MLP(DropoutTorchBackend):
@@ -145,3 +147,120 @@ def get_tune_params():
         "scheduler_cls": scheduler_params["scheduler_cls"],
         "scheduler_kwargs": scheduler_params["scheduler_kwargs"],
     }
+
+
+class GaussianMLP(DropoutTorchBackend, GaussianEmulator):
+    """Multi-Layer Perceptron (MLP) emulator with Gaussian outputs."""
+
+    def __init__(
+        self,
+        x,
+        y,
+        activation_cls=nn.ReLU,
+        loss_fn_cls=nn.MSELoss,
+        optimizer_cls=optim.Adam,
+        epochs=100,
+        layer_dims=None,
+        weight_init="default",
+        scale=1,
+        bias_init="default",
+        dropout_prob=None,
+        lr=1e-2,
+        random_seed=None,
+        device=None,
+        **kwargs,
+    ):
+        TorchDeviceMixin.__init__(self, device=device)
+        nn.Module.__init__(self)
+
+        if random_seed is not None:
+            set_random_seed(seed=random_seed)
+
+        # Ensure x and y are tensors with correct dimensions
+        x, y = self._convert_to_tensors(x, y)
+
+        # Construct the MLP layers
+        # Total params required for last layer: mean + tril covariance
+        num_params = y.shape[1] + (y.shape[1] * (y.shape[1] + 1)) // 2
+        layer_dims = (
+            [x.shape[1], *layer_dims]
+            if layer_dims
+            else [x.shape[1], 4 * num_params, 2 * num_params]
+        )
+        layers = []
+        for idx, dim in enumerate(layer_dims[1:]):
+            layers.append(nn.Linear(layer_dims[idx], dim, device=self.device))
+            layers.append(activation_cls())
+            if dropout_prob is not None:
+                layers.append(nn.Dropout(p=dropout_prob))
+
+        # Add final layer without activation
+        layers.append(nn.Linear(layer_dims[-1], num_params, device=self.device))
+        self.nn = nn.Sequential(*layers)
+
+        # Finalize initialization
+        self._initialize_weights(weight_init, scale, bias_init)
+        self.epochs = epochs
+        self.loss_fn = loss_fn_cls()
+        self.num_tasks = y.shape[1]
+        self.optimizer = optimizer_cls(self.nn.parameters(), lr=lr)  # type: ignore[call-arg] since all optimizers include lr
+        self.to(device)
+
+    def _predict(self, x, with_grad=False):
+        """Predict using the MLP model."""
+        with torch.set_grad_enabled(with_grad):
+            self.nn.eval()
+            return self(x)
+
+    def forward(self, x):
+        """Forward pass for the Gaussian MLP."""
+        y = self.nn(x)
+        mean = y[..., : self.num_tasks]
+
+        # Use Cholesky decomposition to guarantee PSD covariance matrix
+        num_chol_params = (self.num_tasks * (self.num_tasks + 1)) // 2
+        chol_params = y[..., self.num_tasks : self.num_tasks + num_chol_params]
+
+        # Assign params to matrix
+        scale_tril = torch.zeros(
+            *y.shape[:-1], self.num_tasks, self.num_tasks, device=y.device
+        )
+        tril_indices = torch.tril_indices(
+            self.num_tasks, self.num_tasks, device=y.device
+        )
+        scale_tril[..., tril_indices[0], tril_indices[1]] = chol_params
+
+        # Ensure positive variance
+        diag_idxs = torch.arange(self.num_tasks)
+        diag = (
+            torch.nn.functional.softplus(scale_tril[..., diag_idxs, diag_idxs]) + 1e-6
+        )
+        scale_tril[..., diag_idxs, diag_idxs] = diag
+
+        covariance_matrix = scale_tril @ scale_tril.transpose(-1, -2)
+
+        # TODO: for large covariance matrices, numerical instability remains
+        return GaussianLike(mean, make_positive_definite(covariance_matrix))
+
+    def loss_func(self, y_pred, y_true):
+        """Negative log likelihood loss function."""
+        return -y_pred.log_prob(y_true).mean()
+
+    @staticmethod
+    def is_multioutput() -> bool:
+        """GaussianMLP supports multi-output."""
+        return True
+
+    @staticmethod
+    def get_tune_config():
+        """Return a dictionary of hyperparameters to tune."""
+        return {
+            "epochs": [50, 100, 200],
+            "layer_dims": [[64, 64], [128, 128]],
+            "lr": [1e-1, 1e-2, 1e-3],
+            "batch_size": [16, 32],
+            "weight_init": ["default", "normal"],
+            "scale": [0.1, 1.0],
+            "bias_init": ["default", "zeros"],
+            "dropout_prob": [0.3, 0.5, None],
+        }
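
The heart of this patch is the parameterisation in GaussianMLP.forward: the final linear layer emits the predictive mean plus the n(n+1)/2 entries of a lower-triangular factor L, softplus keeps the diagonal of L strictly positive, and the covariance is recovered as L @ L.T. A minimal standalone sketch of that construction, with made-up sizes and torch.distributions.MultivariateNormal standing in for autoemulate's GaussianLike (illustrative only, not code from the patch):

import torch
from torch.distributions import MultivariateNormal

num_tasks, batch = 3, 8
num_chol_params = num_tasks * (num_tasks + 1) // 2  # 6 free entries for a 3x3 lower triangle

# stand-in for the output of the final linear layer: mean followed by tril entries
raw = torch.randn(batch, num_tasks + num_chol_params)
mean, chol_params = raw[..., :num_tasks], raw[..., num_tasks:]

# scatter the flat parameters into a lower-triangular matrix
scale_tril = torch.zeros(batch, num_tasks, num_tasks)
tril = torch.tril_indices(num_tasks, num_tasks)
scale_tril[..., tril[0], tril[1]] = chol_params

# softplus the diagonal so L has strictly positive diagonal entries,
# which makes L @ L.T symmetric positive definite
diag = torch.arange(num_tasks)
scale_tril[..., diag, diag] = torch.nn.functional.softplus(scale_tril[..., diag, diag]) + 1e-6

dist = MultivariateNormal(loc=mean, scale_tril=scale_tril)
nll = -dist.log_prob(torch.randn(batch, num_tasks)).mean()  # the loss GaussianMLP.loss_func computes

Because MultivariateNormal accepts scale_tril directly, this sketch skips the explicit L @ L.T product and the make_positive_definite repair step that the patch applies before building GaussianLike.
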
From 604b32714a99d81ff71de1d3b9d3ee4302c95c61 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 10 Jul 2025 17:56:56 +0100
Subject: [PATCH 2/3] Add scheduler

---
 autoemulate/emulators/nn/mlp.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/autoemulate/emulators/nn/mlp.py b/autoemulate/emulators/nn/mlp.py
index 608833524..99a125634 100644
--- a/autoemulate/emulators/nn/mlp.py
+++ b/autoemulate/emulators/nn/mlp.py
@@ -159,6 +159,8 @@ def __init__(
         activation_cls=nn.ReLU,
         loss_fn_cls=nn.MSELoss,
         optimizer_cls=optim.Adam,
+        scheduler_cls=None,
+        scheduler_kwargs=None,
         epochs=100,
         layer_dims=None,
         weight_init="default",
@@ -203,7 +205,9 @@ def __init__(
         self.epochs = epochs
         self.loss_fn = loss_fn_cls()
         self.num_tasks = y.shape[1]
-        self.optimizer = optimizer_cls(self.nn.parameters(), lr=lr)  # type: ignore[call-arg] since all optimizers include lr
+        self.optimizer = optimizer_cls(self.nn.parameters(), lr=lr)
+        self.scheduler_cls = scheduler_cls
+        self.scheduler_setup(scheduler_kwargs)
         self.to(device)
 
     def _predict(self, x, with_grad=False):
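
The new scheduler_cls and scheduler_kwargs arguments are passed straight through to scheduler_setup, which lives on the torch backend base class and is not shown in this diff. Presumably they follow the torch.optim.lr_scheduler convention already exposed by MLP.get_tune_params; a generic sketch of that optimizer/scheduler pairing (illustrative only, plain PyTorch, not autoemulate code):

import torch
from torch import nn, optim

model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

for epoch in range(5):
    optimizer.zero_grad()
    loss = model(torch.randn(16, 4)).pow(2).mean()
    loss.backward()
    optimizer.step()
    scheduler.step()  # decay the learning rate once per epoch
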
"scheduler_kwargs": scheduler_params["scheduler_kwargs"], } - - -class GaussianMLP(DropoutTorchBackend, GaussianEmulator): - """Multi-Layer Perceptron (MLP) emulator with Gaussian outputs.""" - - def __init__( - self, - x, - y, - activation_cls=nn.ReLU, - loss_fn_cls=nn.MSELoss, - optimizer_cls=optim.Adam, - scheduler_cls=None, - scheduler_kwargs=None, - epochs=100, - layer_dims=None, - weight_init="default", - scale=1, - bias_init="default", - dropout_prob=None, - lr=1e-2, - random_seed=None, - device=None, - **kwargs, - ): - TorchDeviceMixin.__init__(self, device=device) - nn.Module.__init__(self) - - if random_seed is not None: - set_random_seed(seed=random_seed) - - # Ensure x and y are tensors with correct dimensions - x, y = self._convert_to_tensors(x, y) - - # Construct the MLP layers - # Total params required for last layer: mean + tril covariance - num_params = y.shape[1] + (y.shape[1] * (y.shape[1] + 1)) // 2 - layer_dims = ( - [x.shape[1], *layer_dims] - if layer_dims - else [x.shape[1], 4 * num_params, 2 * num_params] - ) - layers = [] - for idx, dim in enumerate(layer_dims[1:]): - layers.append(nn.Linear(layer_dims[idx], dim, device=self.device)) - layers.append(activation_cls()) - if dropout_prob is not None: - layers.append(nn.Dropout(p=dropout_prob)) - - # Add final layer without activation - layers.append(nn.Linear(layer_dims[-1], num_params, device=self.device)) - self.nn = nn.Sequential(*layers) - - # Finalize initialization - self._initialize_weights(weight_init, scale, bias_init) - self.epochs = epochs - self.loss_fn = loss_fn_cls() - self.num_tasks = y.shape[1] - self.optimizer = optimizer_cls(self.nn.parameters(), lr=lr) - self.scheduler_cls = scheduler_cls - self.scheduler_setup(scheduler_kwargs) - self.to(device) - - def _predict(self, x, with_grad=False): - """Predict using the MLP model.""" - with torch.set_grad_enabled(with_grad): - self.nn.eval() - return self(x) - - def forward(self, x): - """Forward pass for the Gaussian MLP.""" - y = self.nn(x) - mean = y[..., : self.num_tasks] - - # Use Cholesky decomposition to guarantee PSD covariance matrix - num_chol_params = (self.num_tasks * (self.num_tasks + 1)) // 2 - chol_params = y[..., self.num_tasks : self.num_tasks + num_chol_params] - - # Assign params to matrix - scale_tril = torch.zeros( - *y.shape[:-1], self.num_tasks, self.num_tasks, device=y.device - ) - tril_indices = torch.tril_indices( - self.num_tasks, self.num_tasks, device=y.device - ) - scale_tril[..., tril_indices[0], tril_indices[1]] = chol_params - - # Ensure positive variance - diag_idxs = torch.arange(self.num_tasks) - diag = ( - torch.nn.functional.softplus(scale_tril[..., diag_idxs, diag_idxs]) + 1e-6 - ) - scale_tril[..., diag_idxs, diag_idxs] = diag - - covariance_matrix = scale_tril @ scale_tril.transpose(-1, -2) - - # TODO: for large covariance martrices, numerical instability remains - return GaussianLike(mean, make_positive_definite(covariance_matrix)) - - def loss_func(self, y_pred, y_true): - """Negative log likelihood loss function.""" - return -y_pred.log_prob(y_true).mean() - - @staticmethod - def is_multioutput() -> bool: - """GaussianMLP supports multi-output.""" - return True - - @staticmethod - def get_tune_config(): - """Return a dictionary of hyperparameters to tune.""" - return { - "epochs": [50, 100, 200], - "layer_dims": [[64, 64], [128, 128]], - "lr": [1e-1, 1e-2, 1e-3], - "batch_size": [16, 32], - "weight_init": ["default", "normal"], - "scale": [0.1, 1.0], - "bias_init": ["default", "zeros"], - "dropout_prob": [0.3, 
diff --git a/autoemulate/experimental/emulators/nn/mlp_gaussian.py b/autoemulate/experimental/emulators/nn/mlp_gaussian.py
new file mode 100644
index 000000000..6c9e55fa0
--- /dev/null
+++ b/autoemulate/experimental/emulators/nn/mlp_gaussian.py
@@ -0,0 +1,114 @@
+import torch
+from autoemulate.core.device import TorchDeviceMixin
+from autoemulate.core.types import DeviceLike, GaussianLike, TensorLike
+from autoemulate.data.utils import set_random_seed
+from autoemulate.emulators.base import GaussianEmulator
+from autoemulate.emulators.nn.mlp import MLP
+from autoemulate.transforms.standardize import StandardizeTransform
+from autoemulate.transforms.utils import make_positive_definite
+from torch import nn
+
+
+class GaussianMLP(MLP, GaussianEmulator):
+    """Multi-Layer Perceptron (MLP) emulator with Gaussian outputs."""
+
+    def __init__(
+        self,
+        x: TensorLike,
+        y: TensorLike,
+        standardize_x: bool = True,
+        standardize_y: bool = True,
+        activation_cls: type[nn.Module] = nn.ReLU,
+        loss_fn_cls: type[nn.Module] = nn.MSELoss,
+        epochs: int = 100,
+        batch_size: int = 16,
+        layer_dims: list[int] | None = None,
+        weight_init: str = "default",
+        scale: float = 1.0,
+        bias_init: str = "default",
+        dropout_prob: float | None = None,
+        lr: float = 1e-2,
+        random_seed: int | None = None,
+        device: DeviceLike | None = None,
+        **scheduler_kwargs,
+    ):
+        TorchDeviceMixin.__init__(self, device=device)
+        nn.Module.__init__(self)
+
+        if random_seed is not None:
+            set_random_seed(seed=random_seed)
+
+        # Ensure x and y are tensors with correct dimensions
+        x, y = self._convert_to_tensors(x, y)
+
+        # Construct the MLP layers
+        # Total params required for last layer: mean + tril covariance
+        num_params = y.shape[1] + (y.shape[1] * (y.shape[1] + 1)) // 2
+        layer_dims = (
+            [x.shape[1], *layer_dims]
+            if layer_dims
+            else [x.shape[1], 4 * num_params, 2 * num_params]
+        )
+        layers = []
+        for idx, dim in enumerate(layer_dims[1:]):
+            layers.append(nn.Linear(layer_dims[idx], dim, device=self.device))
+            layers.append(activation_cls())
+            if dropout_prob is not None:
+                layers.append(nn.Dropout(p=dropout_prob))
+
+        # Add final layer without activation
+        layers.append(nn.Linear(layer_dims[-1], num_params, device=self.device))
+        self.nn = nn.Sequential(*layers)
+
+        # Finalize initialization
+        self._initialize_weights(weight_init, scale, bias_init)
+        self.x_transform = StandardizeTransform() if standardize_x else None
+        self.y_transform = StandardizeTransform() if standardize_y else None
+        self.epochs = epochs
+        self.loss_fn = loss_fn_cls()
+        self.lr = lr
+        self.num_tasks = y.shape[1]
+        self.batch_size = batch_size
+        self.optimizer = self.optimizer_cls(self.nn.parameters(), lr=lr)  # type: ignore  # noqa: PGH003
+        self.scheduler_setup(scheduler_kwargs)
+        self.to(device)
+
+    def _predict(self, x, with_grad=False):
+        """Predict using the MLP model."""
+        with torch.set_grad_enabled(with_grad):
+            self.nn.eval()
+            return self(x)
+
+    def forward(self, x):
+        """Forward pass for the Gaussian MLP."""
+        y = self.nn(x)
+        mean = y[..., : self.num_tasks]
+
+        # Use Cholesky decomposition to guarantee PSD covariance matrix
+        num_chol_params = (self.num_tasks * (self.num_tasks + 1)) // 2
+        chol_params = y[..., self.num_tasks : self.num_tasks + num_chol_params]
+
+        # Assign params to matrix
+        scale_tril = torch.zeros(
+            *y.shape[:-1], self.num_tasks, self.num_tasks, device=y.device
+        )
+        tril_indices = torch.tril_indices(
+            self.num_tasks, self.num_tasks, device=y.device
+        )
+        scale_tril[..., tril_indices[0], tril_indices[1]] = chol_params
+
+        # Ensure positive variance
+        diag_idxs = torch.arange(self.num_tasks)
+        diag = (
+            torch.nn.functional.softplus(scale_tril[..., diag_idxs, diag_idxs]) + 1e-6
+        )
+        scale_tril[..., diag_idxs, diag_idxs] = diag
+
+        covariance_matrix = scale_tril @ scale_tril.transpose(-1, -2)
+
+        # TODO: for large covariance matrices, numerical instability remains
+        return GaussianLike(mean, make_positive_definite(covariance_matrix))
+
+    def loss_func(self, y_pred, y_true):
+        """Negative log likelihood loss function."""
+        return -y_pred.log_prob(y_true).mean()
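
A hypothetical end-to-end sketch of the relocated emulator, using only the constructor, forward and loss_func shown in the new module; training and the public fit/predict entry points come from autoemulate's backend base classes and are assumed rather than shown here:

import torch
from autoemulate.experimental.emulators.nn.mlp_gaussian import GaussianMLP

x = torch.randn(200, 5)   # 200 samples, 5 inputs
y = torch.randn(200, 2)   # 2 jointly modelled outputs

emulator = GaussianMLP(x, y, layer_dims=[64, 64], random_seed=0)

dist = emulator(x[:10])                  # forward() returns a GaussianLike distribution
nll = emulator.loss_func(dist, y[:10])   # negative log likelihood used during training
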