From c55dbc7ff5632df5f5ca4972c7296c98282ce445 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Mon, 7 Jul 2025 08:23:47 +0100
Subject: [PATCH 1/3] Add GaussianMLP

This reverts commit 62d8fdf38eaae595cdaa593bab165ad303cd2734.
---
 autoemulate/emulators/nn/mlp.py | 125 +++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 3 deletions(-)

diff --git a/autoemulate/emulators/nn/mlp.py b/autoemulate/emulators/nn/mlp.py
index a1fa9d5fe..608833524 100644
--- a/autoemulate/emulators/nn/mlp.py
+++ b/autoemulate/emulators/nn/mlp.py
@@ -1,11 +1,13 @@
-from torch import nn
+import torch
+from torch import nn, optim
 
 from autoemulate.core.device import TorchDeviceMixin
-from autoemulate.core.types import DeviceLike, TensorLike
+from autoemulate.core.types import DeviceLike, GaussianLike, TensorLike
 from autoemulate.data.utils import set_random_seed
 from autoemulate.transforms.standardize import StandardizeTransform
+from autoemulate.transforms.utils import make_positive_definite
 
-from ..base import DropoutTorchBackend
+from ..base import DropoutTorchBackend, GaussianEmulator
 
 
 class MLP(DropoutTorchBackend):
@@ -145,3 +147,120 @@ def get_tune_params():
         "scheduler_cls": scheduler_params["scheduler_cls"],
         "scheduler_kwargs": scheduler_params["scheduler_kwargs"],
     }
+
+
+class GaussianMLP(DropoutTorchBackend, GaussianEmulator):
+    """Multi-Layer Perceptron (MLP) emulator with Gaussian outputs."""
+
+    def __init__(
+        self,
+        x,
+        y,
+        activation_cls=nn.ReLU,
+        loss_fn_cls=nn.MSELoss,
+        optimizer_cls=optim.Adam,
+        epochs=100,
+        layer_dims=None,
+        weight_init="default",
+        scale=1,
+        bias_init="default",
+        dropout_prob=None,
+        lr=1e-2,
+        random_seed=None,
+        device=None,
+        **kwargs,
+    ):
+        TorchDeviceMixin.__init__(self, device=device)
+        nn.Module.__init__(self)
+
+        if random_seed is not None:
+            set_random_seed(seed=random_seed)
+
+        # Ensure x and y are tensors with correct dimensions
+        x, y = self._convert_to_tensors(x, y)
+
+        # Construct the MLP layers
+        # Total params required for last layer: mean + tril covariance
+        num_params = y.shape[1] + (y.shape[1] * (y.shape[1] + 1)) // 2
+        layer_dims = (
+            [x.shape[1], *layer_dims]
+            if layer_dims
+            else [x.shape[1], 4 * num_params, 2 * num_params]
+        )
+        layers = []
+        for idx, dim in enumerate(layer_dims[1:]):
+            layers.append(nn.Linear(layer_dims[idx], dim, device=self.device))
+            layers.append(activation_cls())
+            if dropout_prob is not None:
+                layers.append(nn.Dropout(p=dropout_prob))
+
+        # Add final layer without activation
+        layers.append(nn.Linear(layer_dims[-1], num_params, device=self.device))
+        self.nn = nn.Sequential(*layers)
+
+        # Finalize initialization
+        self._initialize_weights(weight_init, scale, bias_init)
+        self.epochs = epochs
+        self.loss_fn = loss_fn_cls()
+        self.num_tasks = y.shape[1]
+        self.optimizer = optimizer_cls(self.nn.parameters(), lr=lr)  # type: ignore[call-arg] since all optimizers include lr
+        self.to(device)
+
+    def _predict(self, x, with_grad=False):
+        """Predict using the MLP model."""
+        with torch.set_grad_enabled(with_grad):
+            self.nn.eval()
+            return self(x)
+
+    def forward(self, x):
+        """Forward pass for the Gaussian MLP."""
+        y = self.nn(x)
+        mean = y[..., : self.num_tasks]
+
+        # Use Cholesky decomposition to guarantee PSD covariance matrix
+        num_chol_params = (self.num_tasks * (self.num_tasks + 1)) // 2
+        chol_params = y[..., self.num_tasks : self.num_tasks + num_chol_params]
+
+        # Assign params to matrix
+        scale_tril = torch.zeros(
+            *y.shape[:-1], self.num_tasks, self.num_tasks, device=y.device
+        )
+        tril_indices = torch.tril_indices(
+            self.num_tasks, self.num_tasks, device=y.device
+        )
+        scale_tril[..., tril_indices[0], tril_indices[1]] = chol_params
+
+        # Ensure positive variance
+        diag_idxs = torch.arange(self.num_tasks)
+        diag = (
+            torch.nn.functional.softplus(scale_tril[..., diag_idxs, diag_idxs]) + 1e-6
+        )
+        scale_tril[..., diag_idxs, diag_idxs] = diag
+
+        covariance_matrix = scale_tril @ scale_tril.transpose(-1, -2)
+
+        # TODO: for large covariance matrices, numerical instability remains
+        return GaussianLike(mean, make_positive_definite(covariance_matrix))
+
+    def loss_func(self, y_pred, y_true):
+        """Negative log likelihood loss function."""
+        return -y_pred.log_prob(y_true).mean()
+
+    @staticmethod
+    def is_multioutput() -> bool:
+        """GaussianMLP supports multi-output."""
+        return True
+
+    @staticmethod
+    def get_tune_config():
+        """Return a dictionary of hyperparameters to tune."""
+        return {
+            "epochs": [50, 100, 200],
+            "layer_dims": [[64, 64], [128, 128]],
+            "lr": [1e-1, 1e-2, 1e-3],
+            "batch_size": [16, 32],
+            "weight_init": ["default", "normal"],
+            "scale": [0.1, 1.0],
+            "bias_init": ["default", "zeros"],
+            "dropout_prob": [0.3, 0.5, None],
+        }
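
The heart of this patch is the parameterisation in GaussianMLP.forward: the final linear layer emits the predictive mean plus the n(n+1)/2 entries of a lower-triangular factor L, softplus keeps the diagonal of L strictly positive, and the covariance is recovered as L @ L.T. A minimal standalone sketch of that construction, with made-up sizes and torch.distributions.MultivariateNormal standing in for autoemulate's GaussianLike (illustrative only, not code from the patch):

import torch
from torch.distributions import MultivariateNormal

num_tasks, batch = 3, 8
num_chol_params = num_tasks * (num_tasks + 1) // 2  # 6 free entries for a 3x3 lower triangle

# stand-in for the output of the final linear layer: mean followed by tril entries
raw = torch.randn(batch, num_tasks + num_chol_params)
mean, chol_params = raw[..., :num_tasks], raw[..., num_tasks:]

# scatter the flat parameters into a lower-triangular matrix
scale_tril = torch.zeros(batch, num_tasks, num_tasks)
tril = torch.tril_indices(num_tasks, num_tasks)
scale_tril[..., tril[0], tril[1]] = chol_params

# softplus the diagonal so L has strictly positive diagonal entries,
# which makes L @ L.T symmetric positive definite
diag = torch.arange(num_tasks)
scale_tril[..., diag, diag] = torch.nn.functional.softplus(scale_tril[..., diag, diag]) + 1e-6

dist = MultivariateNormal(loc=mean, scale_tril=scale_tril)
nll = -dist.log_prob(torch.randn(batch, num_tasks)).mean()  # the loss GaussianMLP.loss_func computes

Because MultivariateNormal accepts scale_tril directly, this sketch skips the explicit L @ L.T product and the make_positive_definite repair step that the patch applies before building GaussianLike.
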
From 604b32714a99d81ff71de1d3b9d3ee4302c95c61 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 10 Jul 2025 17:56:56 +0100
Subject: [PATCH 2/3] Add scheduler

---
 autoemulate/emulators/nn/mlp.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/autoemulate/emulators/nn/mlp.py b/autoemulate/emulators/nn/mlp.py
index 608833524..99a125634 100644
--- a/autoemulate/emulators/nn/mlp.py
+++ b/autoemulate/emulators/nn/mlp.py
@@ -159,6 +159,8 @@ def __init__(
         activation_cls=nn.ReLU,
         loss_fn_cls=nn.MSELoss,
         optimizer_cls=optim.Adam,
+        scheduler_cls=None,
+        scheduler_kwargs=None,
         epochs=100,
         layer_dims=None,
         weight_init="default",
@@ -203,7 +205,9 @@ def __init__(
         self.epochs = epochs
         self.loss_fn = loss_fn_cls()
         self.num_tasks = y.shape[1]
-        self.optimizer = optimizer_cls(self.nn.parameters(), lr=lr)  # type: ignore[call-arg] since all optimizers include lr
+        self.optimizer = optimizer_cls(self.nn.parameters(), lr=lr)
+        self.scheduler_cls = scheduler_cls
+        self.scheduler_setup(scheduler_kwargs)
         self.to(device)
 
     def _predict(self, x, with_grad=False):
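
The new scheduler_cls and scheduler_kwargs arguments are passed straight through to scheduler_setup, which lives on the torch backend base class and is not shown in this diff. Presumably they follow the torch.optim.lr_scheduler convention already exposed by MLP.get_tune_params; a generic sketch of that optimizer/scheduler pairing (illustrative only, plain PyTorch, not autoemulate code):

import torch
from torch import nn, optim

model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

for epoch in range(5):
    optimizer.zero_grad()
    loss = model(torch.randn(16, 4)).pow(2).mean()
    loss.backward()
    optimizer.step()
    scheduler.step()  # decay the learning rate once per epoch
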
"scheduler_kwargs": scheduler_params["scheduler_kwargs"], } - - -class GaussianMLP(DropoutTorchBackend, GaussianEmulator): - """Multi-Layer Perceptron (MLP) emulator with Gaussian outputs.""" - - def __init__( - self, - x, - y, - activation_cls=nn.ReLU, - loss_fn_cls=nn.MSELoss, - optimizer_cls=optim.Adam, - scheduler_cls=None, - scheduler_kwargs=None, - epochs=100, - layer_dims=None, - weight_init="default", - scale=1, - bias_init="default", - dropout_prob=None, - lr=1e-2, - random_seed=None, - device=None, - **kwargs, - ): - TorchDeviceMixin.__init__(self, device=device) - nn.Module.__init__(self) - - if random_seed is not None: - set_random_seed(seed=random_seed) - - # Ensure x and y are tensors with correct dimensions - x, y = self._convert_to_tensors(x, y) - - # Construct the MLP layers - # Total params required for last layer: mean + tril covariance - num_params = y.shape[1] + (y.shape[1] * (y.shape[1] + 1)) // 2 - layer_dims = ( - [x.shape[1], *layer_dims] - if layer_dims - else [x.shape[1], 4 * num_params, 2 * num_params] - ) - layers = [] - for idx, dim in enumerate(layer_dims[1:]): - layers.append(nn.Linear(layer_dims[idx], dim, device=self.device)) - layers.append(activation_cls()) - if dropout_prob is not None: - layers.append(nn.Dropout(p=dropout_prob)) - - # Add final layer without activation - layers.append(nn.Linear(layer_dims[-1], num_params, device=self.device)) - self.nn = nn.Sequential(*layers) - - # Finalize initialization - self._initialize_weights(weight_init, scale, bias_init) - self.epochs = epochs - self.loss_fn = loss_fn_cls() - self.num_tasks = y.shape[1] - self.optimizer = optimizer_cls(self.nn.parameters(), lr=lr) - self.scheduler_cls = scheduler_cls - self.scheduler_setup(scheduler_kwargs) - self.to(device) - - def _predict(self, x, with_grad=False): - """Predict using the MLP model.""" - with torch.set_grad_enabled(with_grad): - self.nn.eval() - return self(x) - - def forward(self, x): - """Forward pass for the Gaussian MLP.""" - y = self.nn(x) - mean = y[..., : self.num_tasks] - - # Use Cholesky decomposition to guarantee PSD covariance matrix - num_chol_params = (self.num_tasks * (self.num_tasks + 1)) // 2 - chol_params = y[..., self.num_tasks : self.num_tasks + num_chol_params] - - # Assign params to matrix - scale_tril = torch.zeros( - *y.shape[:-1], self.num_tasks, self.num_tasks, device=y.device - ) - tril_indices = torch.tril_indices( - self.num_tasks, self.num_tasks, device=y.device - ) - scale_tril[..., tril_indices[0], tril_indices[1]] = chol_params - - # Ensure positive variance - diag_idxs = torch.arange(self.num_tasks) - diag = ( - torch.nn.functional.softplus(scale_tril[..., diag_idxs, diag_idxs]) + 1e-6 - ) - scale_tril[..., diag_idxs, diag_idxs] = diag - - covariance_matrix = scale_tril @ scale_tril.transpose(-1, -2) - - # TODO: for large covariance martrices, numerical instability remains - return GaussianLike(mean, make_positive_definite(covariance_matrix)) - - def loss_func(self, y_pred, y_true): - """Negative log likelihood loss function.""" - return -y_pred.log_prob(y_true).mean() - - @staticmethod - def is_multioutput() -> bool: - """GaussianMLP supports multi-output.""" - return True - - @staticmethod - def get_tune_config(): - """Return a dictionary of hyperparameters to tune.""" - return { - "epochs": [50, 100, 200], - "layer_dims": [[64, 64], [128, 128]], - "lr": [1e-1, 1e-2, 1e-3], - "batch_size": [16, 32], - "weight_init": ["default", "normal"], - "scale": [0.1, 1.0], - "bias_init": ["default", "zeros"], - "dropout_prob": [0.3, 
diff --git a/autoemulate/experimental/emulators/nn/mlp_gaussian.py b/autoemulate/experimental/emulators/nn/mlp_gaussian.py
new file mode 100644
index 000000000..6c9e55fa0
--- /dev/null
+++ b/autoemulate/experimental/emulators/nn/mlp_gaussian.py
@@ -0,0 +1,114 @@
+import torch
+from autoemulate.core.device import TorchDeviceMixin
+from autoemulate.core.types import DeviceLike, GaussianLike, TensorLike
+from autoemulate.data.utils import set_random_seed
+from autoemulate.emulators.base import GaussianEmulator
+from autoemulate.emulators.nn.mlp import MLP
+from autoemulate.transforms.standardize import StandardizeTransform
+from autoemulate.transforms.utils import make_positive_definite
+from torch import nn
+
+
+class GaussianMLP(MLP, GaussianEmulator):
+    """Multi-Layer Perceptron (MLP) emulator with Gaussian outputs."""
+
+    def __init__(
+        self,
+        x: TensorLike,
+        y: TensorLike,
+        standardize_x: bool = True,
+        standardize_y: bool = True,
+        activation_cls: type[nn.Module] = nn.ReLU,
+        loss_fn_cls: type[nn.Module] = nn.MSELoss,
+        epochs: int = 100,
+        batch_size: int = 16,
+        layer_dims: list[int] | None = None,
+        weight_init: str = "default",
+        scale: float = 1.0,
+        bias_init: str = "default",
+        dropout_prob: float | None = None,
+        lr: float = 1e-2,
+        random_seed: int | None = None,
+        device: DeviceLike | None = None,
+        **scheduler_kwargs,
+    ):
+        TorchDeviceMixin.__init__(self, device=device)
+        nn.Module.__init__(self)
+
+        if random_seed is not None:
+            set_random_seed(seed=random_seed)
+
+        # Ensure x and y are tensors with correct dimensions
+        x, y = self._convert_to_tensors(x, y)
+
+        # Construct the MLP layers
+        # Total params required for last layer: mean + tril covariance
+        num_params = y.shape[1] + (y.shape[1] * (y.shape[1] + 1)) // 2
+        layer_dims = (
+            [x.shape[1], *layer_dims]
+            if layer_dims
+            else [x.shape[1], 4 * num_params, 2 * num_params]
+        )
+        layers = []
+        for idx, dim in enumerate(layer_dims[1:]):
+            layers.append(nn.Linear(layer_dims[idx], dim, device=self.device))
+            layers.append(activation_cls())
+            if dropout_prob is not None:
+                layers.append(nn.Dropout(p=dropout_prob))
+
+        # Add final layer without activation
+        layers.append(nn.Linear(layer_dims[-1], num_params, device=self.device))
+        self.nn = nn.Sequential(*layers)
+
+        # Finalize initialization
+        self._initialize_weights(weight_init, scale, bias_init)
+        self.x_transform = StandardizeTransform() if standardize_x else None
+        self.y_transform = StandardizeTransform() if standardize_y else None
+        self.epochs = epochs
+        self.loss_fn = loss_fn_cls()
+        self.lr = lr
+        self.num_tasks = y.shape[1]
+        self.batch_size = batch_size
+        self.optimizer = self.optimizer_cls(self.nn.parameters(), lr=lr)  # type: ignore  # noqa: PGH003
+        self.scheduler_setup(scheduler_kwargs)
+        self.to(device)
+
+    def _predict(self, x, with_grad=False):
+        """Predict using the MLP model."""
+        with torch.set_grad_enabled(with_grad):
+            self.nn.eval()
+            return self(x)
+
+    def forward(self, x):
+        """Forward pass for the Gaussian MLP."""
+        y = self.nn(x)
+        mean = y[..., : self.num_tasks]
+
+        # Use Cholesky decomposition to guarantee PSD covariance matrix
+        num_chol_params = (self.num_tasks * (self.num_tasks + 1)) // 2
+        chol_params = y[..., self.num_tasks : self.num_tasks + num_chol_params]
+
+        # Assign params to matrix
+        scale_tril = torch.zeros(
+            *y.shape[:-1], self.num_tasks, self.num_tasks, device=y.device
+        )
+        tril_indices = torch.tril_indices(
+            self.num_tasks, self.num_tasks, device=y.device
+        )
+        scale_tril[..., tril_indices[0], tril_indices[1]] = chol_params
+
+        # Ensure positive variance
+        diag_idxs = torch.arange(self.num_tasks)
+        diag = (
+            torch.nn.functional.softplus(scale_tril[..., diag_idxs, diag_idxs]) + 1e-6
+        )
+        scale_tril[..., diag_idxs, diag_idxs] = diag
+
+        covariance_matrix = scale_tril @ scale_tril.transpose(-1, -2)
+
+        # TODO: for large covariance matrices, numerical instability remains
+        return GaussianLike(mean, make_positive_definite(covariance_matrix))
+
+    def loss_func(self, y_pred, y_true):
+        """Negative log likelihood loss function."""
+        return -y_pred.log_prob(y_true).mean()
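
A hypothetical end-to-end sketch of the relocated emulator, using only the constructor, forward and loss_func shown in the new module; training and the public fit/predict entry points come from autoemulate's backend base classes and are assumed rather than shown here:

import torch
from autoemulate.experimental.emulators.nn.mlp_gaussian import GaussianMLP

x = torch.randn(200, 5)   # 200 samples, 5 inputs
y = torch.randn(200, 2)   # 2 jointly modelled outputs

emulator = GaussianMLP(x, y, layer_dims=[64, 64], random_seed=0)

dist = emulator(x[:10])                  # forward() returns a GaussianLike distribution
nll = emulator.loss_func(dist, y[:10])   # negative log likelihood used during training
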