Add benchmark script

sgreenbury · sgreenbury · commit 28506b006986 · 2025-07-14T11:55:28.000+01:00
diff --git a/autoemulate/experimental/emulators/gaussian_process/exact.py b/autoemulate/experimental/emulators/gaussian_process/exact.py
@@ -52,7 +52,7 @@ class GaussianProcessExact(GaussianProcessEmulator, gpytorch.models.ExactGP):
     # TODO: refactor to work more like PyTorchBackend once any subclasses implemented
     optimizer_cls: type[optim.Optimizer] = optim.Adam
     optimizer: optim.Optimizer
-    lr: float = 1e-1
+    lr: float = 2e-1
     scheduler_cls: type[LRScheduler] | None = None
 
     def __init__(  # noqa: PLR0913 allow too many arguments since all currently required
@@ -64,7 +64,7 @@ def __init__(  # noqa: PLR0913 allow too many arguments since all currently requ
         covar_module_fn: CovarModuleFn = rbf,
         epochs: int = 50,
         activation: type[nn.Module] = nn.ReLU,
-        lr: float = 2e-1,
+        lr: float = 1e-1,
         early_stopping: EarlyStopping | None = None,
         device: DeviceLike | None = None,
         **kwargs,
@@ -225,8 +225,7 @@ def get_tune_config():
                 matern_5_2_plus_rq,
                 rbf_times_linear,
             ],
-            "epochs": [10, 50, 100, 200],
-            "batch_size": [16, 32],
+            "epochs": [50, 100, 200],
             "activation": [
                 nn.ReLU,
                 nn.GELU,
diff --git a/autoemulate/experimental/emulators/nn/mlp.py b/autoemulate/experimental/emulators/nn/mlp.py
@@ -113,14 +113,17 @@ def is_multioutput() -> bool:
     def get_tune_config():
         scheduler_params = MLP.scheduler_config()
         return {
-            "epochs": [50, 100, 200],
-            "layer_dims": [[32, 16], [64, 32, 16]],
-            "lr": [1e-1, 1e-2, 1e-3],
+            # "epochs": [50, 100, 200],
+            "epochs": [100, 200],
+            # "layer_dims": [[32, 16], [64, 32, 16]],
+            "layer_dims": [[8, 4], [16, 8], [32, 16]],
+            # "lr": [5e-1, 2e-1, 1e-1, 1e-2, 1e-3],
+            "lr": [5e-1, 2e-1, 1e-1, 1e-2],
             "batch_size": [16, 32],
             "weight_init": ["default", "normal"],
             "scale": [0.1, 1.0],
             "bias_init": ["default", "zeros"],
-            "dropout_prob": [0.3, 0.5, None],
+            "dropout_prob": [0.3, None],
             "scheduler_cls": scheduler_params["scheduler_cls"],
             "scheduler_kwargs": scheduler_params["scheduler_kwargs"],
         }
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
@@ -0,0 +1,108 @@
+import itertools
+
+import click
+import numpy as np
+import pandas as pd
+from autoemulate.experimental.compare import AutoEmulate
+from autoemulate.experimental.emulators import ALL_EMULATORS
+from autoemulate.experimental.simulations.projectile import ProjectileMultioutput
+from tqdm import tqdm
+
+
+def run_benchmark(
+    n_samples: int, n_iter: int, n_splits: int, log_level: str
+) -> pd.DataFrame:
+    """Fit all emulators on simulated projectile data and summarise them.
+
+    Parameters
+    ----------
+    n_samples
+        Number of input points requested from ``ProjectileMultioutput``.
+    n_iter
+        Passed to ``AutoEmulate`` as ``n_iter``.
+    n_splits
+        Passed to ``AutoEmulate`` as ``n_splits``.
+    log_level
+        Accepted for interface compatibility but not yet forwarded to
+        ``AutoEmulate`` (the keyword is commented out below).
+
+    Returns
+    -------
+    pd.DataFrame
+        Whatever ``AutoEmulate.summarise()`` returns for this configuration.
+    """
+    projectile = ProjectileMultioutput()
+    x = projectile.sample_inputs(n_samples).float()
+    y = projectile.forward_batch(x).float()
+
+    ae = AutoEmulate(
+        x,
+        y,
+        models=ALL_EMULATORS,
+        n_iter=n_iter,
+        n_splits=n_splits,
+        # log_level=log_level,
+    )
+
+    return ae.summarise()
+
+
+@click.command()
+@click.option(
+    "--n_samples_list",
+    type=int,
+    multiple=True,
+    default=[10, 50, 100, 200, 500],
+    help="Number of samples to generate (repeat the option to pass several)",
+)
+@click.option(
+    "--n_iter_list",
+    type=int,
+    multiple=True,
+    default=[10, 50, 100, 200],
+    help="Number of iterations to run (repeat the option to pass several)",
+)
+@click.option(
+    "--n_splits_list",
+    type=int,
+    multiple=True,
+    default=[2, 4],
+    help="Number of splits for cross-validation (repeat the option to pass several)",
+)
+@click.option("--log_level", default="info", help="Logging level")
+def main(n_samples_list, n_iter_list, n_splits_list, log_level):
+    """Benchmark every emulator in ALL_EMULATORS over a grid of settings.
+
+    Each combination of sample count, iteration count and split count is run
+    in a seeded random order, and the accumulated results are rewritten to
+    notebooks/benchmark_results.csv after every run.
+    """
+    dfs = []
+
+    params = list(itertools.product(n_samples_list, n_iter_list, n_splits_list))
+    # Fixed seed so the shuffled run order is the same on every invocation.
+    np.random.seed(43)
+    params = np.random.permutation(params)
+    for row in tqdm(params):
+        # The permutation yields NumPy integers; convert back to built-in ints
+        # before handing them to the benchmark and the results table.
+        n_samples, n_iter, n_splits = (int(value) for value in row)
+        print(
+            f"Running benchmark with {n_samples} samples, {n_iter} iterations, "
+            f"and {n_splits} splits"
+        )
+        df = run_benchmark(n_samples, n_iter, n_splits, log_level)
+
+        df["n_samples"] = n_samples
+        df["n_iter"] = n_iter
+        df["n_splits"] = n_splits
+        dfs.append(df)
+        # The CSV is rewritten after every run, so it always holds the
+        # results gathered so far.
+        final_df = pd.concat(dfs, ignore_index=True)
+        final_df.sort_values("r2", ascending=False).to_csv(
+            "notebooks/benchmark_results.csv", index=False
+        )
+
+
+if __name__ == "__main__":
+    main()