diff --git a/CHANGELOG.md b/CHANGELOG.md
index c8b177eb11..989bfcc60e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed a bug where test metrics were not logged correctly with active learning ([#879](https://github.com/PyTorchLightning/lightning-flash/pull/879))
+
+- Fixed a bug where validation metrics could be aggregated together with test metrics in some cases ([#900](https://github.com/PyTorchLightning/lightning-flash/pull/900))
+
+
 
 ## [0.5.1] - 2021-10-26
 
 ### Added
diff --git a/flash/core/model.py b/flash/core/model.py
index 20da95d285..6f87bcb4c3 100644
--- a/flash/core/model.py
+++ b/flash/core/model.py
@@ -354,6 +354,7 @@ def __init__(
 
         self.train_metrics = nn.ModuleDict({} if metrics is None else get_callable_dict(metrics))
         self.val_metrics = nn.ModuleDict({} if metrics is None else get_callable_dict(deepcopy(metrics)))
+        self.test_metrics = nn.ModuleDict({} if metrics is None else get_callable_dict(deepcopy(metrics)))
         self.learning_rate = learning_rate
         # TODO: should we save more? Bug on some regarding yaml if we save metrics
         self.save_hyperparameters("learning_rate", "optimizer")
@@ -454,7 +455,7 @@ def validation_step(self, batch: Any, batch_idx: int) -> None:
         )
 
     def test_step(self, batch: Any, batch_idx: int) -> None:
-        output = self.step(batch, batch_idx, self.val_metrics)
+        output = self.step(batch, batch_idx, self.test_metrics)
         self.log_dict(
             {f"test_{k}": v for k, v in output[OutputKeys.LOGS].items()},
             on_step=False,
diff --git a/tests/core/test_model.py b/tests/core/test_model.py
index 0e68344bb5..f31bba3e70 100644
--- a/tests/core/test_model.py
+++ b/tests/core/test_model.py
@@ -437,6 +437,7 @@ def i_will_create_a_misconfiguration_exception(optimizer):
 def test_classification_task_metrics():
     train_dataset = FixedDataset([0, 1])
     val_dataset = FixedDataset([1, 1])
+    test_dataset = FixedDataset([0, 0])
 
     model = OnesModel()
 
@@ -444,6 +445,13 @@ class CheckAccuracy(Callback):
         def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
             assert math.isclose(trainer.callback_metrics["train_accuracy_epoch"], 0.5)
 
+        def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+            assert math.isclose(trainer.callback_metrics["val_accuracy"], 1.0)
+
+        def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
+            assert math.isclose(trainer.callback_metrics["test_accuracy"], 0.0)
+
     task = ClassificationTask(model)
     trainer = flash.Trainer(max_epochs=1, callbacks=CheckAccuracy(), gpus=torch.cuda.device_count())
     trainer.fit(task, train_dataloader=DataLoader(train_dataset), val_dataloaders=DataLoader(val_dataset))
+    trainer.test(task, dataloaders=DataLoader(test_dataset))