Commit 21cfdf6

ref: result 1/n (make monitor default to checkpoint_on to simplify result syntax) (#3571)
* ref: result 1/n (make monitor default to checkpoint_on to simplify result syntax)
* Update pytorch_lightning/callbacks/model_checkpoint.py
  Co-authored-by: ananthsub <[email protected]>
* force crash when max_epochs < epochs in a checkpoint

Co-authored-by: ananthsub <[email protected]>
1 parent 2775389 commit 21cfdf6

23 files changed: +170 −115 lines
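In practical terms, the default `ModelCheckpoint` and `EarlyStopping` callbacks now monitor the keys that `EvalResult`/`TrainResult` populate (`'checkpoint_on'` and `'early_stop_on'`), so the common case needs no `monitor` argument. A minimal sketch of the intended usage, assuming the 0.9-era `pl.EvalResult` API as of this commit (the rest of the module body is a placeholder):

    import pytorch_lightning as pl
    import torch.nn.functional as F

    class LitModel(pl.LightningModule):
        # ... __init__ / forward / training_step elided ...

        def validation_step(self, batch, batch_idx):
            x, y = batch
            loss = F.cross_entropy(self(x), y)
            # checkpoint_on fills the 'checkpoint_on' key the default ModelCheckpoint monitors
            result = pl.EvalResult(checkpoint_on=loss)
            result.log('val_loss', loss)
            return result

    # the default callbacks now line up with the Result keys, no monitor= needed
    trainer = pl.Trainer(checkpoint_callback=True, early_stop_callback=True)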

docs/source/early_stopping.rst

+4-3
@@ -18,9 +18,10 @@ If you do this repeatedly, for every epoch you had originally requested, then th
 
 Default Epoch End Callback Behavior
 -----------------------------------
-By default early stopping will be enabled if `'val_loss'`
-is found in :meth:`~pytorch_lightning.core.lightning.LightningModule.validation_epoch_end`'s
-return dict. Otherwise training will proceed with early stopping disabled.
+By default early stopping will be enabled if the `early_stop_on` key in the EvalResult object is used
+in either the :meth:`~pytorch_lightning.core.lightning.LightningModule.validation_step` method or
+the :meth:`~pytorch_lightning.core.lightning.LightningModule.validation_epoch_end` method.
+
 
 ----------
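
The documented default can be exercised with a minimal sketch like the following (assuming the `EvalResult` API at this commit; `pl` is `pytorch_lightning` and `F` is `torch.nn.functional`):

    def validation_step(self, batch, batch_idx):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        # setting early_stop_on populates the 'early_stop_on' key that the
        # default EarlyStopping callback now monitors
        result = pl.EvalResult(early_stop_on=loss)
        result.log('val_loss', loss)
        return result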

docs/source/introduction_guide.rst

+3
@@ -683,6 +683,9 @@ Since the `validation_step` processes a single batch, use the `EvalResult` to lo
 .. code-block:: python
 
     def validation_step(self, batch, batch_idx):
+        loss = MSE_loss(...)
+
+        # loss is a tensor. The Checkpoint Callback is monitoring 'checkpoint_on'
         result = pl.EvalResult(checkpoint_on=loss)
         result.log('val_loss', loss)

docs/source/lightning_module.rst

+2
@@ -288,6 +288,8 @@ For cases like production, you might want to iterate different models inside a L
         y_hat = self.model(x)
         loss = F.cross_entropy(y_hat, y)
         acc = FM.accuracy(y_hat, y)
+
+        # loss is tensor. The Checkpoint Callback is monitoring 'checkpoint_on'
         result = pl.EvalResult(checkpoint_on=loss)
         result.log_dict({'val_acc': acc, 'val_loss': loss})
         return result

docs/source/weights_loading.rst

+1-1
@@ -45,7 +45,7 @@ To modify the behavior of checkpointing pass in your own callback.
         filepath=os.getcwd(),
         save_top_k=1,
         verbose=True,
-        monitor='val_loss',
+        monitor='checkpoint_on',
         mode='min',
         prefix=''
     )
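
As a usage note, the updated snippet plugs into the Trainer as before; a hedged sketch assuming the 0.9-era Trainer and callback signatures:

    import os
    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint

    # the callback now watches 'checkpoint_on', which EvalResult(checkpoint_on=...) fills in
    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd(),
        save_top_k=1,
        verbose=True,
        monitor='checkpoint_on',
        mode='min',
        prefix='',
    )
    trainer = Trainer(checkpoint_callback=checkpoint_callback)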

pytorch_lightning/callbacks/early_stopping.py

+3-29
@@ -42,8 +42,7 @@ class EarlyStopping(Callback):
     r"""
 
     Args:
-        monitor: quantity to be monitored. Default: ``'val_loss'``.
-            .. note:: Has no effect when using `EvalResult` or `TrainResult`
+        monitor: quantity to be monitored. Default: ``'early_stop_on'``.
         min_delta: minimum change in the monitored quantity
             to qualify as an improvement, i.e. an absolute
             change of less than `min_delta`, will count as no
@@ -73,7 +72,7 @@ class EarlyStopping(Callback):
         'max': torch.gt,
     }
 
-    def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: int = 3,
+    def __init__(self, monitor: str = 'early_stop_on', min_delta: float = 0.0, patience: int = 3,
                  verbose: bool = False, mode: str = 'auto', strict: bool = True):
         super().__init__()
         self.monitor = monitor
@@ -150,16 +149,6 @@ def on_validation_epoch_end(self, trainer, pl_module):
         if trainer.running_sanity_check:
             return
 
-        self.__warn_deprecated_monitor_key()
-
-        val_es_key = 'val_early_stop_on'
-        if trainer.logger_connector.callback_metrics.get(val_es_key) is not None:
-            self.monitor = val_es_key
-
-        # disable strict checking when using structured results
-        if val_es_key in trainer.logger_connector.callback_metrics:
-            self.strict = False
-
         if self._validate_condition_metric(trainer.logger_connector.callback_metrics):
             # turn off early stopping in on_train_epoch_end
             self.based_on_eval_results = True
@@ -171,29 +160,14 @@ def on_train_epoch_end(self, trainer, pl_module):
 
         # early stopping can also work in the train loop when there is no val loop
         should_check_early_stop = False
-        # early_stop_on takes precedence over monitor key
-        train_es_key = 'early_stop_on'
-        if trainer.logger_connector.callback_metrics.get(train_es_key, None) is not None:
-            self.monitor = train_es_key
-            should_check_early_stop = True
+
         # fallback to monitor key in result dict
         if trainer.logger_connector.callback_metrics.get(self.monitor, None) is not None:
             should_check_early_stop = True
 
         if should_check_early_stop:
             self._run_early_stopping_check(trainer, pl_module)
 
-    def __warn_deprecated_monitor_key(self):
-        using_result_obj = os.environ.get('PL_USING_RESULT_OBJ', None)
-        invalid_key = self.monitor not in ['val_loss', 'early_stop_on', 'val_early_stop_on', 'loss']
-        if using_result_obj and not self.warned_result_obj and invalid_key:
-            self.warned_result_obj = True
-            rank_zero_warn(
-                f"When using `EvalResult(early_stop_on=X)` or `TrainResult(early_stop_on=X)`"
-                " the 'monitor' key of `EarlyStopping` has no effect. "
-                f" Remove `EarlyStopping(monitor='{self.monitor}')` to fix."
-            )
-
     def _run_early_stopping_check(self, trainer, pl_module):
         """
         Checks whether the early stopping condition is met
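
The net effect on `EarlyStopping` is that the callback no longer rewrites its own `monitor` at runtime; it simply checks whether the configured key is present in `callback_metrics`. A hypothetical standalone sketch of that check (plain dicts, not the Lightning classes):

    def should_run_early_stopping_check(callback_metrics: dict, monitor: str = 'early_stop_on') -> bool:
        # mirrors the simplified on_train_epoch_end: no key precedence, just a lookup
        return callback_metrics.get(monitor, None) is not None

    assert should_run_early_stopping_check({'early_stop_on': 0.42})
    assert not should_run_early_stopping_check({'val_acc': 0.91})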

pytorch_lightning/callbacks/model_checkpoint.py

+16-30
@@ -30,6 +30,7 @@
 from pytorch_lightning.callbacks.base import Callback
 from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import get_filesystem
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class ModelCheckpoint(Callback):
@@ -118,7 +119,7 @@ class ModelCheckpoint(Callback):
     def __init__(
         self,
         filepath: Optional[str] = None,
-        monitor: str = "val_loss",
+        monitor: str = "checkpoint_on",
         verbose: bool = False,
         save_last: bool = False,
         save_top_k: int = 1,
@@ -317,48 +318,30 @@ def on_pretrain_routine_start(self, trainer, pl_module):
         ), "tried to make a checkpoint from non global_rank=0"
         self._fs.makedirs(self.dirpath, exist_ok=True)
 
-    def __warn_deprecated_monitor_key(self):
-        using_result_obj = os.environ.get("PL_USING_RESULT_OBJ", None)
-        invalid_key = self.monitor not in [
-            "val_loss",
-            "checkpoint_on",
-            "loss",
-            "val_checkpoint_on",
-        ]
-        if using_result_obj and not self.warned_result_obj and invalid_key:
-            self.warned_result_obj = True
-            rank_zero_warn(
-                f"When using `EvalResult(checkpoint_on=X)` or `TrainResult(checkpoint_on=X)`"
-                " the 'monitor' key of `ModelCheckpoint` has no effect."
-                f" Remove `ModelCheckpoint(monitor='{self.monitor}')` to fix."
-            )
-
     @rank_zero_only
     def on_validation_end(self, trainer, pl_module):
         # only run on main process
         if trainer.global_rank != 0:
             return
 
-        if trainer.running_sanity_check:
+        # no models are saved
+        if self.save_top_k == 0:
             return
 
-        # TODO: remove when dict results are deprecated
-        self.__warn_deprecated_monitor_key()
+        if trainer.running_sanity_check:
+            return
 
         metrics = trainer.logger_connector.callback_metrics
         epoch = trainer.current_epoch
 
-        # support structured results
-        if metrics.get("checkpoint_on") is not None:
-            self.monitor = "checkpoint_on"
-
-        # conditioned val metrics override conditioned train loop metrics
-        if metrics.get("val_checkpoint_on") is not None:
-            self.monitor = "val_checkpoint_on"
+        # validate metric
+        if not self._is_valid_monitor_key(metrics):
+            keys = list(metrics.keys())
+            m = f"""
+                ModelCheckpoint(monitor='{self.monitor}') not found in the returned metrics ({keys}),
+                "did you call result.log(f'{self.monitor}', tensor)?"""
+            raise MisconfigurationException(m)
 
-        if self.save_top_k == 0:
-            # no models are saved
-            return
         if (
             self.epoch_last_check is not None
             and (epoch - self.epoch_last_check) < self.period
@@ -420,6 +403,9 @@ def on_validation_end(self, trainer, pl_module):
         if self.last_model_path and self.last_model_path != filepath:
             self._del_model(self.last_model_path)
 
+    def _is_valid_monitor_key(self, metrics):
+        return self.monitor in metrics or len(metrics) == 0
+
     def _do_check_save(
         self,
         filepath: str,
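
For `ModelCheckpoint`, the silent monitor switching is replaced by a fail-fast check. A hypothetical standalone sketch of the new validation logic (plain Python, not the Lightning class):

    def is_valid_monitor_key(metrics: dict, monitor: str = 'checkpoint_on') -> bool:
        # empty metrics are tolerated; otherwise the monitor key must be present,
        # or the callback raises MisconfigurationException
        return monitor in metrics or len(metrics) == 0

    assert is_valid_monitor_key({'checkpoint_on': 0.3})
    assert is_valid_monitor_key({})
    assert not is_valid_monitor_key({'val_acc': 0.91})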

pytorch_lightning/core/step_result.py

+5-2
@@ -801,8 +801,11 @@ def log_dict(
         )
 
     def get_callback_metrics(self) -> dict:
-        result = {'val_early_stop_on': self.early_stop_on, 'val_checkpoint_on': self.checkpoint_on}
-
+        result = {}
+        if self.early_stop_on:
+            result['early_stop_on'] = self.early_stop_on
+        if self.checkpoint_on:
+            result['checkpoint_on'] = self.checkpoint_on
         return result
 
     def write(self, name: str, values: Union[Tensor, list], filename: str = 'predictions.pt'):
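
The change to `get_callback_metrics` means callbacks only ever see the keys the user actually set, under the un-prefixed names they monitor. A hypothetical free-function sketch of the same contract:

    def get_callback_metrics(early_stop_on=None, checkpoint_on=None) -> dict:
        # mirrors the truthiness checks in the diff above
        result = {}
        if early_stop_on:
            result['early_stop_on'] = early_stop_on
        if checkpoint_on:
            result['checkpoint_on'] = checkpoint_on
        return result

    assert get_callback_metrics(checkpoint_on=0.25) == {'checkpoint_on': 0.25}
    assert get_callback_metrics() == {}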

pytorch_lightning/trainer/__init__.py

+5-5
@@ -350,7 +350,7 @@ def on_train_end(self, trainer, pl_module):
         filepath=os.getcwd(),
         save_top_k=True,
         verbose=True,
-        monitor='val_loss',
+        monitor='checkpoint_on',
         mode='min',
         prefix=''
     )
@@ -411,9 +411,9 @@ def on_train_end(self, trainer, pl_module):
 Callback for early stopping.
 early_stop_callback (:class:`pytorch_lightning.callbacks.EarlyStopping`)
 
-    - ``True``: A default callback monitoring ``'val_loss'`` (if dict is returned in validation loop) or
+    - ``True``: A default callback monitoring ``'early_stop_on'`` (if dict is returned in validation loop) or
      ``early_stopping_on`` (if :class:`~pytorch_lightning.core.step_result.Result` is returned) is created.
-      Will raise an error if a dictionary is returned and ``'val_loss'`` is not found.
+      Will raise an error if a dictionary is returned and ``'early_stop_on'`` is not found.
       Will raise an error if a :class:`~pytorch_lightning.core.step_result.Result` is returned
       and ``early_stopping_on`` was not specified.
     - ``False``: Early stopping will be disabled.
@@ -426,15 +426,15 @@ def on_train_end(self, trainer, pl_module):
 
     # default used by the Trainer
     early_stop = EarlyStopping(
-        monitor='val_loss',
+        monitor='early_stop_on',
         patience=3,
         strict=False,
         verbose=False,
         mode='min'
     )
     trainer = Trainer(early_stop_callback=early_stop)
 
-.. note:: If ``'val_loss'`` is not found will work as if early stopping is disabled.
+.. note:: If ``'early_stop_on'`` is not found will work as if early stopping is disabled.
 
 fast_dev_run
 ^^^^^^^^^^^^
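
For reference, the documented behavior amounts to constructing the callback explicitly; a sketch assuming the Trainer signature at this commit (roughly what the docstring above describes for ``early_stop_callback=True``):

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import EarlyStopping

    early_stop = EarlyStopping(monitor='early_stop_on', patience=3, strict=False, verbose=False, mode='min')
    trainer = Trainer(early_stop_callback=early_stop)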

pytorch_lightning/trainer/connectors/callback_connector.py

+4-12
@@ -1,7 +1,6 @@
 import os
 from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, ProgressBarBase, ProgressBar
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.model_utils import is_overridden
 
 
 class CallbackConnector:
@@ -38,7 +37,7 @@ def on_trainer_init(
         # configure checkpoint callback
         # it is important that this is the last callback to run
         # pass through the required args to figure out defaults
-        checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback)
+        checkpoint_callback = self.init_default_checkpoint_callback(checkpoint_callback)
         if checkpoint_callback:
             self.trainer.callbacks.append(checkpoint_callback)
 
@@ -51,18 +50,11 @@ def on_trainer_init(
             progress_bar_refresh_rate, process_position
         )
 
-    def configure_checkpoint_callback(self, checkpoint_callback):
+    def init_default_checkpoint_callback(self, checkpoint_callback):
         if checkpoint_callback is True:
-            # when no val step is defined, use 'loss' otherwise 'val_loss'
-            train_step_only = not is_overridden('validation_step', self.trainer.get_model())
-            monitor_key = 'loss' if train_step_only else 'val_loss'
-            checkpoint_callback = ModelCheckpoint(
-                filepath=None,
-                monitor=monitor_key
-            )
+            checkpoint_callback = ModelCheckpoint(filepath=None)
         elif checkpoint_callback is False:
             checkpoint_callback = None
-
         if checkpoint_callback:
             checkpoint_callback.save_function = self.trainer.save_checkpoint
 
@@ -71,7 +63,7 @@ def configure_checkpoint_callback(self, checkpoint_callback):
     def configure_early_stopping(self, early_stop_callback):
         if early_stop_callback is True or None:
             early_stop_callback = EarlyStopping(
-                monitor='val_loss',
+                monitor='early_stop_on',
                 patience=3,
                 strict=True,
                 verbose=True,
pytorch_lightning/trainer/connectors/logger_connector.py

+14-4
@@ -110,9 +110,9 @@ def _log_on_evaluation_epoch_end_metrics(self, eval_results, using_eval_result):
         if using_eval_result:
             if isinstance(eval_results, list):
                 for eval_result in eval_results:
-                    self.trainer.logger_connector.callback_metrics = eval_result.callback_metrics
+                    self.trainer.logger_connector.callback_metrics.update(eval_result.callback_metrics)
             else:
-                self.trainer.logger_connector.callback_metrics = eval_results.callback_metrics
+                self.trainer.logger_connector.callback_metrics.update(eval_results.callback_metrics)
         else:
             if isinstance(eval_results, list):
                 for eval_result in eval_results:
@@ -121,13 +121,23 @@ def _log_on_evaluation_epoch_end_metrics(self, eval_results, using_eval_result):
                         flat = {'val_loss': eval_result}
                     else:
                         flat = flatten_dict(eval_result)
+
+                    # removing val_loss magic word to map to checkpoint + ES callback
+                    if 'val_loss' in flat:
+                        flat['checkpoint_on'] = flat['val_loss']
+                        flat['early_stop_on'] = flat['val_loss']
                     self.trainer.logger_connector.callback_metrics.update(flat)
             else:
                 # with a scalar return, auto set it to "val_loss" for callbacks
                 if isinstance(eval_results, torch.Tensor):
                     flat = {'val_loss': eval_results}
                 else:
                     flat = flatten_dict(eval_results)
+
+                # removing val_loss magic word to map to checkpoint + ES callback
+                if 'val_loss' in flat:
+                    flat['checkpoint_on'] = flat['val_loss']
+                    flat['early_stop_on'] = flat['val_loss']
                 self.trainer.logger_connector.callback_metrics.update(flat)
 
     def __log_evaluation_epoch_metrics_2(self, eval_results, test_mode):
@@ -151,7 +161,7 @@ def __log_evaluation_epoch_metrics_2(self, eval_results, test_mode):
         if test_mode:
             callback_metrics = {}
         else:
-            _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_output(result)
+            _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_dict_result(result)
 
         # eval loop returns all metrics
         dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics}
@@ -239,7 +249,7 @@ def log_train_epoch_end_metrics(self,
             epoch_log_metrics = epoch_output.epoch_log_metrics
             epoch_progress_bar_metrics = epoch_output.epoch_pbar_metrics
         else:
-            _processed_outputs = self.trainer.process_output(epoch_output)
+            _processed_outputs = self.trainer.process_dict_result(epoch_output)
             epoch_progress_bar_metrics = _processed_outputs[1]
             epoch_log_metrics = _processed_outputs[2]
             epoch_callback_metrics = _processed_outputs[3]
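
The mapping added above can be summarized as: dict or scalar validation results that use the `'val_loss'` magic word are copied onto the keys the callbacks now monitor. A hypothetical standalone sketch:

    def map_val_loss_to_callback_keys(flat: dict) -> dict:
        # 'val_loss' is kept for logging; the callback keys are added alongside it
        if 'val_loss' in flat:
            flat['checkpoint_on'] = flat['val_loss']
            flat['early_stop_on'] = flat['val_loss']
        return flat

    assert map_val_loss_to_callback_keys({'val_loss': 0.7}) == {
        'val_loss': 0.7, 'checkpoint_on': 0.7, 'early_stop_on': 0.7
    }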

pytorch_lightning/trainer/logging.py

+12-1
@@ -52,7 +52,7 @@ def metrics_to_scalars(self, metrics):
 
         return new_metrics
 
-    def process_output(self, output, train=False):
+    def process_dict_result(self, output, train=False):
         """Reduces output according to the training mode.
 
         Separates loss from logging and progress bar metrics
@@ -147,6 +147,17 @@ def process_output(self, output, train=False):
         # no .item() because it will slow things down
         callback_metrics = recursive_detach(callback_metrics)
 
+        # replace loss with checkpoint_on
+        if 'loss' in callback_metrics:
+            callback_metrics['checkpoint_on'] = callback_metrics['loss']
+            callback_metrics['early_stop_on'] = callback_metrics['loss']
+            del callback_metrics['loss']
+
+        if 'val_loss' in callback_metrics:
+            callback_metrics['checkpoint_on'] = callback_metrics['val_loss']
+            callback_metrics['early_stop_on'] = callback_metrics['val_loss']
+            del callback_metrics['val_loss']
+
         return loss, progress_bar_metrics, log_metrics, callback_metrics, hiddens
 
     def reduce_distributed_output(self, output, num_gpus):
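
Unlike the logger-connector mapping, `process_dict_result` renames rather than copies: `'loss'`/`'val_loss'` are removed from the callback metrics after being mirrored onto the new keys. A hypothetical standalone sketch:

    def remap_callback_metrics(callback_metrics: dict) -> dict:
        # val_loss is processed second, so it wins if both keys are present
        for old_key in ('loss', 'val_loss'):
            if old_key in callback_metrics:
                callback_metrics['checkpoint_on'] = callback_metrics[old_key]
                callback_metrics['early_stop_on'] = callback_metrics[old_key]
                del callback_metrics[old_key]
        return callback_metrics

    assert remap_callback_metrics({'val_loss': 0.5}) == {'checkpoint_on': 0.5, 'early_stop_on': 0.5}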
