From b9443a07b9f514df2f6a96bce5bbe9dc43e58953 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Tue, 24 Aug 2021 19:35:19 +0200
Subject: [PATCH] [2 / 3] improvements to saving and loading callback state (#7187)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec
---
 CHANGELOG.md                                  |  5 +-
 docs/source/extensions/callbacks.rst          | 59 ++++++++++++++++++-
 pytorch_lightning/callbacks/base.py           | 20 +++++--
 pytorch_lightning/callbacks/early_stopping.py | 11 ++++
 .../callbacks/model_checkpoint.py             | 17 ++++++
 pytorch_lightning/trainer/callback_hook.py    |  4 +-
 tests/callbacks/test_callbacks.py             |  4 +-
 tests/callbacks/test_early_stopping.py        |  8 ++-
 tests/callbacks/test_lambda_function.py       |  4 +-
 tests/checkpointing/test_model_checkpoint.py  | 34 +++++++++--
 .../connectors/test_callback_connector.py     | 41 ++++++++++---
 11 files changed, 180 insertions(+), 27 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d3ca05789fa6b..313a790ce5468 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added a flavor of `training_step` that takes `dataloader_iter` as an argument ([#8807](https://github.com/PyTorchLightning/pytorch-lightning/pull/8807))
 
-- Added `state_id` property to the `Callback` base class ([#6886](https://github.com/PyTorchLightning/pytorch-lightning/pull/6886))
+- Added `state_key` property to the `Callback` base class ([#6886](https://github.com/PyTorchLightning/pytorch-lightning/pull/6886))
 
 
 - Progress tracking
@@ -60,6 +60,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
   * Refactored CheckpointConnector to offload validating logic to the checkpoint IO plugin ([#9045](https://github.com/PyTorchLightning/pytorch-lightning/pull/9045))
 
 
+- Added support for saving and loading state of multiple callbacks of the same type ([#7187](https://github.com/PyTorchLightning/pytorch-lightning/pull/7187))
+
+
 - Added DeepSpeed Stage 1 support ([#8974](https://github.com/PyTorchLightning/pytorch-lightning/pull/8974))
 
 
diff --git a/docs/source/extensions/callbacks.rst b/docs/source/extensions/callbacks.rst
index 2c9ee612ceb22..b007fd479b0d0 100644
--- a/docs/source/extensions/callbacks.rst
+++ b/docs/source/extensions/callbacks.rst
@@ -113,16 +113,69 @@ Lightning has a few built-in callbacks.
 
 ----------
 
+.. _Persisting Callback State:
+
 Persisting State
 ----------------
 
 Some callbacks require internal state in order to function properly. You can optionally
 choose to persist your callback's state as part of model checkpoint files using the callback hooks
 :meth:`~pytorch_lightning.callbacks.Callback.on_save_checkpoint` and :meth:`~pytorch_lightning.callbacks.Callback.on_load_checkpoint`.
-However, you must follow two constraints:
+Note that the returned state must be able to be pickled.
+
+When your callback is meant to be used only as a singleton callback then implementing the above two hooks is enough
+to persist state effectively. However, if passing multiple instances of the callback to the Trainer is supported, then
+the callback must define a :attr:`~pytorch_lightning.callbacks.Callback.state_key` property in order for Lightning
+to be able to distinguish the different states when loading the callback state. This concept is best illustrated by
+the following example.
+
+.. 
testcode:: + + class Counter(Callback): + def __init__(self, what="epochs", verbose=True): + self.what = what + self.verbose = verbose + self.state = {"epochs": 0, "batches": 0} + + @property + def state_key(self): + # note: we do not include `verbose` here on purpose + return self._generate_state_key(what=self.what) + + def on_train_epoch_end(self, *args, **kwargs): + if self.what == "epochs": + self.state["epochs"] += 1 + + def on_train_batch_end(self, *args, **kwargs): + if self.what == "batches": + self.state["batches"] += 1 + + def on_load_checkpoint(self, trainer, pl_module, callback_state): + self.state.update(callback_state) + + def on_save_checkpoint(self, trainer, pl_module, checkpoint): + return self.state.copy() + + + # two callbacks of the same type are being used + trainer = Trainer(callbacks=[Counter(what="epochs"), Counter(what="batches")]) + +A Lightning checkpoint from this Trainer with the two stateful callbacks will include the following information: + +.. code-block:: + + { + "state_dict": ..., + "callbacks": { + "Counter{'what': 'batches'}": {"batches": 32, "epochs": 0}, + "Counter{'what': 'epochs'}": {"batches": 0, "epochs": 2}, + ... + } + } -1. Your returned state must be able to be pickled. -2. You can only use one instance of that class in the Trainer callbacks list. We don't support persisting state for multiple callbacks of the same class. +The implementation of a :attr:`~pytorch_lightning.callbacks.Callback.state_key` is essential here. If it were missing, +Lightning would not be able to disambiguate the state for these two callbacks, and :attr:`~pytorch_lightning.callbacks.Callback.state_key` +by default only defines the class name as the key, e.g., here ``Counter``. Best Practices diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index c38613b0e3159..fdb22a44ed307 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -34,20 +34,30 @@ class Callback(abc.ABC): """ @property - def state_id(self) -> str: + def state_key(self) -> str: """ Identifier for the state of the callback. Used to store and retrieve a callback's state from the - checkpoint dictionary by ``checkpoint["callbacks"][state_id]``. Implementations of a callback need to - provide a unique state id if 1) the callback has state and 2) it is desired to maintain the state of + checkpoint dictionary by ``checkpoint["callbacks"][state_key]``. Implementations of a callback need to + provide a unique state key if 1) the callback has state and 2) it is desired to maintain the state of multiple instances of that callback. """ return self.__class__.__qualname__ @property - def _legacy_state_id(self) -> Type["Callback"]: - """State identifier for checkpoints saved prior to version 1.5.0.""" + def _legacy_state_key(self) -> Type["Callback"]: + """State key for checkpoints saved prior to version 1.5.0.""" return type(self) + def _generate_state_key(self, **kwargs: Any) -> str: + """ + Formats a set of key-value pairs into a state key string with the callback class name prefixed. + Useful for defining a :attr:`state_key`. + + Args: + **kwargs: A set of key-value pairs. Must be serializable to :class:`str`. 
+ """ + return f"{self.__class__.__qualname__}{repr(kwargs)}" + def on_configure_sharded_model(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: """Called before configure sharded model""" diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index ad7beec6927d7..77683ad2819f3 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -75,6 +75,13 @@ class EarlyStopping(Callback): >>> from pytorch_lightning.callbacks import EarlyStopping >>> early_stopping = EarlyStopping('val_loss') >>> trainer = Trainer(callbacks=[early_stopping]) + + .. tip:: Saving and restoring multiple early stopping callbacks at the same time is supported under variation in the + following arguments: + + *monitor, mode* + + Read more: :ref:`Persisting Callback State` """ mode_dict = {"min": torch.lt, "max": torch.gt} @@ -120,6 +127,10 @@ def __init__( ) self.monitor = monitor or "early_stop_on" + @property + def state_key(self) -> str: + return self._generate_state_key(monitor=self.monitor, mode=self.mode) + def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: if self._check_on_train_epoch_end is None: # if the user runs validation multiple times per training epoch, we try to check after diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 1144af7e32e9f..414a92af6a66c 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -194,6 +194,12 @@ class ModelCheckpoint(Callback): trainer.fit(model) checkpoint_callback.best_model_path + .. tip:: Saving and restoring multiple checkpoint callbacks at the same time is supported under variation in the + following arguments: + + *monitor, mode, every_n_train_steps, every_n_epochs, train_time_interval, save_on_train_epoch_end* + + Read more: :ref:`Persisting Callback State` """ CHECKPOINT_JOIN_CHAR = "-" @@ -248,6 +254,17 @@ def __init__( self.__init_triggers(every_n_train_steps, every_n_epochs, train_time_interval, period) self.__validate_init_configuration() + @property + def state_key(self) -> str: + return self._generate_state_key( + monitor=self.monitor, + mode=self.mode, + every_n_train_steps=self._every_n_train_steps, + every_n_epochs=self._every_n_epochs, + train_time_interval=self._train_time_interval, + save_on_train_epoch_end=self._save_on_train_epoch_end, + ) + def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: """ When pretrain routine starts we build the ckpt dir on the fly diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 2261538d7cbcc..36a3e9abb7b7a 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -242,7 +242,7 @@ def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> Dict[str, dict]: for callback in self.callbacks: state = callback.on_save_checkpoint(self, self.lightning_module, checkpoint) if state: - callback_states[callback.state_id] = state + callback_states[callback.state_key] = state return callback_states def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: @@ -267,7 +267,7 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: ) for callback in self.callbacks: - state = callback_states.get(callback.state_id, callback_states.get(callback._legacy_state_id)) + state 
= callback_states.get(callback.state_key, callback_states.get(callback._legacy_state_key)) if state: state = deepcopy(state) callback.on_load_checkpoint(self, self.lightning_module, state) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index d190feed7e1f7..dc191f4853cc1 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -109,7 +109,7 @@ def __init__(self, state): self.state = state @property - def state_id(self): + def state_key(self): return type(self) def on_save_checkpoint(self, *args): @@ -120,7 +120,7 @@ def on_load_checkpoint(self, trainer, pl_module, callback_state): def test_resume_callback_state_saved_by_type(tmpdir): - """Test that a legacy checkpoint that didn't use a state identifier before can still be loaded.""" + """Test that a legacy checkpoint that didn't use a state key before can still be loaded.""" model = BoringModel() callback = OldStatefulCallback(state=111) trainer = Trainer(default_root_dir=tmpdir, max_steps=1, callbacks=[callback]) diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 4c3b990dd1b13..ad343cdf329f5 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -33,6 +33,11 @@ _logger = logging.getLogger(__name__) +def test_early_stopping_state_key(): + early_stopping = EarlyStopping(monitor="val_loss") + assert early_stopping.state_key == "EarlyStopping{'monitor': 'val_loss', 'mode': 'min'}" + + class EarlyStoppingTestRestore(EarlyStopping): # this class has to be defined outside the test function, otherwise we get pickle error def __init__(self, expected_state, *args, **kwargs): @@ -77,7 +82,8 @@ def test_resume_early_stopping_from_checkpoint(tmpdir): # the checkpoint saves "epoch + 1" early_stop_callback_state = early_stop_callback.saved_states[checkpoint["epoch"] - 1] assert 4 == len(early_stop_callback.saved_states) - assert checkpoint["callbacks"]["EarlyStoppingTestRestore"] == early_stop_callback_state + es_name = "EarlyStoppingTestRestore{'monitor': 'train_loss', 'mode': 'min'}" + assert checkpoint["callbacks"][es_name] == early_stop_callback_state # ensure state is reloaded properly (assertion in the callback) early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state, monitor="train_loss") diff --git a/tests/callbacks/test_lambda_function.py b/tests/callbacks/test_lambda_function.py index 82f64d676c774..88752d56bf697 100644 --- a/tests/callbacks/test_lambda_function.py +++ b/tests/callbacks/test_lambda_function.py @@ -11,12 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import inspect from functools import partial from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import Callback, LambdaCallback from tests.helpers.boring_model import BoringModel +from tests.models.test_hooks import get_members def test_lambda_call(tmpdir): @@ -32,7 +32,7 @@ def on_train_epoch_start(self): def call(hook, *_, **__): checker.add(hook) - hooks = {m for m, _ in inspect.getmembers(Callback, predicate=inspect.isfunction)} + hooks = get_members(Callback) hooks_args = {h: partial(call, h) for h in hooks} hooks_args["on_save_checkpoint"] = lambda *_: [checker.add("on_save_checkpoint")] diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 314ed899c588a..f49fa16598fd2 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -43,6 +43,15 @@ from tests.helpers.runif import RunIf +def test_model_checkpoint_state_key(): + early_stopping = ModelCheckpoint(monitor="val_loss") + expected_id = ( + "ModelCheckpoint{'monitor': 'val_loss', 'mode': 'min', 'every_n_train_steps': 0, 'every_n_epochs': 1," + " 'train_time_interval': None, 'save_on_train_epoch_end': None}" + ) + assert early_stopping.state_key == expected_id + + class LogInTwoMethods(BoringModel): def training_step(self, batch, batch_idx): out = super().training_step(batch, batch_idx) @@ -148,7 +157,10 @@ def on_validation_epoch_end(self): assert chk["epoch"] == epoch + 1 assert chk["global_step"] == limit_train_batches * (epoch + 1) - mc_specific_data = chk["callbacks"]["ModelCheckpoint"] + mc_specific_data = chk["callbacks"][ + f"ModelCheckpoint{{'monitor': '{monitor}', 'mode': 'min', 'every_n_train_steps': 0, 'every_n_epochs': 1," + " 'train_time_interval': None, 'save_on_train_epoch_end': True}" + ] assert mc_specific_data["dirpath"] == checkpoint.dirpath assert mc_specific_data["monitor"] == monitor assert mc_specific_data["current_score"] == score @@ -259,7 +271,10 @@ def _make_assertions(epoch, ix, version=""): expected_global_step = per_val_train_batches * (global_ix + 1) + (leftover_train_batches * epoch_num) assert chk["global_step"] == expected_global_step - mc_specific_data = chk["callbacks"]["ModelCheckpoint"] + mc_specific_data = chk["callbacks"][ + f"ModelCheckpoint{{'monitor': '{monitor}', 'mode': 'min', 'every_n_train_steps': 0, 'every_n_epochs': 1," + " 'train_time_interval': None, 'save_on_train_epoch_end': False}" + ] assert mc_specific_data["dirpath"] == checkpoint.dirpath assert mc_specific_data["monitor"] == monitor assert mc_specific_data["current_score"] == score @@ -857,7 +872,12 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): assert ckpt_last_epoch["epoch"] == ckpt_last["epoch"] assert ckpt_last_epoch["global_step"] == ckpt_last["global_step"] - assert ckpt_last["callbacks"]["ModelCheckpoint"] == ckpt_last_epoch["callbacks"]["ModelCheckpoint"] + + ckpt_id = ( + "ModelCheckpoint{'monitor': 'early_stop_on', 'mode': 'min', 'every_n_train_steps': 0, 'every_n_epochs': 1," + " 'train_time_interval': None, 'save_on_train_epoch_end': True}" + ) + assert ckpt_last["callbacks"][ckpt_id] == ckpt_last_epoch["callbacks"][ckpt_id] # it is easier to load the model objects than to iterate over the raw dict of tensors model_last_epoch = LogInTwoMethods.load_from_checkpoint(path_last_epoch) @@ -1095,7 +1115,13 @@ def training_step(self, *args): trainer.fit(TestModel()) assert model_checkpoint.current_score == 0.3 ckpts = [torch.load(str(ckpt)) for ckpt in 
tmpdir.listdir()]
-    ckpts = [ckpt["callbacks"]["ModelCheckpoint"] for ckpt in ckpts]
+    ckpts = [
+        ckpt["callbacks"][
+            "ModelCheckpoint{'monitor': 'foo', 'mode': 'min', 'every_n_train_steps': 0, 'every_n_epochs': 1,"
+            " 'train_time_interval': None, 'save_on_train_epoch_end': True}"
+        ]
+        for ckpt in ckpts
+    ]
     assert sorted(ckpt["current_score"] for ckpt in ckpts) == [0.1, 0.2, 0.3]
 
 
diff --git a/tests/trainer/connectors/test_callback_connector.py b/tests/trainer/connectors/test_callback_connector.py
index 43158865f9e75..455e08dc10ad5 100644
--- a/tests/trainer/connectors/test_callback_connector.py
+++ b/tests/trainer/connectors/test_callback_connector.py
@@ -64,28 +64,55 @@ def on_save_checkpoint(self, *args):
 
 
 class StatefulCallback1(Callback):
+    def __init__(self, unique=None, other=None):
+        self._unique = unique
+        self._other = other
+
+    @property
+    def state_key(self):
+        return self._generate_state_key(unique=self._unique)
+
     def on_save_checkpoint(self, *args):
-        return {"content1": 1}
+        return {"content1": self._unique}
 
 
 def test_all_callback_states_saved_before_checkpoint_callback(tmpdir):
-    """Test that all callback states get saved even if the ModelCheckpoint is not given as last."""
+    """
+    Test that all callback states get saved even if the ModelCheckpoint is not given as last
+    and when there are multiple callbacks of the same type.
+    """
     callback0 = StatefulCallback0()
-    callback1 = StatefulCallback1()
+    callback1 = StatefulCallback1(unique="one")
+    callback2 = StatefulCallback1(unique="two", other=2)
     checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, filename="all_states")
     model = BoringModel()
     trainer = Trainer(
-        default_root_dir=tmpdir, max_steps=1, limit_val_batches=1, callbacks=[callback0, checkpoint_callback, callback1]
+        default_root_dir=tmpdir,
+        max_steps=1,
+        limit_val_batches=1,
+        callbacks=[
+            callback0,
+            # checkpoint callback does not have to be at the end
+            checkpoint_callback,
+            # callback1 and callback2 have the same type
+            callback1,
+            callback2,
+        ],
     )
     trainer.fit(model)
 
     ckpt = torch.load(str(tmpdir / "all_states.ckpt"))
     state0 = ckpt["callbacks"]["StatefulCallback0"]
-    state1 = ckpt["callbacks"]["StatefulCallback1"]
+    state1 = ckpt["callbacks"]["StatefulCallback1{'unique': 'one'}"]
+    state2 = ckpt["callbacks"]["StatefulCallback1{'unique': 'two'}"]
     assert "content0" in state0 and state0["content0"] == 0
-    assert "content1" in state1 and state1["content1"] == 1
-    assert "ModelCheckpoint" in ckpt["callbacks"]
+    assert "content1" in state1 and state1["content1"] == "one"
+    assert "content1" in state2 and state2["content1"] == "two"
+    assert (
+        "ModelCheckpoint{'monitor': None, 'mode': 'min', 'every_n_train_steps': 0, 'every_n_epochs': 1,"
+        " 'train_time_interval': None, 'save_on_train_epoch_end': True}" in ckpt["callbacks"]
+    )
 
 
 def test_attach_model_callbacks():
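
For reference, a minimal sketch (assuming a Lightning build that includes this change) of how the new
``state_key`` property distinguishes multiple instances of the same callback class; the variable names below
are illustrative only, and the expected keys follow the unit tests above:

.. code-block:: python

    from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

    # The arguments that are allowed to vary between instances become part of the state key,
    # so each instance maps to its own entry under checkpoint["callbacks"].
    es_train = EarlyStopping(monitor="train_loss")
    es_val = EarlyStopping(monitor="val_loss", mode="max")
    ckpt_cb = ModelCheckpoint(monitor="val_loss")

    print(es_train.state_key)  # EarlyStopping{'monitor': 'train_loss', 'mode': 'min'}
    print(es_val.state_key)    # EarlyStopping{'monitor': 'val_loss', 'mode': 'max'}
    print(ckpt_cb.state_key)   # ModelCheckpoint{'monitor': 'val_loss', 'mode': 'min', ...}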