Lightning-AI · rohitgr7 · Feb 14, 2022 · Jan 27, 2022 · Jan 27, 2022 · Feb 3, 2022
@@ -92,9 +92,11 @@ def optimizer_step(
         if not isinstance(model, pl.LightningModule) or not model.automatic_optimization or not skipped_backward:
             optimizer.step(**kwargs)
 
-    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
-        if "amp_scaling_state" in checkpoint:
-            amp.load_state_dict(checkpoint["amp_scaling_state"])
+    def state_dict(self) -> Dict[str, Any]:
+        """Return the state reported by ``amp.state_dict()``."""
+        return amp.state_dict()
 
-    def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
-        checkpoint["amp_scaling_state"] = amp.state_dict()
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        """Pass ``state_dict`` to ``amp.load_state_dict``; an empty state is ignored."""
+        if state_dict:
+            amp.load_state_dict(state_dict)
@@ -106,10 +106,14 @@ def forward_context(self) -> Generator[None, None, None]:
         with self.autocast_context_manager():
             yield
 
-    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
-        if self.scaler is not None and "native_amp_scaling_state" in checkpoint:
-            self.scaler.load_state_dict(checkpoint["native_amp_scaling_state"])
-
-    def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
+    def state_dict(self) -> Dict[str, Any]:
+        """Return ``self.scaler``'s state, or an empty dict when no scaler is set."""
         if self.scaler is not None:
-            checkpoint["native_amp_scaling_state"] = self.scaler.state_dict()
+            return self.scaler.state_dict()
+        # Always return a dict so the annotated return type holds and ``None`` is never saved.
+        return {}
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        """Load ``state_dict`` into ``self.scaler``; a no-op without a scaler or with an empty state."""
+        if self.scaler is not None and state_dict:
+            self.scaler.load_state_dict(state_dict)
@@ -13,7 +13,7 @@
 # limitations under the License.
 import contextlib
 from functools import partial
-from typing import Any, Callable, Generator, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
 
 import torch
 from torch import Tensor
@@ -242,3 +242,11 @@ def teardown(self) -> None:
 
         It is the right place to release memory and free other resources.
         """
+
+    def state_dict(self) -> Dict[str, Any]:
+        """Return the plugin state to store in a checkpoint; the base plugin has none, so this is empty."""
+        return {}
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        """Restore plugin state from ``state_dict``; the base plugin has no state, so this does nothing."""
+        pass
@@ -192,7 +192,17 @@ def restore_training_state(self) -> None:
             return
 
         # restore precision plugin (scaler etc.)
-        self.trainer.precision_plugin.on_load_checkpoint(self._loaded_checkpoint)
+        prec_plugin = self.trainer.precision_plugin
+        prec_plugin.on_load_checkpoint(self._loaded_checkpoint)
+        if prec_plugin.__class__.__name__ in self._loaded_checkpoint:
+            prec_plugin.load_state_dict(self._loaded_checkpoint[prec_plugin.__class__.__name__])
+
+        # backward compatibility: older checkpoints store the plugin state under these legacy keys
+        # TODO: decide whether to raise an error and require utilities/upgrade_checkpoint instead
+        if "amp_scaling_state" in self._loaded_checkpoint:
+            prec_plugin.load_state_dict(self._loaded_checkpoint["amp_scaling_state"])
+        if "native_amp_scaling_state" in self._loaded_checkpoint:
+            prec_plugin.load_state_dict(self._loaded_checkpoint["native_amp_scaling_state"])
 
         # restore loops and their progress
         self.restore_loops()
@@ -372,7 +382,9 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
                 lr_schedulers.append(config.scheduler.state_dict())
             checkpoint["lr_schedulers"] = lr_schedulers
 
-            self.trainer.precision_plugin.on_save_checkpoint(checkpoint)
+            # precision plugin state, stored under the plugin's class name
+            prec_plugin = self.trainer.precision_plugin
+            checkpoint[prec_plugin.__class__.__name__] = prec_plugin.state_dict()
 
         # dump hyper-parameters
         if model.hparams:
@@ -389,6 +401,8 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
         model.on_save_checkpoint(checkpoint)
         if self.trainer.datamodule is not None:
             self.trainer.datamodule.on_save_checkpoint(checkpoint)
+        if not weights_only:
+            self.trainer.precision_plugin.on_save_checkpoint(checkpoint)
 
         # TODO: remove this in v1.8.
         environment = self.trainer._accelerator_connector.cluster_environment