Finish Allow on_save_checkpoint... #3688

Merged: 40 commits, Sep 30, 2020
Changes from 28 commits
Commits (40)
bd08451
Finish #3562
williamFalcon Sep 28, 2020
211d82f
Apply suggestions from code review
Borda Sep 28, 2020
4ad52c2
Apply suggestions from code review
Borda Sep 28, 2020
742a46c
fix tests
williamFalcon Sep 28, 2020
49ce472
Finish #3562
williamFalcon Sep 28, 2020
cce9e16
Apply suggestions from code review
Borda Sep 28, 2020
d6b5e13
Apply suggestions from code review
Borda Sep 28, 2020
4a16581
fix tests
williamFalcon Sep 28, 2020
9d78a1f
fix structure
williamFalcon Sep 28, 2020
b9ab13f
fix structure
williamFalcon Sep 28, 2020
a957244
Merge branch 'r3n' of https://github.com/PyTorchLightning/pytorch-lig…
williamFalcon Sep 28, 2020
33eb5b8
Merge branch 'master' into r3n
Sep 29, 2020
6a7a7c0
make save_last test pass
awaelchli Sep 29, 2020
b8192fb
unnecessary global rank check
awaelchli Sep 29, 2020
4eea9de
fix test
awaelchli Sep 30, 2020
d792143
Merge remote-tracking branch 'PyTorchLightning/r3n' into r3n
awaelchli Sep 30, 2020
cbdcde9
update test
awaelchli Sep 30, 2020
8068b93
update test
awaelchli Sep 30, 2020
32f0754
test
awaelchli Sep 30, 2020
075f32b
test
awaelchli Sep 30, 2020
648f47a
run save on all
awaelchli Sep 30, 2020
ad643a8
remove assert
awaelchli Sep 30, 2020
b8fc751
tracking saves
awaelchli Sep 30, 2020
5cd5edd
check if fails
awaelchli Sep 30, 2020
26c8312
test
awaelchli Sep 30, 2020
84070cd
clean up
awaelchli Sep 30, 2020
0b268f1
adjust horovod test
awaelchli Sep 30, 2020
219aa6e
clean up
awaelchli Sep 30, 2020
a2f2fe5
remove unnecessary makdirs
awaelchli Sep 30, 2020
9ba223c
Merge branch 'master' into r3n
awaelchli Sep 30, 2020
48ecd41
change
awaelchli Sep 30, 2020
dfbcff3
undo
awaelchli Sep 30, 2020
d133adf
debug
awaelchli Sep 30, 2020
ad3dcfc
debug
awaelchli Sep 30, 2020
02523ab
debug
awaelchli Sep 30, 2020
dc9ab72
debug
awaelchli Sep 30, 2020
cc30c2a
mock
awaelchli Sep 30, 2020
22895a1
undo debug code
awaelchli Sep 30, 2020
9cdaaf8
add extra assertions
awaelchli Sep 30, 2020
8c3f19b
test
awaelchli Sep 30, 2020
51 changes: 28 additions & 23 deletions pytorch_lightning/callbacks/model_checkpoint.py
@@ -31,7 +31,7 @@
import torch
from pytorch_lightning import _logger as log
from pytorch_lightning.callbacks.base import Callback
from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn
from pytorch_lightning.utilities import rank_zero_only, rank_zero_warn, rank_zero_info
from pytorch_lightning.utilities.cloud_io import get_filesystem
from pytorch_lightning.utilities.exceptions import MisconfigurationException

@@ -176,16 +176,16 @@ def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]):
self.best_model_score = checkpointed_state["best_model_score"]
self.best_model_path = checkpointed_state["best_model_path"]

@rank_zero_only
def save_checkpoint(self, trainer, pl_module):
"""
Performs the main logic around saving a checkpoint
Performs the main logic around saving a checkpoint.
This method runs on all ranks, it is the responsibility of `self.save_function`
to handle correct behaviour in distributed training, i.e., saving only on rank 0.
"""
epoch = trainer.current_epoch

if (
trainer.global_rank != 0 # only run on main process
or self.save_top_k == 0 # no models are saved
self.save_top_k == 0 # no models are saved
or self.period < 1 # no models are saved
or (epoch + 1) % self.period # skip epoch
or trainer.running_sanity_check # don't save anything during sanity check
Expand All @@ -207,16 +207,16 @@ def save_checkpoint(self, trainer, pl_module):

# callback supports multiple simultaneous modes
# here we call each mode sequentially
# Mode 1: save the last checkpoint
self._save_last_checkpoint(trainer, pl_module, epoch, monitor_candidates, filepath)

# Mode 2: save all checkpoints OR only the top k
# Mode 1: save all checkpoints OR only the top k
if self.save_top_k:
if self.save_top_k == -1:
self._save_all_checkpoints(trainer, pl_module, epoch, filepath)
else:
self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, epoch, filepath)
@ananthsub (Contributor), Sep 30, 2020:

This logic already feels too complex. I hope solving #2586 will force us to consolidate some of this.

Contributor:

Ananth, this is exactly what I was thinking. Saving all models is really a special case of save_top_k and should be handled there. It becomes obvious when we look at the fix here: #3735
Would appreciate your feedback there too!

Contributor (Author):

Yeah... I think we need to come back to this ModelCheckpoint after 1.0 with a nice, clean re-write.
It's gotten super messy now, haha.
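
To make the point above concrete, here is a minimal, hypothetical sketch (not the Lightning implementation): "save all checkpoints" is simply the k == -1 branch of a generic top-k decision.

# Hypothetical sketch: "save every checkpoint" falls out of a top-k policy when k == -1.

def should_save(score: float, kept_scores: list, k: int) -> bool:
    """Decide whether a checkpoint with `score` is kept under a top-k policy (lower is better)."""
    if k == -1:   # save-all case
        return True
    if k == 0:    # save nothing
        return False
    return len(kept_scores) < k or score < max(kept_scores)


print(should_save(0.5, [0.2, 0.3], k=-1))  # True: save-all
print(should_save(0.5, [0.2, 0.3], k=2))   # False: not among the top 2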


# Mode 2: save the last checkpoint
self._save_last_checkpoint(trainer, pl_module, epoch, monitor_candidates, filepath)
Contributor:

I had to switch the order here. We need to run save_last after top-k, because the top-k code tracks the best model path.
Otherwise last.ckpt will not point to the correct "best" model.
The new test below checks that.
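
A minimal, hypothetical sketch of that ordering dependency (simplified names, not the callback code in this PR): the top-k step updates best_model_path, and the state captured for last.ckpt references that path, so save_last has to run second.

class ToyCheckpointCallback:
    """Toy stand-in that only tracks which file is currently 'best'."""

    def __init__(self):
        self.best_model_path = ""

    def save_top_k(self, filepath: str):
        # simplified: pretend every new checkpoint becomes the best one
        self.best_model_path = filepath

    def save_last_state(self) -> dict:
        # the callback state written into last.ckpt includes best_model_path
        return {"best_model_path": self.best_model_path}


cb = ToyCheckpointCallback()

# wrong order: last.ckpt would capture a stale (empty) best_model_path
stale = cb.save_last_state()
cb.save_top_k("epoch=2.ckpt")

# right order: run top-k first, then save_last sees the updated path
fresh = cb.save_last_state()
assert stale["best_model_path"] == ""
assert fresh["best_model_path"] == "epoch=2.ckpt"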


def __validate_init_configuration(self):
if self.save_top_k is not None and self.save_top_k < -1:
raise MisconfigurationException(
Contributor:

In __init_ckpt_dir, you could have L261 gated by trainer.is_global_zero to avoid unnecessary file I/O outside rank 0.

Contributor:

True, this one we missed. But we don't have access to the trainer, nor the global rank, at the point when init happens.
I'm wondering if we even need to call makedirs at this point. We could delay it, no?

Contributor:

Yes, it could be delayed to on pretrain routine start.
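
A rough sketch of that idea, with hypothetical names (this is not the code in this PR): keep __init__ free of file I/O and create the directory later, in a hook where the trainer, and therefore the global rank, is available.

import os
import tempfile


class DelayedDirCheckpoint:
    """Toy callback that postpones directory creation until a trainer hook runs."""

    def __init__(self, dirpath: str):
        # only remember the path here; no filesystem access during __init__
        self.dirpath = dirpath

    def on_pretrain_routine_start(self, trainer, pl_module):
        # the trainer exists now, so the I/O can be gated on the global rank
        if trainer.is_global_zero:
            os.makedirs(self.dirpath, exist_ok=True)


# quick check with a stand-in trainer object
class _FakeTrainer:
    is_global_zero = True


demo_dir = os.path.join(tempfile.mkdtemp(), "checkpoints")
DelayedDirCheckpoint(demo_dir).on_pretrain_routine_start(_FakeTrainer(), pl_module=None)
assert os.path.isdir(demo_dir)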

@@ -279,18 +279,21 @@ def __init_monitor_mode(self, monitor, mode):

self.kth_value, self.mode = mode_dict[mode]

@rank_zero_only
def _del_model(self, filepath: str):
if self._fs.exists(filepath):
self._fs.rm(filepath)
log.debug(f"Removed checkpoint: {filepath}")

def _save_model(self, filepath: str, trainer, pl_module):
# in debugging, track when we save checkpoints
trainer.dev_debugger.track_checkpointing_history(filepath)

# make paths
self._fs.makedirs(os.path.dirname(filepath), exist_ok=True)
if trainer.is_global_zero:
self._fs.makedirs(os.path.dirname(filepath), exist_ok=True)

# delegate the saving to the model
# delegate the saving to the trainer
if self.save_function is not None:
self.save_function(filepath, self.save_weights_only)
else:
@@ -325,7 +328,7 @@ def _format_checkpoint_name(
filename = "{epoch}"
# check and parse user passed keys in the string
groups = re.findall(r"(\{.*?)[:\}]", filename)
if groups:
if len(groups) >= 0:
metrics["epoch"] = epoch
for group in groups:
name = group[1:]
@@ -404,10 +407,8 @@ def __resolve_ckpt_dir(self, trainer, pl_module):

self.dirpath = ckpt_path

assert (
trainer.global_rank == 0
), "tried to make a checkpoint from non global_rank=0"
self._fs.makedirs(self.dirpath, exist_ok=True)
if trainer.is_global_zero:
self._fs.makedirs(self.dirpath, exist_ok=True)

def _add_backward_monitor_support(self, trainer):
metrics = trainer.logger_connector.callback_metrics
@@ -460,13 +461,18 @@ def _save_last_checkpoint(self, trainer, pl_module, epoch, ckpt_name_metrics, fi

# when user ALSO asked for the 'last.ckpt' change the name
if self.save_last:
filename = self._format_checkpoint_name(
last_filepath = self._format_checkpoint_name(
self.CHECKPOINT_NAME_LAST, epoch, ckpt_name_metrics, prefix=self.prefix
)
last_filepath = os.path.join(self.dirpath, f"{filename}.ckpt")
last_filepath = os.path.join(self.dirpath, f"{last_filepath}.ckpt")

self._save_model(last_filepath, trainer, pl_module)
if self.last_model_path and self.last_model_path != last_filepath and (self.save_top_k != -1 or self.save_last):
if (
self.last_model_path
and self.last_model_path != last_filepath
and (self.save_top_k != -1 or self.save_last)
and trainer.is_global_zero
):
self._del_model(self.last_model_path)
self.last_model_path = last_filepath

@@ -491,15 +497,14 @@ def _save_top_k_checkpoints(self, metrics, trainer, pl_module, epoch, filepath):
elif self.check_monitor_top_k(current):
self._do_check_save(filepath, current, epoch, trainer, pl_module)
elif self.verbose:
log.info(
rank_zero_info(
f"Epoch {epoch:d}: {self.monitor} was not in top {self.save_top_k}"
)

def _save_all_checkpoints(self, trainer, pl_module, epoch, filepath):
if self.verbose:
log.info(f"Epoch {epoch:d}: saving model to {filepath}")
rank_zero_info(f"Epoch {epoch:d}: saving model to {filepath}")

assert (trainer.global_rank == 0), "tried to make a checkpoint from non global_rank=0"
self._save_model(filepath, trainer, pl_module)

def _is_valid_monitor_key(self, metrics):
@@ -535,7 +540,7 @@ def _do_check_save(
self.best_model_score = self.best_k_models[self.best_model_path]

if self.verbose:
log.info(
rank_zero_info(
f"Epoch {epoch:d}: {self.monitor} reached"
f" {current:0.5f} (best {self.best_model_score:0.5f}),"
f" saving model to {filepath} as top {self.save_top_k}"
92 changes: 54 additions & 38 deletions tests/callbacks/test_model_checkpoint.py
@@ -1,4 +1,6 @@
import os
from unittest.mock import MagicMock

import yaml
import pickle
import platform
@@ -92,21 +94,24 @@ class ModelCheckpointTestInvocations(ModelCheckpoint):

def __init__(self, expected_count, *args, **kwargs):
super().__init__(*args, **kwargs)
self.count = 0
self.expected_count = expected_count
self.on_save_checkpoint_count = 0

def on_train_start(self, trainer, pl_module):
torch.save = MagicMock()

def _save_model(self, filepath, trainer, pl_module):
# make sure we don't save twice
assert not os.path.isfile(filepath)
self.count += 1
super()._save_model(filepath, trainer, pl_module)
def on_save_checkpoint(self, trainer, pl_module):
# expect all ranks to run but only rank 0 will actually write the checkpoint file
super().on_save_checkpoint(trainer, pl_module)
self.on_save_checkpoint_count += 1

def on_train_end(self, trainer, pl_module):
super().on_train_end(trainer, pl_module)
# on rank 0 we expect the saved files and on all others no saves
assert (trainer.global_rank == 0 and self.count == self.expected_count) or (
trainer.global_rank > 0 and self.count == 0
)
assert self.on_save_checkpoint_count == self.expected_count
if trainer.is_global_zero:
assert torch.save.call_count == self.expected_count
else:
assert torch.save.call_count == 0


@pytest.mark.skipif(
@@ -220,34 +225,6 @@ def test_none_monitor_save_last(tmpdir):
ModelCheckpoint(filepath=tmpdir, save_last=False)


def test_model_checkpoint_save_last_checkpoint_contents(tmpdir):
@awaelchli (Contributor), Sep 30, 2020:

This test is 1:1 the same as one a few lines above; I removed it.

"""Tests that the save_last checkpoint contains the latest information."""
seed_everything(100)
model = EvalModelTemplate()
num_epochs = 3
model_checkpoint = ModelCheckpoint(monitor='val_loss', filepath=tmpdir, save_top_k=num_epochs, save_last=True)
trainer = Trainer(
default_root_dir=tmpdir,
early_stop_callback=False,
checkpoint_callback=model_checkpoint,
max_epochs=num_epochs,
)
trainer.fit(model)

path_last_epoch = model_checkpoint.format_checkpoint_name(num_epochs - 1, {})
assert path_last_epoch != model_checkpoint.last_model_path

ckpt_last_epoch = torch.load(path_last_epoch)
ckpt_last = torch.load(model_checkpoint.last_model_path)
assert all(ckpt_last_epoch[k] == ckpt_last[k] for k in ("epoch", "global_step"))

# it is easier to load the model objects than to iterate over the raw dict of tensors
model_last_epoch = EvalModelTemplate.load_from_checkpoint(path_last_epoch)
model_last = EvalModelTemplate.load_from_checkpoint(model_checkpoint.last_model_path)
for w0, w1 in zip(model_last_epoch.parameters(), model_last.parameters()):
assert w0.eq(w1).all()


def test_model_checkpoint_none_monitor(tmpdir):
""" Test that it is possible to save all checkpoints when monitor=None. """
model = EvalModelTemplate()
@@ -419,3 +396,42 @@ def test_model_checkpoint_save_last_warning(tmpdir, caplog, max_epochs, should_v
)
trainer.fit(model)
assert caplog.messages.count('Saving latest checkpoint...') == save_last


def test_model_checkpoint_save_last_checkpoint_contents(tmpdir):
""" Tests that the save_last checkpoint contains the latest information. """
seed_everything(100)
model = EvalModelTemplate()
num_epochs = 3
model_checkpoint = ModelCheckpoint(
monitor='val_loss', filepath=tmpdir, save_top_k=num_epochs, save_last=True
)
trainer = Trainer(
default_root_dir=tmpdir,
early_stop_callback=False,
checkpoint_callback=model_checkpoint,
max_epochs=num_epochs,
)
trainer.fit(model)

path_last_epoch = str(tmpdir / f"epoch={num_epochs - 1}.ckpt")
path_last = str(tmpdir / f"last.ckpt")
assert path_last == model_checkpoint.last_model_path

ckpt_last_epoch = torch.load(path_last_epoch)
ckpt_last = torch.load(path_last)
assert all(ckpt_last_epoch[k] == ckpt_last[k] for k in ("epoch", "global_step"))

ch_type = type(model_checkpoint)
assert all(list(
ckpt_last["callbacks"][ch_type][k] == ckpt_last_epoch["callbacks"][ch_type][k]
for k in ("best_model_score", "best_model_path")
))

# it is easier to load the model objects than to iterate over the raw dict of tensors
model_last_epoch = EvalModelTemplate.load_from_checkpoint(path_last_epoch)
model_last = EvalModelTemplate.load_from_checkpoint(
model_checkpoint.last_model_path
)
for w0, w1 in zip(model_last_epoch.parameters(), model_last.parameters()):
assert w0.eq(w1).all()
3 changes: 0 additions & 3 deletions tests/models/data/horovod/train_default_model.py
@@ -66,9 +66,6 @@ def run_test_from_config(trainer_options):
assert hvd.size() == 2

if trainer.global_rank > 0:
# on higher ranks the checkpoint location is unknown
# we want to test checkpointing on rank 0 only
assert not trainer.checkpoint_callback.best_model_path
return

# test model loading