Use xm.save to save model on TPU #3044

Status: Closed (not merged). Wants to merge 44 commits.

Commits (changes from all commits)
7bdc8ae support xm save (lezwon, Aug 15, 2020)
131ec3c or to and (lezwon, Aug 15, 2020)
df49e5c removed rank_zero_only from on_train_start (lezwon, Aug 15, 2020)
43d55b6 delete file if exists (lezwon, Aug 15, 2020)
1313940 added rank_zero_only for _del_model (lezwon, Aug 15, 2020)
6d2eb42 load state_dict (lezwon, Aug 15, 2020)
1ee1b05 access state_dict from checkpoint (lezwon, Aug 15, 2020)
f62c9e5 bytesbuffer to filepath (lezwon, Aug 15, 2020)
5c696b9 added checkpoint_callback check (lezwon, Aug 15, 2020)
82f32b7 print function name (lezwon, Aug 15, 2020)
3449f93 return after saving xla tensor (lezwon, Aug 15, 2020)
9568af3 log global rank (lezwon, Aug 15, 2020)
52e4b7c removed trainer from check_monitor_top_k (lezwon, Aug 15, 2020)
702f0b9 print current device (lezwon, Aug 15, 2020)
8746488 rank_zero_only for _do_check_save (lezwon, Aug 15, 2020)
a123088 log in atomic save (lezwon, Aug 15, 2020)
7a0b6e5 log in _do_check_save (lezwon, Aug 15, 2020)
13dcfa1 log in check_monitor_top_k (lezwon, Aug 15, 2020)
189cc3b more logs inside check_monitor_top_k (lezwon, Aug 15, 2020)
ca1da3a remove rank zero only form _do_check_save (lezwon, Aug 15, 2020)
aaecd95 added is_global_zero in _do_check_save (lezwon, Aug 15, 2020)
f302d24 fix del_list issue (lezwon, Aug 15, 2020)
04ce20e added global condition check (lezwon, Aug 17, 2020)
2032128 added pdb (lezwon, Aug 17, 2020)
0d1611e removed pdb (lezwon, Aug 17, 2020)
683a1b7 log best_k_models (lezwon, Aug 17, 2020)
d81e61a more changes (lezwon, Aug 17, 2020)
99ee490 log after save (lezwon, Aug 17, 2020)
dfdcd76 fix error (lezwon, Aug 17, 2020)
ca303bc fix error again (lezwon, Aug 17, 2020)
de4a117 fix error again and again (lezwon, Aug 17, 2020)
31e867d remove device call (lezwon, Aug 17, 2020)
15eb2b8 ayee add this too. done saving part (lezwon, Aug 17, 2020)
8784d59 remove state dict (lezwon, Aug 17, 2020)
79882f0 remove inspect statements (lezwon, Aug 17, 2020)
b20855b print stack trace if not is_xla_tensor (lezwon, Aug 17, 2020)
a4f2c6b remove inspect reference (lezwon, Aug 17, 2020)
8f41335 Revert "print stack trace if not is_xla_tensor" (lezwon, Aug 19, 2020)
9dbfdbf removed state dict (lezwon, Aug 21, 2020)
85c5b55 added is_xla_tensor into docstring (lezwon, Aug 21, 2020)
d488119 save only state dict (lezwon, Aug 21, 2020)
12d675d save entire checkpoint (lezwon, Aug 22, 2020)
c1d8f74 load state dict (lezwon, Aug 22, 2020)
c883572 xla check inside atomic save (lezwon, Sep 19, 2020)
pytorch_lightning/accelerators/tpu_backend.py (5 changes: 3 additions, 2 deletions)

@@ -67,13 +67,14 @@ def teardown(self):
         last_path = self.mp_queue.get()

         # transfer back the best path to the trainer
-        self.trainer.checkpoint_callback.best_model_path = best_path
+        if self.trainer.checkpoint_callback is not None:
+            self.trainer.checkpoint_callback.best_model_path = best_path
         # todo, pass also bets score

         # load last weights
         if last_path and not self.trainer.testing:
             ckpt = torch.load(last_path, map_location=lambda storage, loc: storage)
-            model.load_state_dict(ckpt)
+            model.load_state_dict(ckpt['state_dict'])

         self.trainer.model = model
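
A note on the second change above: a Lightning checkpoint file holds a dict with metadata alongside the weights, so the raw loaded object can no longer be passed to load_state_dict directly. A minimal sketch of the round trip, using an in-memory buffer and an illustrative 'epoch' key that are not part of this PR:

    import io
    import torch
    from torch import nn

    model = nn.Linear(4, 2)

    # save a Lightning-style checkpoint: the weights live under 'state_dict'
    buf = io.BytesIO()
    torch.save({"state_dict": model.state_dict(), "epoch": 3}, buf)
    buf.seek(0)

    ckpt = torch.load(buf, map_location=lambda storage, loc: storage)
    # model.load_state_dict(ckpt) would fail: the top level also holds metadata
    model.load_state_dict(ckpt["state_dict"])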
pytorch_lightning/callbacks/model_checkpoint.py (42 changes: 21 additions, 21 deletions)

@@ -182,6 +182,7 @@ def kth_best_model(self):
                       " and will be removed in v0.10.0", DeprecationWarning)
         return self.kth_best_model_path

+    @rank_zero_only
     def _del_model(self, filepath):
         if self._fs.exists(filepath):
             self._fs.rm(filepath)

@@ -261,7 +262,6 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         ckpt_name = f'{filename}.ckpt'
         return os.path.join(self.dirpath, ckpt_name) if self.dirpath else ckpt_name

-    @rank_zero_only
     def on_pretrain_routine_start(self, trainer, pl_module):
         """
         Determines model checkpoint save directory at runtime. References attributes from the

@@ -315,11 +315,8 @@ def __warn_deprecated_monitor_key(self):
                 f" Remove `ModelCheckpoint(monitor='{self.monitor}')` to fix."
             )

     @rank_zero_only
     def on_validation_end(self, trainer, pl_module):
-        # only run on main process
-        if trainer.global_rank != 0:
-            return
-
         if trainer.running_sanity_check:
             return

@@ -379,7 +376,6 @@ def on_validation_end(self, trainer, pl_module):
             if self.verbose > 0:
                 log.info(f'Epoch {epoch:d}: saving model to {filepath}')

-            assert trainer.global_rank == 0, 'tried to make a checkpoint from non global_rank=0'
             self._save_model(filepath, trainer, pl_module)

         if self.save_last:

@@ -395,22 +391,26 @@ def _do_check_save(self, filepath, current, epoch, trainer, pl_module):
         # remove kth

         del_list = []
-        if len(self.best_k_models) == self.save_top_k and self.save_top_k > 0:
-            delpath = self.kth_best_model_path
-            self.best_k_models.pop(self.kth_best_model_path)
-            del_list.append(delpath)
-
-        self.best_k_models[filepath] = current
-        if len(self.best_k_models) == self.save_top_k:
-            # monitor dict has reached k elements
-            _op = max if self.mode == 'min' else min
-            self.kth_best_model_path = _op(self.best_k_models,
-                                           key=self.best_k_models.get)
-            self.kth_value = self.best_k_models[self.kth_best_model_path]
-
-        _op = min if self.mode == 'min' else max
-        self.best_model_path = _op(self.best_k_models, key=self.best_k_models.get)
-        self.best_model_score = self.best_k_models[self.best_model_path]
+        if trainer.is_global_zero:
+            if len(self.best_k_models) == self.save_top_k and self.save_top_k > 0:
+                delpath = self.kth_best_model_path
+                self.best_k_models.pop(self.kth_best_model_path)
+                del_list.append(delpath)
+
+            self.best_k_models[filepath] = current
+            if len(self.best_k_models) == self.save_top_k:
+                # monitor dict has reached k elements
+                _op = max if self.mode == 'min' else min
+                self.kth_best_model_path = _op(self.best_k_models,
+                                               key=self.best_k_models.get)
+                self.kth_value = self.best_k_models[self.kth_best_model_path]
+
+            _op = min if self.mode == 'min' else max
+            self.best_model_path = _op(self.best_k_models, key=self.best_k_models.get)
+            self.best_model_score = self.best_k_models[self.best_model_path]
+
+        print(inspect.currentframe().f_code.co_name + f' Line 401 rank: {trainer.global_rank}')

         if self.verbose > 0:
             log.info(
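
The recurring theme in this file is deciding which process may touch the filesystem or the top-k bookkeeping. A minimal sketch of the two gating styles the diff alternates between, assuming the pytorch_lightning.utilities.distributed import path of this era; the function names are illustrative, not the PR's:

    import os

    from pytorch_lightning.utilities.distributed import rank_zero_only

    @rank_zero_only
    def delete_checkpoint(path: str) -> None:
        # decorator style: the body runs only on the global-rank-0 process;
        # every other rank gets an immediate no-op
        if os.path.exists(path):
            os.remove(path)

    def update_top_k(trainer, filepath: str, current) -> None:
        # explicit style: every rank calls the function, but the stateful
        # bookkeeping is guarded at runtime
        if trainer.is_global_zero:
            ...  # update best_k_models / best_model_path here

The decorator is the simpler tool, but it removes the call entirely on non-zero ranks, which matters once a save routine becomes a collective operation that every rank must enter, as in the cloud_io change below.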
pytorch_lightning/utilities/cloud_io.py (23 changes: 19 additions, 4 deletions)

@@ -20,6 +20,16 @@
 import torch
 import fsspec

+# we want this for tf.io.gfile, which if tf is installed gives full tf,
+# otherwise gives a pruned down version which works for some file backends but
+# not all
+
+try:
+    import torch_xla.core.xla_model as xm
+except ImportError:
+    XLA_AVAILABLE = False
+else:
+    XLA_AVAILABLE = True

 pathlike = Union[Path, str]

@@ -49,12 +59,17 @@ def atomic_save(checkpoint, filepath: str):
             accepts.
         filepath: The path to which the checkpoint will be saved.
             This points to the file that the checkpoint will be stored in.
+        is_xla_tensor: If the tensor to be saved in an XLA Tensor
+            Is true if the model is being trained on a TPU
     """
     bytesbuffer = io.BytesIO()
-    # Can't use the new zipfile serialization for 1.6.0 because there's a bug in
-    # torch.hub.load_state_dict_from_url() that prevents it from loading the new files.
-    # More details can be found here: https://github.com/pytorch/pytorch/issues/42239
-    if LooseVersion(torch.__version__).version[:3] == [1, 6, 0]:
+
+    if checkpoint.device.type == "xla" and XLA_AVAILABLE:
+        return xm.save(checkpoint, filepath, master_only=True, global_master=True)
+    elif LooseVersion(torch.__version__).version[:3] == [1, 6, 0]:
+        # Can't use the new zipfile serialization for 1.6.0 because there's a bug in
+        # torch.hub.load_state_dict_from_url() that prevents it from loading the new files.
+        # More details can be found here: https://github.com/pytorch/pytorch/issues/42239
         torch.save(checkpoint, bytesbuffer, _use_new_zipfile_serialization=False)
     else:
         torch.save(checkpoint, bytesbuffer)
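
For context, xm.save is a collective operation: every TPU process must call it, and with master_only=True and global_master=True only the global master actually serializes to disk. Note that the diff keys its dispatch off checkpoint.device.type, which presumes a tensor-like object rather than the dict atomic_save normally receives. The sketch below is my own assumption of the intended pattern, not the PR's code, and keys the dispatch off XLA availability instead:

    import torch

    try:
        import torch_xla.core.xla_model as xm
        XLA_AVAILABLE = True
    except ImportError:
        XLA_AVAILABLE = False

    def save_checkpoint(checkpoint: dict, filepath: str) -> None:
        if XLA_AVAILABLE:
            # collective call: all TPU processes must reach this line;
            # only the global master process writes the file
            xm.save(checkpoint, filepath, master_only=True, global_master=True)
        else:
            torch.save(checkpoint, filepath)

Letting xm.save do its own master gating is why the earlier commits strip rank-zero guards from the save path: if only rank 0 reached the call, the other processes would never join the rendezvous.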