From be54ca3fd1bc073dbed31c655f5005e7ba0045dc Mon Sep 17 00:00:00 2001
From: Jennifer Dai
Date: Fri, 10 Dec 2021 18:37:48 -0800
Subject: [PATCH 1/2] first commit

---
 pytorch_lightning/callbacks/model_checkpoint.py       | 3 ---
 pytorch_lightning/plugins/io/xla_plugin.py            | 2 ++
 pytorch_lightning/plugins/training_type/single_tpu.py | 3 ---
 pytorch_lightning/trainer/trainer.py                  | 4 ----
 4 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index cd307d18bc03a..64842b01c5136 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -600,9 +600,6 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None:
 
         self.dirpath = ckpt_path
 
-        if not trainer.fast_dev_run and trainer.training_type_plugin.should_rank_save_checkpoint:
-            self._fs.makedirs(self.dirpath, exist_ok=True)
-
     def __warn_if_dir_not_empty(self, dirpath: _PATH) -> None:
         if self.save_top_k != 0 and self._fs.isdir(dirpath) and len(self._fs.ls(dirpath)) > 0:
             rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
diff --git a/pytorch_lightning/plugins/io/xla_plugin.py b/pytorch_lightning/plugins/io/xla_plugin.py
index c40b6a1ada037..b928e5df5f07b 100644
--- a/pytorch_lightning/plugins/io/xla_plugin.py
+++ b/pytorch_lightning/plugins/io/xla_plugin.py
@@ -36,6 +36,8 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio
             path: write-target path
             storage_options: Optional parameters when saving the model/training states.
         """
+        fs = get_filesystem(path)
+        fs.makedirs(os.path.dirname(path), exist_ok=True)
         # Todo: TypeError: 'mappingproxy' object does not support item assignment
         # Ref: https://github.com/pytorch/xla/issues/2773
         if _OMEGACONF_AVAILABLE:
diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py
index 1f15f1cf8d885..011604468e1f5 100644
--- a/pytorch_lightning/plugins/training_type/single_tpu.py
+++ b/pytorch_lightning/plugins/training_type/single_tpu.py
@@ -75,9 +75,6 @@ def pre_dispatch(self, trainer: "pl.Trainer") -> None:
         self.tpu_local_core_rank = xm.get_local_ordinal()
         self.tpu_global_core_rank = xm.get_ordinal()
 
-    def save(self, state_dict: Dict, path: _PATH) -> None:
-        xm.save(state_dict, path)
-
     def save_checkpoint(self, checkpoint: Dict[str, Any], filepath: _PATH) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
 
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index bacb4cbbd5475..6805009d745b8 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1700,10 +1700,6 @@ def world_size(self) -> int:
         # some training types define a world size
         return getattr(self.training_type_plugin, "world_size", 1)
 
-    @property
-    def should_rank_save_checkpoint(self) -> bool:
-        return self.training_type_plugin.should_rank_save_checkpoint
-
     @property
     def _distrib_type(self) -> _StrategyType:
         return self._accelerator_connector._distrib_type

From f4cca530df189499d3878ddb95d55c0083d62527 Mon Sep 17 00:00:00 2001
From: Jennifer Dai
Date: Fri, 10 Dec 2021 18:51:03 -0800
Subject: [PATCH 2/2] imports

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 pytorch_lightning/plugins/io/xla_plugin.py      | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 64842b01c5136..8ea585db3c4ce 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -249,7 +249,7 @@ def state_key(self) -> str:
         )
 
     def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
-        """When pretrain routine starts we build the ckpt dir on the fly."""
+        """When pretrain routine starts we resolve the ckpt dir on the fly."""
         if self._save_on_train_epoch_end is None:
             # if the user runs validation multiple times per training epoch or multiple training epochs without
             # validation, then we run after validation instead of on train epoch end
diff --git a/pytorch_lightning/plugins/io/xla_plugin.py b/pytorch_lightning/plugins/io/xla_plugin.py
index b928e5df5f07b..9c2f0088d12e5 100644
--- a/pytorch_lightning/plugins/io/xla_plugin.py
+++ b/pytorch_lightning/plugins/io/xla_plugin.py
@@ -11,11 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from typing import Any, Dict, Optional
 
 from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO
 from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE
 from pytorch_lightning.utilities.apply_func import apply_to_collection
+from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.types import _PATH
 
 if _TPU_AVAILABLE:
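
For reference, below is a minimal standalone sketch of the pattern these two patches move into the checkpoint IO plugin: the IO layer creates the parent directory right before writing, instead of ModelCheckpoint pre-creating it. The sketch uses fsspec and torch.save directly rather than Lightning's internals, so the helper name and paths here are illustrative, not Lightning's API.

    import os
    from typing import Any, Dict

    import fsspec  # filesystem layer that backs pytorch_lightning.utilities.cloud_io.get_filesystem
    import torch


    def save_checkpoint(checkpoint: Dict[str, Any], path: str) -> None:
        """Ensure the target directory exists, then dump the checkpoint to `path`."""
        fs, _ = fsspec.core.url_to_fs(path)  # resolves a local or remote filesystem from the path
        fs.makedirs(os.path.dirname(path), exist_ok=True)  # mirrors the lines added to xla_plugin.py
        with fs.open(path, "wb") as f:
            torch.save(checkpoint, f)


    if __name__ == "__main__":
        # hypothetical local path; any fsspec-supported URL would work the same way
        save_checkpoint({"epoch": 3, "state_dict": {}}, "checkpoints/run1/epoch=3.ckpt")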