Clean up last ModelCheckpoint makedirs call to IOPlugin #11035

Merged: 2 commits, Dec 14, 2021
5 changes: 1 addition & 4 deletions pytorch_lightning/callbacks/model_checkpoint.py
@@ -249,7 +249,7 @@ def state_key(self) -> str:
         )

     def on_pretrain_routine_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
-        """When pretrain routine starts we build the ckpt dir on the fly."""
+        """When pretrain routine starts we resolve the ckpt dir on the fly."""
         if self._save_on_train_epoch_end is None:
             # if the user runs validation multiple times per training epoch or multiple training epochs without
             # validation, then we run after validation instead of on train epoch end
@@ -600,9 +600,6 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> None:

         self.dirpath = ckpt_path

-        if not trainer.fast_dev_run and trainer.training_type_plugin.should_rank_save_checkpoint:
-            self._fs.makedirs(self.dirpath, exist_ok=True)
-
     def __warn_if_dir_not_empty(self, dirpath: _PATH) -> None:
         if self.save_top_k != 0 and self._fs.isdir(dirpath) and len(self._fs.ls(dirpath)) > 0:
             rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
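Net effect of this hunk: `__resolve_ckpt_dir` now only records `self.dirpath`; the directory itself is created later, by the checkpoint IO plugin, when a checkpoint is actually written (see the xla_plugin.py change below). A minimal standalone sketch of that deferral, using plain `os` and hypothetical stand-in classes rather than the real Lightning ones:

```python
import os
import tempfile


class ResolveOnlyCallback:
    """Stand-in for ModelCheckpoint after this PR: resolve the path, never mkdir."""

    def __init__(self, dirpath: str) -> None:
        self.dirpath = dirpath  # resolved eagerly, but the directory is not created


class MakedirsAtSaveIO:
    """Stand-in for a CheckpointIO plugin: create parent dirs only when saving."""

    def save_checkpoint(self, payload: bytes, path: str) -> None:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as f:
            f.write(payload)


root = tempfile.mkdtemp()
cb = ResolveOnlyCallback(os.path.join(root, "checkpoints"))
assert not os.path.isdir(cb.dirpath)  # nothing created at pretrain-routine-start time

MakedirsAtSaveIO().save_checkpoint(b"ckpt-bytes", os.path.join(cb.dirpath, "epoch=0.ckpt"))
assert os.path.isdir(cb.dirpath)  # the directory appears only at save time
```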
4 changes: 4 additions & 0 deletions pytorch_lightning/plugins/io/xla_plugin.py
@@ -11,11 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from typing import Any, Dict, Optional

 from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO
 from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, _TPU_AVAILABLE
 from pytorch_lightning.utilities.apply_func import apply_to_collection
+from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.types import _PATH

 if _TPU_AVAILABLE:
@@ -36,6 +38,8 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options
             path: write-target path
             storage_options: Optional parameters when saving the model/training states.
         """
+        fs = get_filesystem(path)
+        fs.makedirs(os.path.dirname(path), exist_ok=True)
         # Todo: TypeError: 'mappingproxy' object does not support item assignment
         # Ref: https://github.com/pytorch/xla/issues/2773
         if _OMEGACONF_AVAILABLE:
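The `get_filesystem`/`makedirs` pair added above is the usual fsspec pattern for ensuring a parent directory exists right before writing; it works for local paths and, through the matching fsspec implementation, for remote URLs as well. A small usage sketch with a local path (the path itself is hypothetical):

```python
import os

from pytorch_lightning.utilities.cloud_io import get_filesystem

path = "/tmp/lightning_demo/checkpoints/epoch=0.ckpt"  # hypothetical target path
fs = get_filesystem(path)  # LocalFileSystem here; an fsspec remote filesystem for URL paths

# Idempotent, so calling it on every save is safe even if the directory already exists.
fs.makedirs(os.path.dirname(path), exist_ok=True)

with fs.open(path, "wb") as f:  # write through the same filesystem abstraction
    f.write(b"placeholder bytes, not a real checkpoint")
```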
3 changes: 0 additions & 3 deletions pytorch_lightning/plugins/training_type/single_tpu.py
@@ -75,9 +75,6 @@ def pre_dispatch(self, trainer: "pl.Trainer") -> None:
         self.tpu_local_core_rank = xm.get_local_ordinal()
         self.tpu_global_core_rank = xm.get_ordinal()

-    def save(self, state_dict: Dict, path: _PATH) -> None:
-        xm.save(state_dict, path)
-
     def save_checkpoint(self, checkpoint: Dict[str, Any], filepath: _PATH) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.

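The removed `save` was a thin wrapper around `xm.save`; after this change the single-TPU plugin presumably defers the actual write to `XLACheckpointIO` (shown above), which now also creates the parent directory. For reference, a minimal direct use of `xm.save` itself (requires a TPU runtime with `torch_xla` installed; illustrative, not the Lightning call path):

```python
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()
state_dict = {"weight": torch.ones(2, 2, device=device)}

# xm.save moves XLA tensors to CPU before serializing and, by default, writes
# only from the master ordinal, so calling it from every process is safe.
xm.save(state_dict, "/tmp/tpu_demo.ckpt")
```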
4 changes: 0 additions & 4 deletions pytorch_lightning/trainer/trainer.py
@@ -1700,10 +1700,6 @@ def world_size(self) -> int:
         # some training types define a world size
         return getattr(self.training_type_plugin, "world_size", 1)

-    @property
-    def should_rank_save_checkpoint(self) -> bool:
-        return self.training_type_plugin.should_rank_save_checkpoint
-
Comment on lines -1703 to -1706:

Contributor: Since this is publicly exposed, this needs to go through a deprecation process.

Contributor (Author): Noted in summary with context.

Contributor: Thanks for adding the context! I believe the initial removal was incorrect, and we should honor the deprecation process.

Contributor: Resolving to merge as this PR is just addressing the merge mistake from #9901. Open an issue or comment on #9433 for this.

     @property
     def _distrib_type(self) -> _StrategyType:
         return self._accelerator_connector._distrib_type
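The review thread above asks for a deprecation cycle rather than an outright removal of the public `should_rank_save_checkpoint` property. A minimal sketch of what such a shim typically looks like (illustrative stand-in only; the actual follow-up is tracked in the issues referenced in the comments):

```python
from pytorch_lightning.utilities import rank_zero_deprecation


class _TrainerSketch:
    """Illustrative stand-in for the Trainer, showing a deprecation shim."""

    def __init__(self, training_type_plugin) -> None:
        self.training_type_plugin = training_type_plugin

    @property
    def should_rank_save_checkpoint(self) -> bool:
        # Warn (on rank zero) but keep the old behavior so existing callers
        # continue to work for the duration of the deprecation window.
        rank_zero_deprecation(
            "`Trainer.should_rank_save_checkpoint` is deprecated and will be removed"
            " in a future release. Query the training type plugin directly instead."
        )
        return self.training_type_plugin.should_rank_save_checkpoint
```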