Commit
add WAR fix for sync issues (NVIDIA#8130)
Signed-off-by: Gerald Shen <[email protected]>
gshennvm authored and Jaemin Choi committed Jan 19, 2024
1 parent 87fceb8 commit 0f0bb6c
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion nemo/collections/nlp/parts/nlp_overrides.py
@@ -426,7 +426,7 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None:
             and self.lightning_module.sharded_state_dict() is not None
         ):
             if self.is_global_zero:
-                shutil.rmtree(ckpt_to_dir(filepath))
+                shutil.rmtree(ckpt_to_dir(filepath), ignore_errors=True)

         # legacy checkpoint logic, does not use megatron core
         else:
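For context on why the flag matters here: shutil.rmtree raises if the tree changes underneath it, for example when a checkpoint directory has already been removed (or is being removed) by another process before rank zero gets to it. A minimal, self-contained sketch of that failure mode follows; the directory and file names are illustrative, not from the commit:

    import shutil
    import tempfile
    from pathlib import Path

    # Build a throwaway "checkpoint" directory to delete.
    ckpt_dir = Path(tempfile.mkdtemp()) / "step=100.ckpt"
    ckpt_dir.mkdir()
    (ckpt_dir / "shard_0.pt").write_bytes(b"...")

    # Simulate the race: the directory vanishes before our rmtree call.
    shutil.rmtree(ckpt_dir)

    try:
        shutil.rmtree(ckpt_dir)  # raises FileNotFoundError: already gone
    except FileNotFoundError as err:
        print(f"without ignore_errors: {err}")

    shutil.rmtree(ckpt_dir, ignore_errors=True)  # silently a no-op, no crash
    print("with ignore_errors=True: no exception")

    ckpt_dir.parent.rmdir()  # clean up the temp parent directory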
2 changes: 1 addition & 1 deletion nemo/utils/callbacks/nemo_model_checkpoint.py
@@ -227,7 +227,7 @@ def _del_model_without_trainer(self, filepath: str) -> None:
         if is_global_rank_zero():
             try:
                 dist_ckpt = ckpt_to_dir(filepath)
-                shutil.rmtree(dist_ckpt)
+                shutil.rmtree(dist_ckpt, ignore_errors=True)
                 logging.info(f"Removed distributed checkpoint: {dist_ckpt}")
             except:
                 logging.info(f"Tried to remove distributed checkpoint: {dist_ckpt} but failed.")
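Here the surrounding bare except already swallowed any exception; what ignore_errors=True additionally buys is that rmtree keeps deleting the rest of the tree past a failing entry instead of aborting partway through. If per-entry visibility is wanted instead of silence, rmtree also accepts an onerror callback (superseded by onexc in Python 3.12) that is invoked per failing entry while the walk continues. A hedged sketch of that alternative, not what this commit does:

    import shutil

    def log_and_continue(func, path, exc_info):
        # func is the os function that failed (e.g. os.unlink), path is the
        # offending entry, exc_info is the sys.exc_info() triple.
        print(f"could not remove {path} via {func.__name__}: {exc_info[1]}")

    # On a missing path the callback fires once and rmtree returns normally.
    shutil.rmtree("/tmp/does-not-exist.ckpt", onerror=log_and_continue)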
