Fix race condition when executing with multi-node where some ranks do not wait for setup

findkim committed Jul 11, 2023
1 parent 41d8477 commit 2875ac5
Showing 1 changed file with 10 additions and 11 deletions.

nemo/collections/nlp/modules/common/megatron/megatron_utils.py
@@ -14,13 +14,14 @@
 # limitations under the License.
 
 import os
+import shutil
 from typing import Dict, List
 
 import torch
 import wget
 from torch.hub import _get_torch_home
 
-from nemo.utils import logging
+from nemo.utils import get_rank, logging
 
 __all__ = [
     "get_megatron_lm_model",
@@ -202,16 +203,14 @@ def _download(path: str, url: str):
     if url is None:
         return None
 
-    if not os.path.exists(path):
-        master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
-        if not os.path.exists(path):
-            if master_device:
-                os.makedirs(MEGATRON_CACHE, exist_ok=True)
-                logging.info(f"Downloading from {url}")
-                wget.download(url, path)
-            # wait until the master process downloads the file and writes it to the cache dir
-            if torch.distributed.is_initialized():
-                torch.distributed.barrier()
+    if get_rank.is_global_rank_zero() and not os.path.exists(path):
+        os.makedirs(MEGATRON_CACHE, exist_ok=True)
+        logging.info(f"Downloading from {url} to {path}")
+        downloaded_path = wget.download(url)
+        shutil.move(downloaded_path, path)
+    # wait until the master process downloads the file and writes it to the cache dir
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
 
     return path
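
A minimal sketch of the download-and-barrier pattern this diff lands on, written as standalone code so the fix is easier to read in isolation. `CACHE_DIR`, `cached_download`, and `_is_global_rank_zero` are hypothetical stand-ins for `MEGATRON_CACHE`, `_download`, and `nemo.utils.get_rank.is_global_rank_zero()`; the `os`, `shutil`, `torch.distributed`, and `wget` calls are the ones the diff itself uses.

```python
import os
import shutil

import torch
import wget

# Illustrative stand-in for MEGATRON_CACHE.
CACHE_DIR = os.path.expanduser("~/.cache/megatron_example")


def _is_global_rank_zero() -> bool:
    # Simplified stand-in for nemo.utils.get_rank.is_global_rank_zero():
    # a single, non-distributed process counts as rank zero.
    if not torch.distributed.is_initialized():
        return True
    return torch.distributed.get_rank() == 0


def cached_download(path: str, url: str) -> str:
    """Download `url` to `path` from global rank zero only; all ranks sync after."""
    if _is_global_rank_zero() and not os.path.exists(path):
        os.makedirs(CACHE_DIR, exist_ok=True)
        # wget.download(url) saves under the URL's filename in the working
        # directory; the finished file is then moved into place, so `path`
        # only ever appears fully written.
        downloaded_path = wget.download(url)
        shutil.move(downloaded_path, path)
    # Every rank reaches this barrier unconditionally. In the old code the
    # barrier sat inside the `if not os.path.exists(path)` block, so ranks
    # that already saw a cached file skipped it and did not wait for setup
    # (the race named in the commit title).
    if torch.distributed.is_initialized():
        torch.distributed.barrier()
    return path
```

Intended usage is to call `cached_download` from every rank: rank zero performs the download, the other ranks only hit the barrier and then read the same `path`.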
