Fix race condition when executing with multi-node where some ranks do not wait for setup

findkim committed Jul 11, 2023
1 parent 41d8477 commit 2875ac5
Showing 1 changed file with 10 additions and 11 deletions.

nemo/collections/nlp/modules/common/megatron/megatron_utils.py
@@ -14,13 +14,14 @@
 # limitations under the License.
 
 import os
+import shutil
 from typing import Dict, List
 
 import torch
 import wget
 from torch.hub import _get_torch_home
 
-from nemo.utils import logging
+from nemo.utils import get_rank, logging
 
 __all__ = [
     "get_megatron_lm_model",
@@ -202,16 +203,14 @@ def _download(path: str, url: str):
     if url is None:
         return None
 
-    if not os.path.exists(path):
-        master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
-        if not os.path.exists(path):
-            if master_device:
-                os.makedirs(MEGATRON_CACHE, exist_ok=True)
-                logging.info(f"Downloading from {url}")
-                wget.download(url, path)
-            # wait until the master process downloads the file and writes it to the cache dir
-            if torch.distributed.is_initialized():
-                torch.distributed.barrier()
+    if get_rank.is_global_rank_zero() and not os.path.exists(path):
+        os.makedirs(MEGATRON_CACHE, exist_ok=True)
+        logging.info(f"Downloading from {url} to {path}")
+        downloaded_path = wget.download(url)
+        shutil.move(downloaded_path, path)
+    # wait until the master process downloads the file and writes it to the cache dir
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
 
     return path
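
A minimal sketch of the download-and-barrier pattern this diff lands on, written as standalone code so the fix is easier to read in isolation. `CACHE_DIR`, `cached_download`, and `_is_global_rank_zero` are hypothetical stand-ins for `MEGATRON_CACHE`, `_download`, and `nemo.utils.get_rank.is_global_rank_zero()`; the `os`, `shutil`, `torch.distributed`, and `wget` calls are the ones the diff itself uses.

```python
import os
import shutil

import torch
import wget

# Illustrative stand-in for MEGATRON_CACHE.
CACHE_DIR = os.path.expanduser("~/.cache/megatron_example")


def _is_global_rank_zero() -> bool:
    # Simplified stand-in for nemo.utils.get_rank.is_global_rank_zero():
    # a single, non-distributed process counts as rank zero.
    if not torch.distributed.is_initialized():
        return True
    return torch.distributed.get_rank() == 0


def cached_download(path: str, url: str) -> str:
    """Download `url` to `path` from global rank zero only; all ranks sync after."""
    if _is_global_rank_zero() and not os.path.exists(path):
        os.makedirs(CACHE_DIR, exist_ok=True)
        # wget.download(url) saves under the URL's filename in the working
        # directory; the finished file is then moved into place, so `path`
        # only ever appears fully written.
        downloaded_path = wget.download(url)
        shutil.move(downloaded_path, path)
    # Every rank reaches this barrier unconditionally. In the old code the
    # barrier sat inside the `if not os.path.exists(path)` block, so ranks
    # that already saw a cached file skipped it and did not wait for setup
    # (the race named in the commit title).
    if torch.distributed.is_initialized():
        torch.distributed.barrier()
    return path
```

Intended usage is to call `cached_download` from every rank: rank zero performs the download, the other ranks only hit the barrier and then read the same `path`.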
