Replace parent's _lock with a timeout-aware lock when lock_timeout is provided

Jaemin Choi · Jaemin Choi · commit dd78d22e956e · 2024-05-01T17:42:35.000-07:00
diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py
@@ -15,6 +15,7 @@
 import collections
 import contextlib
 import itertools
+import threading
 from typing import Callable, Dict, Iterable, Optional, Union
 
 import torch
@@ -156,20 +157,20 @@ def __init__(
         # Construct distributed optimizer
         super().__init__(param_groups, **kwargs)
 
-        # Create mutex with timeout
-        self._lock_with_timeout = None
+        # Replace lock if timeout is provided
         if lock_timeout is not None:
+            self._lock_with_timeout: threading.Lock = threading.Lock()
 
             @contextlib.contextmanager
             def lock_with_timeout():
-                result = self._lock.acquire(timeout=lock_timeout)
+                result = self._lock_with_timeout.acquire(timeout=lock_timeout)
                 try:
                     yield result
                 finally:
                     if result:
-                        self._lock.release()
+                        self._lock_with_timeout.release()
 
-            self._lock_with_timeout = lock_with_timeout
+            self._lock = lock_with_timeout
 
     def _broadcast_params(self) -> None:
         # Assume params have already been synchronized
@@ -185,10 +186,7 @@ def hook(*unused):
                     'before the forward pass (e.g. by calling data_ptr) '
                     'or run DistributedFusedAdam with overlap_param_sync=False.'
                 )
-            lock = self._lock
-            if self._lock_with_timeout is not None:
-                lock = self._lock_with_timeout()
-            with lock:
+            with self._lock:
                 need_to_initialize = 'fragments' not in self.state[param]
                 if need_to_initialize:
                     self._init_param_state(param, param_group_id, param_id)