
Commit d5d72cd

minitu (Jaemin Choi) authored and committed
Add option for mutex timeout in distributed optimizer backward hook (#9087)
* Tim: Add option for timeout in distopt callback mutex
* Replace parent's _lock
* Revert "Replace parent's _lock" (reverts commit 972d1b6)
* Raise RuntimeError when timeout
* Change RuntimeError to print

Signed-off-by: Jaemin Choi <[email protected]>
Co-authored-by: Jaemin Choi <[email protected]>
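In short, MegatronDistributedFusedAdam gains a lock_timeout constructor argument; when set, the distributed optimizer's backward hook gives up on the callback mutex after that many seconds instead of blocking indefinitely. A minimal usage sketch (the model and learning rate are illustrative, not from this commit, and a distributed setup is assumed to already be initialized):

    import torch
    from nemo.core.optim.distributed_adam import MegatronDistributedFusedAdam

    # Illustrative model; any iterable of parameters works.
    model = torch.nn.Linear(16, 16)

    # lock_timeout is the new option from this commit; other keyword
    # arguments are forwarded to Apex DistributedFusedAdam.
    optimizer = MegatronDistributedFusedAdam(
        model.parameters(),
        lr=1e-4,              # illustrative value
        lock_timeout=60.0,    # give up on the callback mutex after 60 seconds
    )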
1 parent a8e0ca1 commit d5d72cd

File tree

1 file changed: +27 −1


nemo/core/optim/distributed_adam.py

@@ -13,6 +13,7 @@
 # limitations under the License.

 import collections
+import contextlib
 import itertools
 from typing import Callable, Dict, Iterable, Optional, Union


@@ -55,6 +56,8 @@ class MegatronDistributedFusedAdam(DistributedFusedAdam):
             but requires larger memory than distributing within all
             ranks, especially for pure data parallel models.
             (default: False).
+        lock_timeout (float, optional): timeout for callback mutex in
+            seconds.
         **kwargs: keyword arguments to pass to Apex
             DistributedFusedAdam.


@@ -65,6 +68,7 @@ def __init__(
         params: Union[Iterable[torch.nn.Parameter], Iterable[dict]],
         disable_distributed_parameters: bool = False,
         distribute_within_nodes: bool = False,
+        lock_timeout: Optional[float] = None,
         **kwargs,
     ):


@@ -114,6 +118,25 @@ def __init__(
         # Construct distributed optimizer
         super().__init__(param_groups, **kwargs)

+        # Create mutex with timeout
+        self._lock_with_timeout = None
+        if lock_timeout is not None:
+
+            @contextlib.contextmanager
+            def lock_with_timeout():
+                result = self._lock.acquire(timeout=lock_timeout)
+                try:
+                    yield result
+                finally:
+                    if result:
+                        # Acquired lock before timeout
+                        self._lock.release()
+                    else:
+                        # Failed to acquire lock before timeout
+                        print(f'MegatronDistributedFusedAdam: Failed to acquire lock within {lock_timeout} seconds.')
+
+            self._lock_with_timeout = lock_with_timeout
+
     def _broadcast_params(self) -> None:
         # Assume params have already been synchronized
         pass

@@ -128,7 +151,10 @@ def hook(*unused):
                     'before the forward pass (e.g. by calling data_ptr) '
                     'or run DistributedFusedAdam with overlap_param_sync=False.'
                 )
-            with self._lock:
+            lock = self._lock
+            if self._lock_with_timeout is not None:
+                lock = self._lock_with_timeout()
+            with lock:
                 need_to_initialize = 'fragments' not in self.state[param]
                 if need_to_initialize:
                     self._init_param_state(param, param_group_id, param_id)
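The timed mutex in the diff is a small context-manager wrapper around threading.Lock.acquire(timeout=...). Note that the with body still runs even when the acquire times out; per the commit history, a timeout initially raised a RuntimeError and was later relaxed to a print so training can proceed. A self-contained sketch of the same pattern (names are illustrative, not the module's API):

    import contextlib
    import threading

    _lock = threading.Lock()

    @contextlib.contextmanager
    def lock_with_timeout(timeout: float):
        # Try to take the lock for up to `timeout` seconds.
        acquired = _lock.acquire(timeout=timeout)
        try:
            # Yield the result either way, so the caller's block executes
            # whether or not the lock was obtained.
            yield acquired
        finally:
            if acquired:
                # Acquired lock before timeout
                _lock.release()
            else:
                # Failed to acquire lock before timeout
                print(f'Failed to acquire lock within {timeout} seconds.')

    with lock_with_timeout(0.5) as acquired:
        # Runs whether or not the lock was acquired, mirroring the hook.
        pass

Yielding the acquire result (rather than raising) lets callers check it if they care, while the backward hook simply proceeds with a warning after the timeout.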
