1515import time
1616import unittest
1717from contextlib import contextmanager
18+ from dataclasses import dataclass
1819from datetime import datetime
1920from os .path import join
2021from typing import Callable , Generator , Optional
@@ -828,6 +829,62 @@ def test_close_twice(self) -> None:
828829 self .scheduler .close ()
829830 # nothing to validate just make sure no errors are raised
830831
832+ def test_get_gpu_count (self ) -> None :
833+ @dataclass
834+ class ProcResult :
835+ stdout : bytes
836+
837+ nvidia_smi_out = (
838+ "GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-196a22c5-717b-66db-0acc-58cde6f3df85)\n "
839+ "GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-45e9165d-4f7e-d954-7ff5-481bc2c0ec7b)\n "
840+ "GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-26e22503-5fd5-8f55-d068-e1714fbb6fd6)\n "
841+ "GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-ebfc20c7-5f1a-1bc9-0d98-601cbe21fc2d)\n "
842+ )
843+
844+ stdout = nvidia_smi_out .encode ()
845+ result = ProcResult (stdout )
846+ with patch ("subprocess.run" , return_value = result ):
847+ gpu_count = self .scheduler ._get_gpu_count ()
848+ self .assertEqual (4 , gpu_count )
849+
850+ def test_get_gpu_count_error (self ) -> None :
851+ with patch ("subprocess.run" , side_effect = Exception ("test error" )):
852+ gpu_count = self .scheduler ._get_gpu_count ()
853+ self .assertEqual (0 , gpu_count )
854+
855+ def test_get_cuda_devices (self ) -> None :
856+ with patch .object (self .scheduler , "_get_gpu_count" , return_value = 8 ):
857+ self .assertEqual ("0,1,2,3" , self .scheduler ._get_cuda_devices (0 , 2 ))
858+ self .assertEqual ("4,5,6,7" , self .scheduler ._get_cuda_devices (1 , 2 ))
859+ with patch .object (self .scheduler , "_get_gpu_count" , return_value = 4 ):
860+ self .assertEqual ("0" , self .scheduler ._get_cuda_devices (0 , 4 ))
861+ self .assertEqual ("1" , self .scheduler ._get_cuda_devices (1 , 4 ))
862+ self .assertEqual ("2" , self .scheduler ._get_cuda_devices (2 , 4 ))
863+ self .assertEqual ("3" , self .scheduler ._get_cuda_devices (3 , 4 ))
864+
865+ def test_get_cuda_devices_is_set (self ) -> None :
866+ with patch .object (self .scheduler , "_get_gpu_count" , return_value = 8 ):
867+ sleep_60sec = AppDef (
868+ name = "sleep" ,
869+ roles = [
870+ Role (
871+ name = "sleep" ,
872+ image = self .test_dir ,
873+ entrypoint = "sleep.sh" ,
874+ args = ["60" ],
875+ num_replicas = 4 ,
876+ )
877+ ],
878+ )
879+
880+ popen_req = self .scheduler ._to_popen_request (sleep_60sec , {})
881+ role_params = popen_req .role_params ["sleep" ]
882+ self .assertEqual (4 , len (role_params ))
883+ self .assertEqual ("0,1" , role_params [0 ].env ["CUDA_VISIBLE_DEVICES" ])
884+ self .assertEqual ("2,3" , role_params [1 ].env ["CUDA_VISIBLE_DEVICES" ])
885+ self .assertEqual ("4,5" , role_params [2 ].env ["CUDA_VISIBLE_DEVICES" ])
886+ self .assertEqual ("6,7" , role_params [3 ].env ["CUDA_VISIBLE_DEVICES" ])
887+
    def test_no_orphan_process_function(self) -> None:
        # Thin entry point: delegates to the shared orphan-process workflow
        # helper defined below in this class. Presumably a sibling test drives
        # the same helper through a different entrypoint variant — confirm
        # against the rest of the file.
        self._test_orphan_workflow()
833890
@@ -839,6 +896,9 @@ def _test_orphan_workflow(self) -> None:
839896 target = start_sleep_processes , args = (self .test_dir , mp_queue , child_nproc )
840897 )
841898 proc .start ()
899+ # Before querying the queue we need to wait
900+ # Otherwise we will get `FileNotFoundError: [Errno 2] No such file or directory` error
901+ time .sleep (10 )
842902 total_processes = child_nproc + 1
843903 pids = []
844904 for _ in range (total_processes ):
0 commit comments