
Commit 0584f73

aivanou authored and facebook-github-bot committed
Add automatic set of CUDA_VISIBLE_DEVICES for local scheduler
Summary: This diff adds automatic setting of `CUDA_VISIBLE_DEVICES` based on `num_replicas`: each replica gets the same number of devices. The algorithm applies only when `CUDA_VISIBLE_DEVICES` is not already set. #297 #377

Differential Revision: D34064433

fbshipit-source-id: bce7f25cde2336de10b20ac8a37cc0d154e1b8c4
1 parent 4b989d5 commit 0584f73

2 files changed, +70 −0 lines changed

torchx/schedulers/local_scheduler.py

Lines changed: 34 additions & 0 deletions
@@ -41,6 +41,7 @@
     TextIO,
 )
 
+import torch
 from pyre_extensions import none_throws
 from torchx.schedulers.api import AppDryRunInfo, DescribeAppResponse, Scheduler, Stream
 from torchx.schedulers.ids import make_unique
@@ -765,6 +766,20 @@ def _submit_dryrun(
         request = self._to_popen_request(app, cfg)
         return AppDryRunInfo(request, lambda p: pprint.pformat(p, indent=2, width=80))
 
+    def _get_cuda_devices(self, replica_id: int, num_replicas: int) -> Optional[str]:
+        gpu_device_count = torch.cuda.device_count()
+        gpu_bucket_size = int(gpu_device_count / num_replicas)
+        if gpu_device_count != 0:
+            devices = list(
+                range(
+                    gpu_bucket_size * replica_id,
+                    gpu_bucket_size * (replica_id + 1),
+                )
+            )
+            visible_devices = ",".join([str(d) for d in devices])
+            return visible_devices
+        return None
+
     def _to_popen_request(
         self,
         app: AppDef,
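
The bucketing arithmetic above is easiest to see with concrete numbers. Below is a minimal standalone sketch (plain Python, no TorchX imports; the `bucket_devices` name is hypothetical, not part of this diff) of the same contiguous, equal-size split:

    def bucket_devices(replica_id: int, num_replicas: int, gpu_device_count: int) -> str:
        # Same arithmetic as _get_cuda_devices: bucket size is the integer
        # quotient, and each replica gets a contiguous range of device ids.
        bucket_size = gpu_device_count // num_replicas
        start = bucket_size * replica_id
        return ",".join(str(d) for d in range(start, start + bucket_size))

    # 8 GPUs split across 2 replicas:
    assert bucket_devices(0, 2, 8) == "0,1,2,3"
    assert bucket_devices(1, 2, 8) == "4,5,6,7"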
@@ -786,6 +801,19 @@ def _to_popen_request(
 
         img_root = image_provider.fetch_role(role)
 
+        gpu_device_count = torch.cuda.device_count()
+        if gpu_device_count != 0 and gpu_device_count < role.num_replicas:
+            log.warning(
+                "Different role replicas will occupy the same device. "
+                "Decrease the number of replicas by changing the `role.num_replicas` parameter. "
+                f"Devices detected: {gpu_device_count}, num replicas: {role.num_replicas}"
+            )
+        if gpu_device_count != 0 and gpu_device_count % role.num_replicas != 0:
+            log.warning(
+                "Number of detected GPUs is not evenly divisible by the number of replicas. "
+                f"GPUs detected: {gpu_device_count}, num replicas: {role.num_replicas}"
+            )
+
         for replica_id in range(role.num_replicas):
             values = macros.Values(
                 img_root=img_root,
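
To see what the second warning guards against, assume the same integer-division bucketing as in `_get_cuda_devices`: with 8 GPUs and 3 replicas the bucket size is int(8 / 3) == 2, so the last two devices are never assigned. A hypothetical worked example:

    gpu_device_count, num_replicas = 8, 3
    bucket_size = gpu_device_count // num_replicas  # == 2
    for replica_id in range(num_replicas):
        print(replica_id, list(range(bucket_size * replica_id, bucket_size * (replica_id + 1))))
    # 0 [0, 1]
    # 1 [2, 3]
    # 2 [4, 5]  -> devices 6 and 7 stay idle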
@@ -794,6 +822,12 @@ def _to_popen_request(
             )
             replica_role = values.apply(role)
             replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))
+            visible_devices = self._get_cuda_devices(replica_id, role.num_replicas)
+            if visible_devices and "CUDA_VISIBLE_DEVICES" not in replica_role.env:
+                log.debug(
+                    f"Setting role replica {replica_id} with {visible_devices} devices"
+                )
+                replica_role.env["CUDA_VISIBLE_DEVICES"] = visible_devices
 
             if "TORCHELASTIC_ERROR_FILE" not in replica_role.env:
                 # this is the top level (agent if using elastic role) error file
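
From inside a launched replica, the standard CUDA_VISIBLE_DEVICES semantics apply: the visible devices are re-indexed from zero. As a hedged illustration (the check script is hypothetical; the remapping itself is standard CUDA behavior), replica 1 of 4 on an 8-GPU host would observe:

    import os
    import torch

    # The local scheduler exported CUDA_VISIBLE_DEVICES="2,3" for this replica,
    # so torch sees exactly two devices, indexed 0 and 1.
    print(os.environ["CUDA_VISIBLE_DEVICES"])  # 2,3
    print(torch.cuda.device_count())           # 2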

torchx/schedulers/test/local_scheduler_test.py

Lines changed: 36 additions & 0 deletions
@@ -828,6 +828,39 @@ def test_close_twice(self) -> None:
         self.scheduler.close()
         # nothing to validate just make sure no errors are raised
 
+    def test_get_cuda_devices(self) -> None:
+        with patch("torch.cuda.device_count", return_value=8):
+            self.assertEqual("0,1,2,3", self.scheduler._get_cuda_devices(0, 2))
+            self.assertEqual("4,5,6,7", self.scheduler._get_cuda_devices(1, 2))
+        with patch("torch.cuda.device_count", return_value=4):
+            self.assertEqual("0", self.scheduler._get_cuda_devices(0, 4))
+            self.assertEqual("1", self.scheduler._get_cuda_devices(1, 4))
+            self.assertEqual("2", self.scheduler._get_cuda_devices(2, 4))
+            self.assertEqual("3", self.scheduler._get_cuda_devices(3, 4))
+
+    def test_get_cuda_devices_is_set(self) -> None:
+        with patch("torch.cuda.device_count", return_value=8):
+            sleep_60sec = AppDef(
+                name="sleep",
+                roles=[
+                    Role(
+                        name="sleep",
+                        image=self.test_dir,
+                        entrypoint="sleep.sh",
+                        args=["60"],
+                        num_replicas=4,
+                    )
+                ],
+            )
+
+            popen_req = self.scheduler._to_popen_request(sleep_60sec, {})
+            role_params = popen_req.role_params["sleep"]
+            self.assertEqual(4, len(role_params))
+            self.assertEqual("0,1", role_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("2,3", role_params[1].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("4,5", role_params[2].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("6,7", role_params[3].env["CUDA_VISIBLE_DEVICES"])
+
     def test_no_orphan_process_function(self) -> None:
         self._test_orphan_workflow()
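
Note that the `"CUDA_VISIBLE_DEVICES" not in replica_role.env` guard in the scheduler means a value supplied on the Role wins over the computed one. A hedged sketch of that case, mirroring the test setup above (not covered by this diff's tests; assumes `Role` accepts an `env` dict, as elsewhere in torchx.specs):

    # Hypothetical: a preset value survives, because _to_popen_request only
    # assigns CUDA_VISIBLE_DEVICES when the key is absent from the role env.
    role = Role(
        name="sleep",
        image=self.test_dir,
        entrypoint="sleep.sh",
        args=["60"],
        num_replicas=2,
        env={"CUDA_VISIBLE_DEVICES": "5"},
    )
    # Both replicas would keep env["CUDA_VISIBLE_DEVICES"] == "5".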

@@ -839,6 +872,9 @@ def _test_orphan_workflow(self) -> None:
             target=start_sleep_processes, args=(self.test_dir, mp_queue, child_nproc)
         )
         proc.start()
+        # Before querying the queue we need to wait,
+        # otherwise we get a `FileNotFoundError: [Errno 2] No such file or directory`.
+        time.sleep(10)
         total_processes = child_nproc + 1
         pids = []
         for _ in range(total_processes):