
Commit 0584f73

aivanou authored and facebook-github-bot committed
Add automatic set of CUDA_VISIBLE_DEVICES for local scheduler
Summary: This diff adds automatic setting of `CUDA_VISIBLE_DEVICES` based on `num_replicas`: each replica gets the same number of devices. The algorithm applies only when `CUDA_VISIBLE_DEVICES` is not already set. #297 #377

Differential Revision: D34064433

fbshipit-source-id: bce7f25cde2336de10b20ac8a37cc0d154e1b8c4
1 parent 4b989d5 commit 0584f73

2 files changed, +70 −0 lines changed

torchx/schedulers/local_scheduler.py

Lines changed: 34 additions & 0 deletions
@@ -41,6 +41,7 @@
     TextIO,
 )
 
+import torch
 from pyre_extensions import none_throws
 from torchx.schedulers.api import AppDryRunInfo, DescribeAppResponse, Scheduler, Stream
 from torchx.schedulers.ids import make_unique
@@ -765,6 +766,20 @@ def _submit_dryrun(
         request = self._to_popen_request(app, cfg)
         return AppDryRunInfo(request, lambda p: pprint.pformat(p, indent=2, width=80))
 
+    def _get_cuda_devices(self, replica_id: int, num_replicas: int) -> Optional[str]:
+        gpu_device_count = torch.cuda.device_count()
+        gpu_bucket_size = int(gpu_device_count / num_replicas)
+        if gpu_device_count != 0:
+            devices = list(
+                range(
+                    gpu_bucket_size * replica_id,
+                    gpu_bucket_size * (replica_id + 1),
+                )
+            )
+            visible_devices = ",".join([str(d) for d in devices])
+            return visible_devices
+        return None
+
     def _to_popen_request(
         self,
         app: AppDef,
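
The bucketing arithmetic above is easiest to see with concrete numbers. Below is a minimal standalone sketch (plain Python, no TorchX imports; the `bucket_devices` name is hypothetical, not part of this diff) of the same contiguous, equal-size split:

    def bucket_devices(replica_id: int, num_replicas: int, gpu_device_count: int) -> str:
        # Same arithmetic as _get_cuda_devices: bucket size is the integer
        # quotient, and each replica gets a contiguous range of device ids.
        bucket_size = gpu_device_count // num_replicas
        start = bucket_size * replica_id
        return ",".join(str(d) for d in range(start, start + bucket_size))

    # 8 GPUs split across 2 replicas:
    assert bucket_devices(0, 2, 8) == "0,1,2,3"
    assert bucket_devices(1, 2, 8) == "4,5,6,7"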
@@ -786,6 +801,19 @@ def _to_popen_request(
 
         img_root = image_provider.fetch_role(role)
 
+        gpu_device_count = torch.cuda.device_count()
+        if gpu_device_count != 0 and gpu_device_count < role.num_replicas:
+            log.warning(
+                "Different role replicas will occupy the same device. "
+                "Decrease the number of replicas by changing the `role.num_replicas` parameter. "
+                f"Devices detected: {gpu_device_count}, num replicas: {role.num_replicas}"
+            )
+        if gpu_device_count != 0 and gpu_device_count % role.num_replicas != 0:
+            log.warning(
+                "Number of detected GPUs is not evenly divisible by the number of replicas. "
+                f"GPUs detected: {gpu_device_count}, num replicas: {role.num_replicas}"
+            )
+
         for replica_id in range(role.num_replicas):
             values = macros.Values(
                 img_root=img_root,
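
To see what the second warning guards against, assume the same integer-division bucketing as in `_get_cuda_devices`: with 8 GPUs and 3 replicas the bucket size is int(8 / 3) == 2, so the last two devices are never assigned. A hypothetical worked example:

    gpu_device_count, num_replicas = 8, 3
    bucket_size = gpu_device_count // num_replicas  # == 2
    for replica_id in range(num_replicas):
        print(replica_id, list(range(bucket_size * replica_id, bucket_size * (replica_id + 1))))
    # 0 [0, 1]
    # 1 [2, 3]
    # 2 [4, 5]  -> devices 6 and 7 stay idle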
@@ -794,6 +822,12 @@ def _to_popen_request(
             )
             replica_role = values.apply(role)
             replica_log_dir = os.path.join(app_log_dir, role.name, str(replica_id))
+            visible_devices = self._get_cuda_devices(replica_id, role.num_replicas)
+            if visible_devices and "CUDA_VISIBLE_DEVICES" not in replica_role.env:
+                log.debug(
+                    f"Setting role replica {replica_id} with {visible_devices} devices"
+                )
+                replica_role.env["CUDA_VISIBLE_DEVICES"] = visible_devices
 
             if "TORCHELASTIC_ERROR_FILE" not in replica_role.env:
                 # this is the top level (agent if using elastic role) error file
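
From inside a launched replica, the standard CUDA_VISIBLE_DEVICES semantics apply: the visible devices are re-indexed from zero. As a hedged illustration (the check script is hypothetical; the remapping itself is standard CUDA behavior), replica 1 of 4 on an 8-GPU host would observe:

    import os
    import torch

    # The local scheduler exported CUDA_VISIBLE_DEVICES="2,3" for this replica,
    # so torch sees exactly two devices, indexed 0 and 1.
    print(os.environ["CUDA_VISIBLE_DEVICES"])  # 2,3
    print(torch.cuda.device_count())           # 2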

torchx/schedulers/test/local_scheduler_test.py

Lines changed: 36 additions & 0 deletions
@@ -828,6 +828,39 @@ def test_close_twice(self) -> None:
         self.scheduler.close()
         # nothing to validate just make sure no errors are raised
 
+    def test_get_cuda_devices(self) -> None:
+        with patch("torch.cuda.device_count", return_value=8):
+            self.assertEqual("0,1,2,3", self.scheduler._get_cuda_devices(0, 2))
+            self.assertEqual("4,5,6,7", self.scheduler._get_cuda_devices(1, 2))
+        with patch("torch.cuda.device_count", return_value=4):
+            self.assertEqual("0", self.scheduler._get_cuda_devices(0, 4))
+            self.assertEqual("1", self.scheduler._get_cuda_devices(1, 4))
+            self.assertEqual("2", self.scheduler._get_cuda_devices(2, 4))
+            self.assertEqual("3", self.scheduler._get_cuda_devices(3, 4))
+
+    def test_get_cuda_devices_is_set(self) -> None:
+        with patch("torch.cuda.device_count", return_value=8):
+            sleep_60sec = AppDef(
+                name="sleep",
+                roles=[
+                    Role(
+                        name="sleep",
+                        image=self.test_dir,
+                        entrypoint="sleep.sh",
+                        args=["60"],
+                        num_replicas=4,
+                    )
+                ],
+            )
+
+            popen_req = self.scheduler._to_popen_request(sleep_60sec, {})
+            role_params = popen_req.role_params["sleep"]
+            self.assertEqual(4, len(role_params))
+            self.assertEqual("0,1", role_params[0].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("2,3", role_params[1].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("4,5", role_params[2].env["CUDA_VISIBLE_DEVICES"])
+            self.assertEqual("6,7", role_params[3].env["CUDA_VISIBLE_DEVICES"])
+
     def test_no_orphan_process_function(self) -> None:
         self._test_orphan_workflow()
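
Note that the `"CUDA_VISIBLE_DEVICES" not in replica_role.env` guard in the scheduler means a value supplied on the Role wins over the computed one. A hedged sketch of that case, mirroring the test setup above (not covered by this diff's tests; assumes `Role` accepts an `env` dict, as elsewhere in torchx.specs):

    # Hypothetical: a preset value survives, because _to_popen_request only
    # assigns CUDA_VISIBLE_DEVICES when the key is absent from the role env.
    role = Role(
        name="sleep",
        image=self.test_dir,
        entrypoint="sleep.sh",
        args=["60"],
        num_replicas=2,
        env={"CUDA_VISIBLE_DEVICES": "5"},
    )
    # Both replicas would keep env["CUDA_VISIBLE_DEVICES"] == "5".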

@@ -839,6 +872,9 @@ def _test_orphan_workflow(self) -> None:
             target=start_sleep_processes, args=(self.test_dir, mp_queue, child_nproc)
         )
         proc.start()
+        # Before querying the queue we need to wait,
+        # otherwise we get a `FileNotFoundError: [Errno 2] No such file or directory`.
+        time.sleep(10)
         total_processes = child_nproc + 1
         pids = []
         for _ in range(total_processes):