Skip to content

Commit 210ce6e

Browse files
xinyu-intel and jikunshang
authored and committed
[DP] Create placement groups by ray_device_key (vllm-project#25026)
Signed-off-by: Xinyu Chen <[email protected]>
Co-authored-by: Kunshang Ji <[email protected]>
Signed-off-by: xuebwang-amd <[email protected]>
1 parent 207b450 commit 210ce6e

File tree

1 file changed

+11
-8
lines changed

1 file changed

+11
-8
lines changed

vllm/v1/engine/utils.py

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -334,20 +334,22 @@ def create_dp_placement_groups(
334334
"No nodes with resources found in Ray cluster.")
335335
assert dp_master_ip_key in nodes[0], (
336336
"The DP master node (ip: %s) is missing or dead", dp_master_ip)
337+
device_str = current_platform.ray_device_key
337338
for node_resources in nodes:
338-
if "GPU" not in node_resources:
339+
if device_str not in node_resources:
339340
continue
340341
# For now, each DP rank can only be assigned to one node
341342
# TODO(rui): support allocating a single DP rank
342343
# to multiple nodes
343-
available_engine_count = int(node_resources["GPU"]) // world_size
344+
available_engine_count = int(
345+
node_resources[device_str]) // world_size
344346
if dp_master_ip_key in node_resources:
345347
assert available_engine_count >= local_engine_count, (
346348
"Not enough resources to allocate DP ranks "
347349
f"on DP master node {dp_master_ip}")
348350
for i in range(local_engine_count):
349351
bundles = [{
350-
"GPU": 1.0,
352+
device_str: 1.0,
351353
"node:" + dp_master_ip: 0.001
352354
}] * world_size + [{
353355
"CPU": 1.0
@@ -363,7 +365,7 @@ def create_dp_placement_groups(
363365
for i in range(available_engine_count):
364366
if len(placement_groups) == num_pg_to_create:
365367
break
366-
bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}]
368+
bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
367369
pg = ray.util.placement_group(
368370
name=f"dp_rank_{len(placement_groups)}",
369371
strategy="STRICT_PACK",
@@ -415,17 +417,18 @@ def add_dp_placement_groups(
415417
local_dp_ranks = []
416418
num_pg_created = 0
417419

420+
device_str = current_platform.ray_device_key
418421
for node in nodes:
419422
if num_pg_created >= num_pg_to_create:
420423
break
421424

422425
node_ip = node.node_ip
423426
node_id = node.node_id
424-
available_gpus = int(available_resources[node_id]["GPU"])
427+
available_gpus = int(available_resources[node_id][device_str])
425428

426429
# Get total GPUs on this node from the node's resources
427430
# Ray stores node resources with node ID as key
428-
total_gpus = int(total_resources[node_id]["GPU"])
431+
total_gpus = int(total_resources[node_id][device_str])
429432

430433
# Calculate used GPUs and used engines on this node
431434
used_gpus = max(0, total_gpus - available_gpus)
@@ -444,13 +447,13 @@ def add_dp_placement_groups(
444447
# Create bundles with node constraint for master node
445448
if node_ip == dp_master_ip:
446449
bundles = [{
447-
"GPU": 1.0,
450+
device_str: 1.0,
448451
"node:" + dp_master_ip: 0.001
449452
}] * world_size + [{
450453
"CPU": 1.0
451454
}]
452455
else:
453-
bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}]
456+
bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
454457

455458
pg = ray.util.placement_group(
456459
name=f"dp_rank_{rank}",

0 commit comments

Comments
 (0)