From bf11ffbe9f519632bcadb39f785f8d8caf01bc65 Mon Sep 17 00:00:00 2001 From: Akash Kaothalkar Date: Tue, 1 Jul 2025 10:22:05 -0500 Subject: [PATCH 1/3] feat: add power cpu bind function Signed-off-by: Akash Kaothalkar --- vllm/worker/cpu_worker.py | 56 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index ff110e050bb6..fca43cbab264 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -159,8 +159,11 @@ def __init__( omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND self.local_omp_cpuid = "all" if omp_cpuids == "auto": - self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( - ) + arch = platform.machine() + if arch == "ppc64le": + self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes_ppc64le() + else: + self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes() else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] @@ -448,3 +451,52 @@ def get_cpus_id_binding_based_on_numa_nodes(self) -> str: "fallback to no thread-binding. 
To get better performance," "please try to manually bind threads.") return rank_to_cpus + + def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str: + """ppc64le-specific: Always select CPUs whose id % 8 < 4 (first 4 threads per core), robust to SMT mode.""" + def select_first_4_of_each_power_core(node_cpu_ids): + # Select CPUs whose id % 8 is in {0,1,2,3} + return [cpu for cpu in node_cpu_ids if cpu % 8 < 4] + + rank_to_cpus = self.local_omp_cpuid + world_size = self.vllm_config.parallel_config.world_size + libnuma_found = util.find_spec("numa") is not None + psutil_found = util.find_spec("psutil") is not None + if libnuma_found and psutil_found: + import psutil + from numa import info + cpus_allow_list = psutil.Process().cpu_affinity() + numa_size = info.get_num_configured_nodes() + + node_to_cpus = [] + for i in range(numa_size): + node_intersect = set(info.node_to_cpus(i)).intersection(cpus_allow_list) + if bool(node_intersect): + node_to_cpus.append(sorted(list(node_intersect))) + + if world_size > len(node_to_cpus): + logger.error( + "Auto thread-binding failed due to " + "world size: %d is larger than " + "allowed NUMA nodes number: %d." + "Please try to bind threads manually.", world_size, + len(node_to_cpus)) + else: + node_cpus_this_rank = node_to_cpus[self.rank] + # Always select CPUs whose id % 8 < 4 (first 4 threads per core) + node_cpus_this_rank = select_first_4_of_each_power_core(node_cpus_this_rank) + cpu_count_per_numa = len(node_cpus_this_rank) + num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, + cpu_count_per_numa // 2) + end = cpu_count_per_numa - num_of_reserved_cpu + rank_to_cpus_list = node_cpus_this_rank[:end] + rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list) + logger.info("ppc64le thread-binding list: %s", rank_to_cpus) + else: + logger.warning( + "Auto thread-binding is not supported due to " + "the lack of package numa and psutil," + "fallback to no thread-binding. 
To get better performance," + "please try to manually bind threads.") + return rank_to_cpus + From 62e16f4ec015dc3c900f04b14dae318bb9ea3abd Mon Sep 17 00:00:00 2001 From: Akash Kaothalkar Date: Wed, 2 Jul 2025 04:02:14 -0500 Subject: [PATCH 2/3] fix: lint and comments Signed-off-by: Akash Kaothalkar --- vllm/worker/cpu_worker.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index fca43cbab264..19b84e3d7413 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A CPU worker class.""" import os +import platform from importlib import util from typing import List, Optional, Set, Tuple, Type @@ -161,9 +162,11 @@ def __init__( if omp_cpuids == "auto": arch = platform.machine() if arch == "ppc64le": - self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes_ppc64le() + self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes_ppc64le( + ) else: - self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes() + self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( + ) else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] @@ -452,10 +455,14 @@ def get_cpus_id_binding_based_on_numa_nodes(self) -> str: "please try to manually bind threads.") return rank_to_cpus - def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str: - """ppc64le-specific: Always select CPUs whose id % 8 < 4 (first 4 threads per core), robust to SMT mode.""" - def select_first_4_of_each_power_core(node_cpu_ids): - # Select CPUs whose id % 8 is in {0,1,2,3} + def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str: + """ + Power (ppc64le) specific: Selects a subset of threads per core for each NUMA node. + This is robust to SMT mode (SMT-8, SMT-4, etc) because the OS only exposes available threads. 
+ This maximizes performance by avoiding oversubscription of logical CPUs on Power systems. + """ + + def select_threads_per_power_core(node_cpu_ids): return [cpu for cpu in node_cpu_ids if cpu % 8 < 4] rank_to_cpus = self.local_omp_cpuid @@ -470,7 +477,8 @@ def select_first_4_of_each_power_core(node_cpu_ids): node_to_cpus = [] for i in range(numa_size): - node_intersect = set(info.node_to_cpus(i)).intersection(cpus_allow_list) + node_intersect = set( + info.node_to_cpus(i)).intersection(cpus_allow_list) if bool(node_intersect): node_to_cpus.append(sorted(list(node_intersect))) @@ -483,8 +491,8 @@ def select_first_4_of_each_power_core(node_cpu_ids): len(node_to_cpus)) else: node_cpus_this_rank = node_to_cpus[self.rank] - # Always select CPUs whose id % 8 < 4 (first 4 threads per core) - node_cpus_this_rank = select_first_4_of_each_power_core(node_cpus_this_rank) + node_cpus_this_rank = select_threads_per_power_core( + node_cpus_this_rank) cpu_count_per_numa = len(node_cpus_this_rank) num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, cpu_count_per_numa // 2) @@ -499,4 +507,3 @@ def select_first_4_of_each_power_core(node_cpu_ids): "fallback to no thread-binding. 
To get better performance," "please try to manually bind threads.") return rank_to_cpus - From c8e85274e146090f85661d4fd279e71b09c27454 Mon Sep 17 00:00:00 2001 From: Akash Kaothalkar Date: Wed, 2 Jul 2025 12:03:02 -0500 Subject: [PATCH 3/3] fix: lint errors Signed-off-by: Akash Kaothalkar --- vllm/worker/cpu_worker.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 19b84e3d7413..8a21701ca72c 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -162,11 +162,11 @@ def __init__( if omp_cpuids == "auto": arch = platform.machine() if arch == "ppc64le": - self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes_ppc64le( - ) + self.local_omp_cpuid = ( + self.get_cpus_id_binding_based_on_numa_nodes_ppc64le()) else: - self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( - ) + self.local_omp_cpuid = ( + self.get_cpus_id_binding_based_on_numa_nodes()) else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] @@ -457,9 +457,10 @@ def get_cpus_id_binding_based_on_numa_nodes(self) -> str: def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str: """ - Power (ppc64le) specific: Selects a subset of threads per core for each NUMA node. - This is robust to SMT mode (SMT-8, SMT-4, etc) because the OS only exposes available threads. - This maximizes performance by avoiding oversubscription of logical CPUs on Power systems. + Power (ppc64le) specific: Selects a subset of threads per core for + each NUMA node. This is robust to SMT mode (SMT-8, SMT-4, etc) + because the OS only exposes available threads. This maximizes + performance by avoiding oversubscription of logical CPUs on Power. """ def select_threads_per_power_core(node_cpu_ids):