vllm-project · Akashcodes732 · Jul 1, 2025 · Jul 1, 2025 · Jul 2, 2025 · Jul 2, 2025
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A CPU worker class."""
 import os
+import platform
 from importlib import util
 from typing import List, Optional, Set, Tuple, Type
 
@@ -159,8 +160,13 @@ def __init__(
         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
         self.local_omp_cpuid = "all"
         if omp_cpuids == "auto":
-            self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
-            )
+            arch = platform.machine()
+            if arch == "ppc64le":
+                self.local_omp_cpuid = (
+                    self.get_cpus_id_binding_based_on_numa_nodes_ppc64le())
+            else:
+                self.local_omp_cpuid = (
+                    self.get_cpus_id_binding_based_on_numa_nodes())
         else:
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]
 
@@ -448,3 +454,78 @@ def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
                 "fallback to no thread-binding. To get better performance,"
                 "please try to manually bind threads.")
         return rank_to_cpus
+
+    def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str:
+        """Pick the OpenMP CPU list for this rank on Power (ppc64le).
+
+        The allowed CPUs of this rank's NUMA node are narrowed to a subset
+        of hardware threads per core, with the intent of not
+        oversubscribing SMT-8 cores. Logical CPU ids are assumed to be
+        assigned to cores in blocks of eight; only ids whose offset in
+        the block is below four are kept.
+        NOTE(review): in lower SMT modes the remaining online ids
+        presumably still satisfy this filter - confirm on target systems.
+
+        Returns:
+            A comma-separated string of CPU ids for ``self.rank``, or the
+            current ``self.local_omp_cpuid`` unchanged when ``numa`` /
+            ``psutil`` are not installed or ``world_size`` exceeds the
+            number of usable NUMA nodes.
+        """
+
+        def select_threads_per_power_core(node_cpu_ids):
+            # For SMT-8 on Power, use 4 threads per core for better
+            # performance. This assumes logical CPUs for a core are
+            # assigned in blocks of 8.
+            threads_per_core = 8
+            threads_to_use = 4
+            return [
+                cpu for cpu in node_cpu_ids
+                if cpu % threads_per_core < threads_to_use
+            ]
+
+        rank_to_cpus = self.local_omp_cpuid
+        world_size = self.vllm_config.parallel_config.world_size
+        libnuma_found = util.find_spec("numa") is not None
+        psutil_found = util.find_spec("psutil") is not None
+        if libnuma_found and psutil_found:
+            import psutil
+            from numa import info
+            # Only CPUs this process is allowed to run on are candidates.
+            cpus_allow_list = psutil.Process().cpu_affinity()
+            numa_size = info.get_num_configured_nodes()
+
+            # One sorted CPU list per NUMA node that still has allowed CPUs.
+            node_to_cpus = []
+            for i in range(numa_size):
+                node_intersect = set(
+                    info.node_to_cpus(i)).intersection(cpus_allow_list)
+                if node_intersect:
+                    node_to_cpus.append(sorted(node_intersect))
+
+            if world_size > len(node_to_cpus):
+                logger.error(
+                    "Auto thread-binding failed due to "
+                    "world size: %d is larger than "
+                    "allowed NUMA nodes number: %d. "
+                    "Please try to bind threads manually.", world_size,
+                    len(node_to_cpus))
+            else:
+                node_cpus_this_rank = select_threads_per_power_core(
+                    node_to_cpus[self.rank])
+                cpu_count_per_numa = len(node_cpus_this_rank)
+                # Leave the last CPUs of the node out of the binding, but
+                # never reserve more than half of what is available.
+                num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
+                                          cpu_count_per_numa // 2)
+                end = cpu_count_per_numa - num_of_reserved_cpu
+                rank_to_cpus_list = node_cpus_this_rank[:end]
+                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
+                logger.info("ppc64le thread-binding list: %s", rank_to_cpus)
+        else:
+            logger.warning(
+                "Auto thread-binding is not supported due to "
+                "the lack of package numa and psutil, "
+                "fallback to no thread-binding. To get better performance, "
+                "please try to manually bind threads.")
+        return rank_to_cpus