From bf11ffbe9f519632bcadb39f785f8d8caf01bc65 Mon Sep 17 00:00:00 2001
From: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Date: Tue, 1 Jul 2025 10:22:05 -0500
Subject: [PATCH 1/3] feat: add power cpu bind function

Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
---
 vllm/worker/cpu_worker.py | 56 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index ff110e050bb6..fca43cbab264 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -159,8 +159,11 @@ def __init__(
         omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
         self.local_omp_cpuid = "all"
         if omp_cpuids == "auto":
-            self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
-            )
+            arch = platform.machine()
+            if arch == "ppc64le":
+                self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes_ppc64le()
+            else:
+                self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes()
         else:
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]
 
@@ -448,3 +451,52 @@ def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
                 "fallback to no thread-binding. To get better performance,"
                 "please try to manually bind threads.")
         return rank_to_cpus
+
+     def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str:
+        """ppc64le-specific: Always select CPUs whose id % 8 < 4 (first 4 threads per core), robust to SMT mode."""
+        def select_first_4_of_each_power_core(node_cpu_ids):
+            # Select CPUs whose id % 8 is in {0,1,2,3}
+            return [cpu for cpu in node_cpu_ids if cpu % 8 < 4]
+
+        rank_to_cpus = self.local_omp_cpuid
+        world_size = self.vllm_config.parallel_config.world_size
+        libnuma_found = util.find_spec("numa") is not None
+        psutil_found = util.find_spec("psutil") is not None
+        if libnuma_found and psutil_found:
+            import psutil
+            from numa import info
+            cpus_allow_list = psutil.Process().cpu_affinity()
+            numa_size = info.get_num_configured_nodes()
+
+            node_to_cpus = []
+            for i in range(numa_size):
+                node_intersect = set(info.node_to_cpus(i)).intersection(cpus_allow_list)
+                if bool(node_intersect):
+                    node_to_cpus.append(sorted(list(node_intersect)))
+
+            if world_size > len(node_to_cpus):
+                logger.error(
+                    "Auto thread-binding failed due to "
+                    "world size: %d is larger than "
+                    "allowed NUMA nodes number: %d."
+                    "Please try to bind threads manually.", world_size,
+                    len(node_to_cpus))
+            else:
+                node_cpus_this_rank = node_to_cpus[self.rank]
+                # Always select CPUs whose id % 8 < 4 (first 4 threads per core)
+                node_cpus_this_rank = select_first_4_of_each_power_core(node_cpus_this_rank)
+                cpu_count_per_numa = len(node_cpus_this_rank)
+                num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
+                                          cpu_count_per_numa // 2)
+                end = cpu_count_per_numa - num_of_reserved_cpu
+                rank_to_cpus_list = node_cpus_this_rank[:end]
+                rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
+                logger.info("ppc64le thread-binding list: %s", rank_to_cpus)
+        else:
+            logger.warning(
+                "Auto thread-binding is not supported due to "
+                "the lack of package numa and psutil,"
+                "fallback to no thread-binding. To get better performance,"
+                "please try to manually bind threads.")
+        return rank_to_cpus
+

From 62e16f4ec015dc3c900f04b14dae318bb9ea3abd Mon Sep 17 00:00:00 2001
From: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Date: Wed, 2 Jul 2025 04:02:14 -0500
Subject: [PATCH 2/3] fix: lint and comments

Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
---
 vllm/worker/cpu_worker.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index fca43cbab264..19b84e3d7413 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A CPU worker class."""
 import os
+import platform
 from importlib import util
 from typing import List, Optional, Set, Tuple, Type
 
@@ -161,9 +162,11 @@ def __init__(
         if omp_cpuids == "auto":
             arch = platform.machine()
             if arch == "ppc64le":
-                self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes_ppc64le()
+                self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes_ppc64le(
+                )
             else:
-                self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes()
+                self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
+                )
         else:
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]
 
@@ -452,10 +455,14 @@ def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
                 "please try to manually bind threads.")
         return rank_to_cpus
 
-     def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str:
-        """ppc64le-specific: Always select CPUs whose id % 8 < 4 (first 4 threads per core), robust to SMT mode."""
-        def select_first_4_of_each_power_core(node_cpu_ids):
-            # Select CPUs whose id % 8 is in {0,1,2,3}
+    def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str:
+        """
+        Power (ppc64le) specific: Selects a subset of threads per core for each NUMA node.
+        This is robust to SMT mode (SMT-8, SMT-4, etc) because the OS only exposes available threads.
+        This maximizes performance by avoiding oversubscription of logical CPUs on Power systems.
+        """
+
+        def select_threads_per_power_core(node_cpu_ids):
             return [cpu for cpu in node_cpu_ids if cpu % 8 < 4]
 
         rank_to_cpus = self.local_omp_cpuid
@@ -470,7 +477,8 @@ def select_first_4_of_each_power_core(node_cpu_ids):
 
             node_to_cpus = []
             for i in range(numa_size):
-                node_intersect = set(info.node_to_cpus(i)).intersection(cpus_allow_list)
+                node_intersect = set(
+                    info.node_to_cpus(i)).intersection(cpus_allow_list)
                 if bool(node_intersect):
                     node_to_cpus.append(sorted(list(node_intersect)))
 
@@ -483,8 +491,8 @@ def select_first_4_of_each_power_core(node_cpu_ids):
                     len(node_to_cpus))
             else:
                 node_cpus_this_rank = node_to_cpus[self.rank]
-                # Always select CPUs whose id % 8 < 4 (first 4 threads per core)
-                node_cpus_this_rank = select_first_4_of_each_power_core(node_cpus_this_rank)
+                node_cpus_this_rank = select_threads_per_power_core(
+                    node_cpus_this_rank)
                 cpu_count_per_numa = len(node_cpus_this_rank)
                 num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU,
                                           cpu_count_per_numa // 2)
@@ -499,4 +507,3 @@ def select_first_4_of_each_power_core(node_cpu_ids):
                 "fallback to no thread-binding. To get better performance,"
                 "please try to manually bind threads.")
         return rank_to_cpus
-

From c8e85274e146090f85661d4fd279e71b09c27454 Mon Sep 17 00:00:00 2001
From: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Date: Wed, 2 Jul 2025 12:03:02 -0500
Subject: [PATCH 3/3] fix: lint errors

Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
---
 vllm/worker/cpu_worker.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 19b84e3d7413..8a21701ca72c 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -162,11 +162,11 @@ def __init__(
         if omp_cpuids == "auto":
             arch = platform.machine()
             if arch == "ppc64le":
-                self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes_ppc64le(
-                )
+                self.local_omp_cpuid = (
+                    self.get_cpus_id_binding_based_on_numa_nodes_ppc64le())
             else:
-                self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes(
-                )
+                self.local_omp_cpuid = (
+                    self.get_cpus_id_binding_based_on_numa_nodes())
         else:
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]
 
@@ -457,9 +457,10 @@ def get_cpus_id_binding_based_on_numa_nodes(self) -> str:
 
     def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str:
         """
-        Power (ppc64le) specific: Selects a subset of threads per core for each NUMA node.
-        This is robust to SMT mode (SMT-8, SMT-4, etc) because the OS only exposes available threads.
-        This maximizes performance by avoiding oversubscription of logical CPUs on Power systems.
+        Power (ppc64le) specific: Selects a subset of threads per core for
+        each NUMA node. This is robust to SMT mode (SMT-8, SMT-4, etc)
+        because the OS only exposes available threads. This maximizes
+        performance by avoiding oversubscription of logical CPUs on Power.
         """
 
         def select_threads_per_power_core(node_cpu_ids):