Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cpp/tensorrt_llm/common/attentionOp.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,11 @@ class AttentionOp

void debugCheckSemaphores(cudaStream_t stream);

/// Read-only accessor for mMultiProcessorCount (presumably the device's SM
/// count — set elsewhere in this class; TODO confirm). Callers use it e.g. as
/// a lower bound when sizing the multi-block-mode semaphore array.
[[nodiscard]] int getMultiProcessorCount() const { return mMultiProcessorCount; }

[[nodiscard]] std::string toString() const;

int mLayerIdx = -1;
Expand Down
4 changes: 3 additions & 1 deletion cpp/tensorrt_llm/thop/attentionOp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,9 @@ class Runner : public RunnerBase

// Always reserve SemaphoreArray (for multi-block mode) as MMHA may enable multi-block mode when shared memory
// is not enough.
op.reserveSemaphoreArray(op.mNumHeads * max_num_requests);
// The attention kernel might split the heads into multiple blocks, so we might need to reserve more semaphores.
// Use mMultiProcessorCount as the lower-bound to make sure we reserve enough semaphores.
op.reserveSemaphoreArray(std::max(op.mNumHeads * max_num_requests, op.getMultiProcessorCount()));
}

int64_t getWorkspaceSize(AttentionOp const& op, int const num_tokens, int const max_attention_window_size,
Expand Down