Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cpp/tensorrt_llm/common/attentionOp.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,11 @@ class AttentionOp

void debugCheckSemaphores(cudaStream_t stream);

/// Read-only accessor for mMultiProcessorCount (presumably the device's SM
/// count — set elsewhere in this class; TODO confirm). Callers use it e.g. as
/// a lower bound when sizing the multi-block-mode semaphore array.
[[nodiscard]] int getMultiProcessorCount() const { return mMultiProcessorCount; }

[[nodiscard]] std::string toString() const;

int mLayerIdx = -1;
Expand Down
4 changes: 3 additions & 1 deletion cpp/tensorrt_llm/thop/attentionOp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,9 @@ class Runner : public RunnerBase

// Always reserve SemaphoreArray (for multi-block mode) as MMHA may enable multi-block mode when shared memory
// is not enough.
op.reserveSemaphoreArray(op.mNumHeads * max_num_requests);
// The attention kernel might split the heads into multiple blocks, so we might need to reserve more semaphores.
// Use mMultiProcessorCount as the lower-bound to make sure we reserve enough semaphores.
op.reserveSemaphoreArray(std::max(op.mNumHeads * max_num_requests, op.getMultiProcessorCount()));
}

int64_t getWorkspaceSize(AttentionOp const& op, int const num_tokens, int const max_attention_window_size,
Expand Down