 #include "tensorrt_llm/kernels/gptKernels.h"
 #include "tensorrt_llm/kernels/kvCacheUtils.h"
 #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h"
+#include "tensorrt_llm/kernels/sparseAttentionKernels.h"
 #include "tensorrt_llm/kernels/unfusedAttentionKernels.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/utils/debugUtils.h"
@@ -120,9 +121,6 @@ struct FusedQKVMaskedAttentionDispatchParams
     bool block_sparse_attention = false;
     BlockSparseParams block_sparse_params;
     int32_t const* mrope_position_deltas;
-    int32_t const* sparse_attn_indices;
-    int32_t const* sparse_attn_offsets;
-    int32_t num_sparse_attn_indices;
 };

 template <typename T, typename KVCacheBuffer>
@@ -203,10 +201,6 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
     // Medusa mode will have multiple query tokens.
     xqaParams.multi_query_tokens = mIsSpecDecodingEnabled && mUseSpecDecoding;
     xqaParams.is_spec_dec_tree = mIsSpecDecTree;
-    // Sparse attention parameters for XQA
-    xqaParams.sparse_attn_indices = mRuntimeSparseAttentionParams.sparse_attn_indices;
-    xqaParams.sparse_attn_offsets = mRuntimeSparseAttentionParams.sparse_attn_offsets;
-    xqaParams.num_sparse_attn_indices = mRuntimeSparseAttentionParams.num_sparse_attn_indices;

     if (mKVCacheQuantMode.hasInt8KvCache())
     {
@@ -294,6 +288,9 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
     xqaParams.output_sf = generationsParams.context_buf_sf;
     xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale;
     xqaParams.start_token_idx_sf = generationsParams.start_token_idx_sf;
+    // Parameters for sparse attention
+    xqaParams.sparse_attn_indices = mRuntimeSparseAttentionParams.sparse_attn_indices;
+    xqaParams.sparse_attn_offsets = mRuntimeSparseAttentionParams.sparse_attn_offsets;

     // Cross attention parameters.
     xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;
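
The XQA path now takes the sparse attention indices and offsets straight from mRuntimeSparseAttentionParams. As a rough illustration of how such an indices/offsets pair is commonly laid out, here is a minimal CSR-style sketch; the struct name and the exact semantics are assumptions for illustration only, the authoritative definitions live in sparseAttentionKernels.h and the XQA parameter struct.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical mirror of the runtime sparse attention parameters; the real struct
// is defined by the library, this one exists only to make the layout concrete.
struct RuntimeSparseAttentionParamsSketch
{
    int32_t const* sparse_attn_indices; // flattened selected KV indices for all sequences
    int32_t const* sparse_attn_offsets; // batch_size + 1 prefix sums into the indices array
};

int main()
{
    // Two sequences: sequence 0 attends to KV positions {0, 7, 8}, sequence 1 to {3, 4}.
    std::vector<int32_t> indices = {0, 7, 8, 3, 4};
    std::vector<int32_t> offsets = {0, 3, 5}; // offsets[b]..offsets[b+1] spans sequence b

    RuntimeSparseAttentionParamsSketch params{indices.data(), offsets.data()};
    (void) params; // in the real path these pointers are copied into xqaParams above
    return 0;
}
```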
@@ -676,11 +673,6 @@ void fusedQKV_masked_attention_dispatch(Multihead_attention_params<T_MMHA, CROSS

     params.multi_processor_count = input_params.multi_processor_count;

-    // sparse indices and offsets for attention
-    params.sparse_attn_indices = input_params.sparse_attn_indices;
-    params.sparse_attn_offsets = input_params.sparse_attn_offsets;
-    params.num_sparse_attn_indices = input_params.num_sparse_attn_indices;
-
     // cross attn
     params.memory_length_per_sample = input_params.memory_length_per_sample;

@@ -825,7 +817,7 @@ size_t AttentionOp::getWorkspaceSizeForContext(nvinfer1::DataType type, int32_t
 }

 size_t AttentionOp::getWorkspaceSizeForGeneration(nvinfer1::DataType type, int32_t max_num_seq,
-    int32_t max_attention_window_size, int32_t max_num_tokens) const noexcept
+    int32_t max_attention_window_size, int32_t max_num_tokens, int32_t max_blocks_per_sequence) const noexcept
 {
     if (max_num_tokens == 0)
     {
@@ -908,14 +900,19 @@ size_t AttentionOp::getWorkspaceSizeForGeneration(nvinfer1::DataType type, int32
     size_t const cpMaxPaddedSequenceLength = (batch_beam + mCpSize - 1) / mCpSize * mCpSize;
     size_t const cpWorkspaceSize
         = mCpSize == 1 ? 0 : (2 * size * cpMaxPaddedSequenceLength * getHeadSize() * (mNumHeads + 2 * mNumKVHeads));
+    // Two workspaces for sparse attention. One for the sequence lengths, and one for kv block offsets.
+    size_t const sparse_attn_cache_size = (mUseSparseAttention && mEnableXQA)
+        ? sizeof(int) * (batch_beam + batch_beam * 2 * max_blocks_per_sequence * mNumKVHeads)
+        : 0;

-    int const NUM_BUFFERS = 5;
+    int const NUM_BUFFERS = 6;
     size_t workspaces[NUM_BUFFERS];
     workspaces[0] = partial_out_size;
     workspaces[1] = partial_sum_size;
     workspaces[2] = partial_max_size;
     workspaces[3] = shift_k_cache_size;
     workspaces[4] = cpWorkspaceSize;
+    workspaces[5] = sparse_attn_cache_size;
     generation_workspace_size = tc::calculateTotalWorkspaceSize(workspaces, NUM_BUFFERS);

     size_t xqa_workspace_size = 0;
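
The new sparse_attn_cache_size term reserves one int per sequence for the sparse sequence lengths, plus batch_beam * 2 * max_blocks_per_sequence * mNumKVHeads ints for the K and V block offsets per KV head. A small worked example of the arithmetic; the values below are illustrative, not library defaults.

```cpp
#include <cstdio>

int main()
{
    int const batch_beam = 8;                 // sequences * beam width (assumed)
    int const max_blocks_per_sequence = 128;  // paged KV blocks per sequence (assumed)
    int const num_kv_heads = 8;               // stand-in for mNumKVHeads (assumed)

    // One int per sequence for the sparse sequence lengths, plus K and V block
    // offsets for every KV head of every block of every sequence.
    size_t const sparse_attn_cache_size
        = sizeof(int) * (batch_beam + batch_beam * 2 * max_blocks_per_sequence * num_kv_heads);

    std::printf("%zu bytes\n", sparse_attn_cache_size); // 4 * (8 + 16384) = 65568 bytes
    return 0;
}
```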
@@ -2275,6 +2272,17 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams<T> const& params, cud
             xqaParams.output = mhaOutput;
             xqaParams.qkv = attention_input;
         }
+        if (mUseSparseAttention && std::is_same_v<KVCacheBuffer, KVBlockArray>)
+        {
+            size_t kv_block_offsets_size = batch_beam * 2 * params.max_blocks_per_sequence * mNumKVHeads;
+            size_t seq_lengths_size = batch_beam;
+            int* sparse_kv_block_offsets
+                = reinterpret_cast<int*>(nextWorkspacePtr(workspace_byte_ptr, offset, kv_block_offsets_size));
+            int* sparse_seq_lengths
+                = reinterpret_cast<int*>(nextWorkspacePtr(workspace_byte_ptr, offset, seq_lengths_size));
+            xqaParams.sparse_kv_block_offsets = sparse_kv_block_offsets;
+            xqaParams.sparse_seq_lengths = sparse_seq_lengths;
+        }
         mXqaDispatcher->run(xqaParams, kv_cache_buffer, kv_scale_cache_buffer);
         if (mCpSize > 1 && mAttnTpSize > 1 && mAttnCpSize == 1)
         {
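
In the generation path the two sparse buffers are carved out of the shared workspace with nextWorkspacePtr, in the same bump-pointer style as the other buffers. Below is a minimal sketch of that carving pattern, assuming a 256-byte alignment granularity; the helper name with the "Sketch" suffix and the alignment constant are assumptions, the real helper in the common utilities may differ.

```cpp
#include <cstdint>
#include <cstdio>

constexpr uintptr_t kAlignment = 256; // assumed alignment granularity

// Return the current position in the workspace and advance the offset,
// rounded up to the alignment boundary, so the next buffer starts aligned.
int8_t* nextWorkspacePtrSketch(int8_t* base, uintptr_t& offset, uintptr_t sizeInBytes)
{
    int8_t* ptr = base + offset;
    offset += (sizeInBytes + kAlignment - 1) / kAlignment * kAlignment;
    return ptr;
}

int main()
{
    alignas(256) static int8_t workspace[1 << 20]; // stand-in for the generation workspace
    uintptr_t offset = 0;

    // Carve the two sparse attention buffers, mirroring the enqueueGeneration change
    // (sizes here are illustrative byte counts, not values from the library).
    auto* sparse_kv_block_offsets
        = reinterpret_cast<int*>(nextWorkspacePtrSketch(workspace, offset, 16384 * sizeof(int)));
    auto* sparse_seq_lengths
        = reinterpret_cast<int*>(nextWorkspacePtrSketch(workspace, offset, 8 * sizeof(int)));

    std::printf("offset after carving: %zu bytes\n", static_cast<size_t>(offset));
    (void) sparse_kv_block_offsets;
    (void) sparse_seq_lengths;
    return 0;
}
```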
@@ -2427,9 +2435,6 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams<T> const& params, cud
     dispatch_params.block_sparse_attention = mMaskType == AttentionMaskType::BLOCKSPARSE;
     dispatch_params.block_sparse_params = mBlockSparseParams;
     dispatch_params.mrope_position_deltas = params.mrope_position_deltas;
-    dispatch_params.sparse_attn_indices = mRuntimeSparseAttentionParams.sparse_attn_indices;
-    dispatch_params.sparse_attn_offsets = mRuntimeSparseAttentionParams.sparse_attn_offsets;
-    dispatch_params.num_sparse_attn_indices = mRuntimeSparseAttentionParams.num_sparse_attn_indices;

     using DataType = typename SATypeConverter<T>::Type;
     if (!isCrossAttention())