
Commit 8a9cdac

Add RocketKV trtllm attention backend
Signed-off-by: yuhangh <[email protected]>

Update sparse attention parameters passing logic
Signed-off-by: yuhangh <[email protected]>

fix rebase breaks
Signed-off-by: yuhangh <[email protected]>
Signed-off-by: Fanrong Li <[email protected]>

[None][feat] add gatherKvPageOffsetsKernel (#32)
* add gatherKvPageOffsetsKernel.
  Signed-off-by: Fanrong Li <[email protected]>
* fix.
  Signed-off-by: Fanrong Li <[email protected]>
* fix.
  Signed-off-by: Fanrong Li <[email protected]>
---------
Signed-off-by: Fanrong Li <[email protected]>

Add sparse kv indices write kernel & fix several bugs
Signed-off-by: yuhangh <[email protected]>

fix for rebase
Signed-off-by: yuhangh <[email protected]>
Signed-off-by: Fanrong Li <[email protected]>

[None][feat] integrate block sparse attention kernels (#33)
* integrate block sparse attention kernels.
  Signed-off-by: Fanrong Li <[email protected]>
* fix.
  Signed-off-by: Fanrong Li <[email protected]>
* Support num_kv_heads in seq_len & fix several workspace size bugs
  Signed-off-by: yuhangh <[email protected]>
* update block sparse attention kernel to support per-head kv_len.
  Signed-off-by: Fanrong Li <[email protected]>
* minor fix
  Signed-off-by: yuhangh <[email protected]>
* update kernel meta info.
* add more block sparse kernels.
* disable rope_fusion for sparse attention.
  Signed-off-by: Fanrong Li <[email protected]>
* fix block sparse attention kernels.
* update block sparse attention kernel.
  Signed-off-by: Fanrong Li <[email protected]>
* fix workspace issue.
  Signed-off-by: Fanrong Li <[email protected]>
* minor fix
  Signed-off-by: yuhangh <[email protected]>
* fix gatherKvPageOffsetsKernel.
  Signed-off-by: Fanrong Li <[email protected]>
* remove cuda stream sync.
  Signed-off-by: Fanrong Li <[email protected]>
---------
Signed-off-by: Fanrong Li <[email protected]>
Signed-off-by: yuhangh <[email protected]>
Co-authored-by: yuhangh <[email protected]>

[None][feat] change the sparse indices format and update the gatherKvPageOffsetsKe… (#34)
* change the sparse indices format and update the gatherKvPageOffsetsKernel.
  Signed-off-by: Fanrong Li <[email protected]>
* update kv write & optimize logic of using tllmgen kernels
  Signed-off-by: yuhangh <[email protected]>
---------
Signed-off-by: Fanrong Li <[email protected]>
Signed-off-by: yuhangh <[email protected]>
Co-authored-by: yuhangh <[email protected]>

add paged kt cache (1st commit).
Signed-off-by: Fanrong Li <[email protected]>

minor fix.
Signed-off-by: Fanrong Li <[email protected]>

fix _single_request_update_kt_cache for vanilla RocketKV.
Signed-off-by: Fanrong Li <[email protected]>

add paged kt cache to rocketkv trtllm.
Signed-off-by: Fanrong Li <[email protected]>

fix _single_request_update_kt_cache for trtllm RocketKV.
Signed-off-by: Fanrong Li <[email protected]>

fix k_snap length.
Signed-off-by: Fanrong Li <[email protected]>

fix memory issue when using paged kt cache.
Signed-off-by: Fanrong Li <[email protected]>

fix rebase breaks
Signed-off-by: yuhangh <[email protected]>

fix rebase bug.
Signed-off-by: Fanrong Li <[email protected]>

fix rebase bug.
Signed-off-by: Fanrong Li <[email protected]>

update block sparse attention kernel.
Signed-off-by: Fanrong Li <[email protected]>

fix params issue
Signed-off-by: yuhangh <[email protected]>
Signed-off-by: Fanrong Li <[email protected]>

[None][feat] Do sparse attention functional clean (#43)
* fix several bugs & adjust some code
  Signed-off-by: yuhangh <[email protected]>
* minor code clean
  Signed-off-by: yuhangh <[email protected]>
* Add simple unittest for rocketkv
  Signed-off-by: yuhangh <[email protected]>
* Adjustment for sparse attention params and example
  Signed-off-by: yuhangh <[email protected]>
* fix bugs introduced by last commit
  Signed-off-by: yuhangh <[email protected]>
* Optimize Xqa_params and num_sparse_kv_tokens
  Signed-off-by: yuhangh <[email protected]>
* Fix gather kernel & minor adjustment
  Signed-off-by: yuhangh <[email protected]>
* Rename sparse_attention_params in xqa_params
  Signed-off-by: yuhangh <[email protected]>
* minor
  Signed-off-by: yuhangh <[email protected]>
---------
Signed-off-by: yuhangh <[email protected]>
Signed-off-by: Fanrong Li <[email protected]>

[None][feat] Update trtllm-gen fmha kernels and remove block sparse cubins (#44)
* rm sparse kernels.
  Signed-off-by: Fanrong Li <[email protected]>
* update new kernel.
  Signed-off-by: Fanrong Li <[email protected]>
* update trtllm-gen fmha.
  Signed-off-by: Fanrong Li <[email protected]>
---------
Signed-off-by: Fanrong Li <[email protected]>

fix rebase conflicts
Signed-off-by: yuhangh <[email protected]>

minor fix
Signed-off-by: yuhangh <[email protected]>

pre-commit fix
Signed-off-by: yuhangh <[email protected]>
Signed-off-by: Fanrong Li <[email protected]>

[None][fix] update trtllm sparse attention interface (#45)
* update trtllm sparse attention interface.
  Signed-off-by: Fanrong Li <[email protected]>
* fix interface.
  Signed-off-by: Fanrong Li <[email protected]>
---------
Signed-off-by: Fanrong Li <[email protected]>

fix rocketkv interface. (#47)
Signed-off-by: Fanrong Li <[email protected]>
1 parent 0879ca6 commit 8a9cdac

File tree

72 files changed: +2672 -584 lines


cpp/tensorrt_llm/common/attentionOp.cpp

Lines changed: 24 additions & 3 deletions
@@ -24,6 +24,7 @@
 #include "tensorrt_llm/kernels/gptKernels.h"
 #include "tensorrt_llm/kernels/kvCacheUtils.h"
 #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h"
+#include "tensorrt_llm/kernels/sparseAttentionKernels.h"
 #include "tensorrt_llm/kernels/unfusedAttentionKernels.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/utils/debugUtils.h"
@@ -287,6 +288,9 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
     xqaParams.output_sf = generationsParams.context_buf_sf;
     xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale;
     xqaParams.start_token_idx_sf = generationsParams.start_token_idx_sf;
+    // Parameters for sparse attention
+    xqaParams.sparse_params = mRuntimeSparseAttentionParams;
+    xqaParams.use_sparse_attention = useTllmGenSparseAttention();

     // Cross attention parameters.
     xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;
@@ -813,7 +817,7 @@ size_t AttentionOp::getWorkspaceSizeForContext(nvinfer1::DataType type, int32_t
 }

 size_t AttentionOp::getWorkspaceSizeForGeneration(nvinfer1::DataType type, int32_t max_num_seq,
-    int32_t max_attention_window_size, int32_t max_num_tokens) const noexcept
+    int32_t max_attention_window_size, int32_t max_num_tokens, int32_t max_blocks_per_sequence) const noexcept
 {
     if (max_num_tokens == 0)
     {
@@ -909,11 +913,15 @@ size_t AttentionOp::getWorkspaceSizeForGeneration(nvinfer1::DataType type, int32
     size_t xqa_workspace_size = 0;
     if (mEnableXQA)
     {
-        int const XQA_NUM_BUFFERS = 7;
+        int const XQA_NUM_BUFFERS = 8;
         size_t xqa_workspaces[XQA_NUM_BUFFERS];
         size_t const cu_seqlens_size = sizeof(int) * (batch_beam + 1);
         size_t const cu_kv_seqlens_size = sizeof(int) * (batch_beam + 1);
         size_t const rotary_inv_freq_size = sizeof(float) * batch_beam * mRotaryEmbeddingDim / 2;
+        // Two workspaces for sparse attention. One for the sequence lengths, and one for kv block offsets.
+        size_t const sparse_attn_cache_size = useTllmGenSparseAttention()
+            ? sizeof(int) * (batch_beam + batch_beam * 2 * max_blocks_per_sequence) * mNumKVHeads
+            : 0;
         xqa_workspaces[0] = cu_seqlens_size;
         xqa_workspaces[1] = cu_kv_seqlens_size;
         xqa_workspaces[2] = rotary_inv_freq_size;
@@ -922,7 +930,8 @@
         // Scales used for trtllm-gen kernels.
         xqa_workspaces[4] = sizeof(float) * 2;
         xqa_workspaces[5] = sizeof(float);
-        xqa_workspaces[6] = mXqaDispatcher->getWorkspaceSize(
+        xqa_workspaces[6] = sparse_attn_cache_size;
+        xqa_workspaces[7] = mXqaDispatcher->getWorkspaceSize(
             std::min<uint32_t>(mSpecDecodingMaxGenerationLength * max_num_seq, max_num_tokens));
         xqa_workspace_size
             = tc::calculateTotalWorkspaceSize(xqa_workspaces, XQA_NUM_BUFFERS, mXqaDispatcher->getWorkspaceAlignment());
@@ -1647,6 +1656,10 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
     preprocessingParams.spec_decoding_position_offsets = nullptr;
     preprocessingParams.logn_scaling = params.logn_scaling_ptr;

+    // Sparse KV write
+    preprocessingParams.sparse_kv_indices = mRuntimeSparseAttentionParams.sparse_kv_indices;
+    preprocessingParams.sparse_kv_offsets = mRuntimeSparseAttentionParams.sparse_kv_offsets;
+
     // Scalars
     preprocessingParams.batch_size = params.batch_size;
     preprocessingParams.max_input_seq_len = params.input_seq_length;
@@ -1676,6 +1689,8 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea

     preprocessingParams.rotary_vision_start = mVisionStart;
     preprocessingParams.rotary_vision_length = mVisionLength;
+    preprocessingParams.is_last_chunk
+        = !mAttentionChunkSize.has_value() || (params.input_seq_length == params.max_past_kv_length);

     {
         std::string const beforeRopeStr = "ctx attention before RoPE at layer " + std::to_string(mLayerIdx);
@@ -1841,6 +1856,12 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
             gatherInBuffer, params, cu_q_seqlens, cu_cp_partial_seqlens, stream);
         sync_check_cuda_error(stream);
     }
+
+    if (!mIsMLAEnabled) // Only for non-MLA attention
+    {
+        invokeKvCachePostprocessing(preprocessingParams, stream);
+        sync_check_cuda_error(stream);
+    }
 }
 else
 {
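The workspace math above is compact, so here is how the extra buffer breaks down. A minimal standalone sketch in plain C++; the helper name sparseAttnCacheSize and the main() driver are hypothetical, and only the formula is taken from the diff:

// Sketch of the sparse-attention workspace sizing used in
// getWorkspaceSizeForGeneration above. Hypothetical helper; the formula is
// copied from the diff, everything else is illustrative.
#include <cstddef>
#include <cstdio>

size_t sparseAttnCacheSize(int batch_beam, int max_blocks_per_sequence, int num_kv_heads)
{
    // Per KV head and per sequence: one gathered sequence length, plus a
    // [2, max_blocks_per_sequence] table of K/V page offsets.
    size_t const seq_lengths = static_cast<size_t>(batch_beam);
    size_t const block_offsets = static_cast<size_t>(batch_beam) * 2 * max_blocks_per_sequence;
    return sizeof(int) * (seq_lengths + block_offsets) * num_kv_heads;
}

int main()
{
    // Example: 8 sequences, at most 64 KV pages each, 4 KV heads:
    // (8 + 8 * 2 * 64) * 4 heads * 4 bytes = 16512 bytes.
    std::printf("%zu bytes\n", sparseAttnCacheSize(8, 64, 4));
    return 0;
}

These are the same two regions that buildXQALaunchParams later carves out as sparse_kv_block_offsets and sparse_seq_lengths.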

cpp/tensorrt_llm/common/attentionOp.h

Lines changed: 35 additions & 13 deletions
@@ -26,6 +26,7 @@
 #include "tensorrt_llm/kernels/gptKernels.h"
 #include "tensorrt_llm/kernels/kvCacheUtils.h"
 #include "tensorrt_llm/kernels/mlaKernels.h"
+#include "tensorrt_llm/kernels/sparseAttentionKernels.h"
 #include "tensorrt_llm/kernels/xqaDispatcher.h"
 #include <cassert>
 #include <set>
@@ -55,7 +56,7 @@ class AttentionOp
         int32_t cross_kv_length = 0, int32_t max_num_tokens = 0) const noexcept;
     // total_num_seq is the sum of beam_width for multiple requests
     [[nodiscard]] size_t getWorkspaceSizeForGeneration(nvinfer1::DataType type, int32_t total_num_seq,
-        int32_t max_attention_window_size, int32_t max_num_tokens) const noexcept;
+        int32_t max_attention_window_size, int32_t max_num_tokens, int32_t max_blocks_per_sequence) const noexcept;

     template <typename T>
     class EnqueueParams
@@ -156,14 +157,20 @@ class AttentionOp
         ss << "max_cyclic_attention_window_size: " << this->max_cyclic_attention_window_size << std::endl;
         ss << "can_use_one_more_block: " << (this->can_use_one_more_block ? "true" : "false") << std::endl;
         ss << "sink_token_length: " << this->sink_token_length << std::endl;
-        ss << "context_lengths: "
-           << *(runtime::ITensor::wrap((void*) this->context_lengths, nvinfer1::DataType::kINT32,
-                  runtime::ITensor::makeShape({batch_size})))
-           << std::endl;
-        ss << "sequence_lengths: "
-           << *(runtime::ITensor::wrap((void*) this->sequence_lengths, nvinfer1::DataType::kINT32,
-                  runtime::ITensor::makeShape({batch_size})))
-           << std::endl;
+        if (this->context_lengths && batch_size > 0)
+        {
+            ss << "context_lengths: "
+               << *(runtime::ITensor::wrap((void*) this->context_lengths, nvinfer1::DataType::kINT32,
+                      runtime::ITensor::makeShape({batch_size})))
+               << std::endl;
+        }
+        if (this->sequence_lengths && batch_size > 0)
+        {
+            ss << "sequence_lengths: "
+               << *(runtime::ITensor::wrap((void*) this->sequence_lengths, nvinfer1::DataType::kINT32,
+                      runtime::ITensor::makeShape({batch_size})))
+               << std::endl;
+        }
         ss << "kv_scale_orig_quant: " << this->kv_scale_orig_quant << std::endl;
         ss << "kv_scale_quant_orig: " << this->kv_scale_quant_orig << std::endl;
         ss << "attention_output_orig_quant: " << this->attention_output_orig_quant << std::endl;
@@ -348,6 +355,16 @@ class AttentionOp
         return mIsMLAEnabled;
     }

+    [[nodiscard]] bool useSparseAttention() const
+    {
+        return mUseSparseAttention && mPagedKVCache && mEnableXQA;
+    }
+
+    [[nodiscard]] bool useTllmGenSparseAttention() const
+    {
+        return mUseTllmGenSparseAttention && useSparseAttention();
+    }
+
     [[nodiscard]] int smVersion() const
     {
         return mSM;
@@ -427,6 +444,8 @@ class AttentionOp
     bool mIsMLAEnabled = false;
     bool mIsGenerationMLA = false;
     bool mUseGenFlashMLA = false;
+    bool mUseSparseAttention = false;
+    bool mUseTllmGenSparseAttention = false;
     tensorrt_llm::kernels::MlaMetaParams mMLAParams;
     int mCpSize = 1;
     int mCpRank = 0;
@@ -454,6 +473,8 @@ class AttentionOp
     // Whether to fuse FP4 quant into attention kernel.
     bool mFuseFp4Quant = false;

+    kernels::SparseAttentionParams mRuntimeSparseAttentionParams;
+
     // This is implementation details which we want to save when serializing, but not expose as
     // a plugin field or a constructor parameter
     int32_t mNbMultiBlockSemaphores = 0;
@@ -473,10 +494,11 @@ class AttentionOp
             mPosShiftEnabled, mPagedContextFMHA, mFP8ContextFMHA, mFP8AttenOutput, mFP8ContextMLA, mFP8GenerationMLA,
             mChunkPrefillBufferBatchSize, mDenseContextFMHA, mHasFullAttentionMask, mIsSpecDecodingEnabled,
             mUseSpecDecoding, mIsSpecDecTree, mSpecDecodingIsGenerationLengthVariable, mSpecDecodingMaxGenerationLength,
-            mIsMLAEnabled, mIsGenerationMLA, mUseGenFlashMLA, mMLAParams.data(), mCpSize, mCpRank, mCpGroup,
-            mNumAttnHeads, mNumAttnKVHeads, mNumKVHeadsOrigin, mAttnTpSize, mAttnTpRank, mAttnCpSize, mAttnCpRank,
-            mUlyssesMQABroadcast, mEnableContextFMHA, mFMHAForceFP32Acc, mMultiBlockMode, mEnableXQA, mUseKVCache,
-            mSkipAttn, mFuseFp4Quant, mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1));
+            mIsMLAEnabled, mIsGenerationMLA, mUseGenFlashMLA, mUseSparseAttention, mUseTllmGenSparseAttention,
+            mMLAParams.data(), mCpSize, mCpRank, mCpGroup, mNumAttnHeads, mNumAttnKVHeads, mNumKVHeadsOrigin,
+            mAttnTpSize, mAttnTpRank, mAttnCpSize, mAttnCpRank, mUlyssesMQABroadcast, mEnableContextFMHA,
+            mFMHAForceFP32Acc, mMultiBlockMode, mEnableXQA, mUseKVCache, mSkipAttn, mFuseFp4Quant,
+            mRuntimeSparseAttentionParams.data(), mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1));
     };

 private:
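sparseAttentionKernels.h itself is not shown in this commit view. Inferred purely from the call sites in this commit (sparse_kv_indices/sparse_kv_offsets in the KV-cache preprocessing, sparse_attn_indices/sparse_attn_offsets in the gather kernel, plus the data() and toString() calls), SparseAttentionParams plausibly looks like the sketch below; the real definition may well differ:

// Hypothetical sketch of SparseAttentionParams, reverse-engineered from the
// call sites in this commit only. Field names come from usage; layout
// comments are assumptions.
#include <cstdint>
#include <sstream>
#include <string>
#include <tuple>

struct SparseAttentionParams
{
    // Prefill: which KV token positions to keep when writing the KV cache.
    int32_t const* sparse_kv_indices = nullptr; // flattened per-request token indices
    int32_t const* sparse_kv_offsets = nullptr; // prefix sums into the above, size batch_size + 1
    // Generation: which KV cache pages each (head, request) attends to.
    int32_t const* sparse_attn_indices = nullptr; // flattened per-head page indices
    int32_t const* sparse_attn_offsets = nullptr; // prefix sums into the above, size batch_size + 1

    // Used by AttentionOp::data() when hashing/serializing the op configuration.
    auto data() const
    {
        return std::make_tuple(sparse_kv_indices, sparse_kv_offsets, sparse_attn_indices, sparse_attn_offsets);
    }

    // Used by XQAParams::toString() for debug dumps.
    std::string toString() const
    {
        std::stringstream ss;
        ss << "sparse_kv_indices: " << sparse_kv_indices << ", sparse_kv_offsets: " << sparse_kv_offsets
           << ", sparse_attn_indices: " << sparse_attn_indices << ", sparse_attn_offsets: " << sparse_attn_offsets;
        return ss.str();
    }
};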

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h

Lines changed: 13 additions & 0 deletions
@@ -233,6 +233,8 @@ struct XQALaunchParam
     float* bmm2_scale_ptr = nullptr;
     int32_t* semaphores = nullptr;
     void* scratch = nullptr;
+    void* sparse_kv_block_offsets = nullptr;
+    int32_t* sparse_seq_lengths = nullptr;
 };

 // Setup launch params and ioScratch. ioScratch is for RoPE and output type conversion.
@@ -266,6 +268,9 @@ void buildXQALaunchParams(XQALaunchParam<KVCacheBuffer>& launchParams, void*& in
     const size_t cu_kv_seqlens_size = sizeof(int) * (batch_beam_size + 1);
     const size_t rotary_inv_freq_size = sizeof(float) * batch_beam_size * params.rotary_embedding_dim / 2;
     const size_t tokens_info_size = sizeof(int2) * params.total_num_input_tokens;
+    const size_t kv_block_offsets_size
+        = sizeof(int) * batch_beam_size * 2 * params.max_blocks_per_sequence * params.num_kv_heads;
+    const size_t seq_lengths_size = sizeof(int) * batch_beam_size * params.num_kv_heads;
     launchParams.cu_seq_lens = reinterpret_cast<int*>(workspace);
     workspace = tensorrt_llm::common::nextWorkspacePtrWithAlignment(workspace, cu_seqlens_size);
     launchParams.cu_kv_seq_lens = reinterpret_cast<int*>(workspace);
@@ -281,6 +286,14 @@ void buildXQALaunchParams(XQALaunchParam<KVCacheBuffer>& launchParams, void*& in
     workspace = tensorrt_llm::common::nextWorkspacePtrWithAlignment(workspace, bmm1_scale_size);
     launchParams.bmm2_scale_ptr = reinterpret_cast<float*>(workspace);
     workspace = tensorrt_llm::common::nextWorkspacePtrWithAlignment(workspace, bmm2_scale_size);
+    // Used for block sparse attention
+    if (params.use_sparse_attention)
+    {
+        launchParams.sparse_kv_block_offsets = reinterpret_cast<void*>(workspace);
+        workspace = tensorrt_llm::common::nextWorkspacePtrWithAlignment(workspace, kv_block_offsets_size);
+        launchParams.sparse_seq_lengths = reinterpret_cast<int*>(workspace);
+        workspace = tensorrt_llm::common::nextWorkspacePtrWithAlignment(workspace, seq_lengths_size);
+    }
     inputScratch = workspace;
     if (hasOutputScratch)
     {
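The two sparse buffers are carved off the workspace with the same bump-and-align pattern as the other XQA scratch regions. A simplified, self-contained sketch of that pattern follows; the stand-in nextWorkspacePtrWithAlignment below only assumes "advance by size, round up to alignment" (the real helper lives in tensorrt_llm::common and may behave differently in detail):

// Sketch of the workspace-carving pattern used in buildXQALaunchParams.
// Helper and struct here are simplified stand-ins, not the library's API.
#include <cstddef>
#include <cstdint>

void* nextWorkspacePtrWithAlignment(void* ptr, size_t size, size_t alignment = 256)
{
    // Advance past the current buffer, then round up to the next aligned address.
    uintptr_t addr = reinterpret_cast<uintptr_t>(ptr) + size;
    addr = (addr + alignment - 1) / alignment * alignment;
    return reinterpret_cast<void*>(addr);
}

struct SparseLaunchSlices
{
    void* sparse_kv_block_offsets;
    int32_t* sparse_seq_lengths;
};

// Carve the two optional sparse buffers off the workspace tail, as the diff does.
SparseLaunchSlices carveSparseSlices(void*& workspace, size_t kv_block_offsets_size, size_t seq_lengths_size)
{
    SparseLaunchSlices slices{};
    slices.sparse_kv_block_offsets = workspace;
    workspace = nextWorkspacePtrWithAlignment(workspace, kv_block_offsets_size);
    slices.sparse_seq_lengths = reinterpret_cast<int32_t*>(workspace);
    workspace = nextWorkspacePtrWithAlignment(workspace, seq_lengths_size);
    return slices; // workspace now points at the remaining scratch (inputScratch in the diff)
}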

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h

Lines changed: 7 additions & 0 deletions
@@ -17,6 +17,7 @@
 #include "tensorrt_llm/common/quantization.h"
 #include "tensorrt_llm/kernels/gptKernels.h"
 #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h"
+#include "tensorrt_llm/kernels/sparseAttentionKernels.h"

 namespace tensorrt_llm
 {
@@ -109,6 +110,10 @@ struct XQAParams
     // for cross attention
     int32_t const* encoder_input_lengths = nullptr;

+    // sparse attention parameters
+    SparseAttentionParams sparse_params;
+    bool use_sparse_attention = false;
+
     cudaStream_t stream = 0;

     std::string toString() const
@@ -179,6 +184,8 @@ struct XQAParams
            << "is_fp8_output :" << (is_fp8_output ? "true" : "false") << std::endl
            << "fp8_out_scale :" << fp8_out_scale << std::endl
            << "encoder_input_lengths: " << encoder_input_lengths << std::endl
+           << "sparse_params: " << sparse_params.toString() << std::endl
+           << "use_sparse_attention :" << (use_sparse_attention ? "true" : "false") << std::endl
            << "stream :" << stream;

     return ss.str();
cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu (new file)

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+#include "tensorrt_llm/kernels/sparseAttentionKernels.h"
+#include <cub/cub.cuh>
+
+namespace tensorrt_llm
+{
+namespace kernels
+{
+template <int THREADS_PER_BLOCK>
+__global__ void gatherKvPageOffsetsKernel(
+    int32_t* output_kv_page_offsets, // [num_head_kv, batch_size, 2, max_num_pages_per_seq]
+    int32_t* output_seq_lengths,     // [num_head_kv, batch_size]
+    int32_t const* kv_page_offsets,  // [batch_size, 2, max_num_pages_per_seq]
+    int32_t const* seq_lengths,      // [batch_size]
+    SparseAttentionParams const sparse_params, int32_t const batch_size, int32_t const tokens_per_page,
+    int32_t const max_num_pages_per_seq)
+{
+    // Each CUDA block processes one sequence from the batch for one head.
+    int32_t const head_idx = blockIdx.x;
+    int32_t const batch_idx = blockIdx.y;
+    if (batch_idx >= batch_size)
+    {
+        return;
+    }
+
+    // Shared memory for reduction.
+    __shared__ typename cub::BlockReduce<Pair, THREADS_PER_BLOCK>::TempStorage temp_storage;
+
+    // Get the range of sparse indices and the sequence length.
+    int32_t const start_offset = sparse_params.sparse_attn_offsets[batch_idx];
+    int32_t const end_offset = sparse_params.sparse_attn_offsets[batch_idx + 1];
+    int32_t const total_pages = sparse_params.sparse_attn_offsets[batch_size];
+    int32_t const num_sparse_pages = end_offset - start_offset;
+    int32_t const original_seq_len = seq_lengths[batch_idx];
+
+    // Get global sparse index.
+    int32_t const sparse_idx_global = head_idx * total_pages + start_offset;
+
+    // Get the base memory offset. shape: [batch_size, 2, max_num_pages_per_seq]
+    size_t const src_base_offset = (size_t) batch_idx * 2 * max_num_pages_per_seq;
+    size_t const dst_base_offset = (size_t) head_idx * batch_size * 2 * max_num_pages_per_seq + src_base_offset;
+
+    // Initialize the local max page index and number of valid pages.
+    int32_t local_max_page_index = -1;
+    int32_t local_num_valid_pages = 0;
+
+    // Perform the gather operation.
+    for (int32_t i = threadIdx.x; i < num_sparse_pages; i += blockDim.x)
+    {
+        // Get the source idx and offset.
+        int32_t const src_idx = sparse_params.sparse_attn_indices[sparse_idx_global + i];
+        if (src_idx < 0)
+        {
+            continue;
+        }
+
+        // Update the local max page index.
+        local_max_page_index = max(local_max_page_index, src_idx);
+        local_num_valid_pages++;
+
+        // Get the source and destination offsets.
+        size_t const src_offset_dim0 = src_base_offset + 0 * max_num_pages_per_seq + src_idx;
+        size_t const src_offset_dim1 = src_base_offset + 1 * max_num_pages_per_seq + src_idx;
+        size_t const dst_offset_dim0 = dst_base_offset + 0 * max_num_pages_per_seq + i;
+        size_t const dst_offset_dim1 = dst_base_offset + 1 * max_num_pages_per_seq + i;
+
+        // Perform the gather operation: read from the sparse location and write to the dense location.
+        output_kv_page_offsets[dst_offset_dim0] = kv_page_offsets[src_offset_dim0];
+        output_kv_page_offsets[dst_offset_dim1] = kv_page_offsets[src_offset_dim1];
+    }
+
+    // Reduce the local max page indices and number of valid pages.
+    Pair local_pair = {local_max_page_index, local_num_valid_pages};
+    Pair result = cub::BlockReduce<Pair, THREADS_PER_BLOCK>(temp_storage).Reduce(local_pair, PairReduceOp());
+
+    // Update sequence length for this head and batch.
+    if (threadIdx.x == 0)
+    {
+        int32_t const max_page_index = result.max_val;
+        int32_t const num_valid_pages = result.sum_val;
+        int32_t const ori_valid_pages = (original_seq_len + tokens_per_page - 1) / tokens_per_page;
+        size_t const seq_len_offset = (size_t) head_idx * batch_size + batch_idx;
+        if (num_valid_pages > 0)
+        {
+            int32_t seq_len = original_seq_len - (ori_valid_pages - num_valid_pages) * tokens_per_page;
+            int32_t seq_len_remain = original_seq_len % tokens_per_page;
+            if (max_page_index != ori_valid_pages - 1 && seq_len_remain != 0)
+            {
+                seq_len += tokens_per_page - seq_len_remain;
+            }
+            output_seq_lengths[seq_len_offset] = seq_len;
+        }
+        else
+        {
+            output_seq_lengths[seq_len_offset] = 0;
+        }
+    }
+}
+
+// Host-side launcher function
+void invokeGatherKvPageOffsets(int32_t* output_kv_page_offsets, int32_t* output_seq_lengths,
+    int32_t const* kv_page_offsets, int32_t const* seq_lengths, SparseAttentionParams const sparse_params,
+    int32_t const batch_size, int32_t const num_head_kv, int32_t const tokens_per_page,
+    int32_t const max_num_pages_per_seq, cudaStream_t stream)
+{
+    // The grid.
+    dim3 grid(num_head_kv, batch_size, 1);
+    // The block.
+    dim3 block(256, 1, 1);
+    // Shared memory size.
+    size_t smem_size = sizeof(Pair) * 256;
+
+    // Launch the kernel.
+    gatherKvPageOffsetsKernel<256><<<grid, block, smem_size, stream>>>(output_kv_page_offsets, output_seq_lengths,
+        kv_page_offsets, seq_lengths, sparse_params, batch_size, tokens_per_page, max_num_pages_per_seq);
+}
+} // namespace kernels
+} // namespace tensorrt_llm
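The trickiest part of the kernel is the per-(head, request) sequence-length fix-up done by thread 0: dropping a page removes tokens_per_page tokens, except that if the partially filled last page is among the dropped pages, every kept page is full and the "missing" tail tokens must be added back. The host-side sketch below replays that arithmetic on concrete numbers (gatheredSeqLen is a hypothetical name; the body mirrors the kernel):

// Host-side illustration of the sequence-length correction in
// gatherKvPageOffsetsKernel above. For reasoning only; not part of the commit.
#include <cstdio>

int gatheredSeqLen(int original_seq_len, int tokens_per_page, int num_valid_pages, int max_page_index)
{
    if (num_valid_pages == 0)
    {
        return 0;
    }
    int const ori_valid_pages = (original_seq_len + tokens_per_page - 1) / tokens_per_page;
    // Each dropped page removes tokens_per_page tokens.
    int seq_len = original_seq_len - (ori_valid_pages - num_valid_pages) * tokens_per_page;
    int const seq_len_remain = original_seq_len % tokens_per_page;
    // If the partially filled last page was dropped, all kept pages are full,
    // so add back the tokens the partial page never had.
    if (max_page_index != ori_valid_pages - 1 && seq_len_remain != 0)
    {
        seq_len += tokens_per_page - seq_len_remain;
    }
    return seq_len;
}

int main()
{
    // 200 tokens with 64-token pages -> 4 pages; the last page holds 8 tokens.
    // Keep 2 pages including the last one: 200 - 2 * 64 = 72 tokens.
    std::printf("%d\n", gatheredSeqLen(200, 64, 2, 3)); // prints 72
    // Keep 2 full pages, last page dropped: 72 + (64 - 8) = 128 tokens.
    std::printf("%d\n", gatheredSeqLen(200, 64, 2, 2)); // prints 128
    return 0;
}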
