Skip to content

Commit d718f78

Browse files
authored
Merge branch 'main' into dev-fxiong-drafter
2 parents c3792ef + 9518e14 commit d718f78

File tree

82 files changed

+703
-335
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

82 files changed

+703
-335
lines changed

.coderabbit.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
17+
language: "en-US"
18+
reviews:
19+
auto_review:
20+
drafts: true
21+
base_branches: ["main", "release/.+"]
22+
commit_status: false

.github/pull_request_template.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
1+
@coderabbitai summary
12

2-
# PR title
3-
4-
Please write the PR title by following template:
3+
<!--
4+
Please write the PR title by following this template:
55
6-
[JIRA ticket link/nvbug link/github issue link][fix/feat/doc/infra/...] \<summary of this PR\>
6+
[JIRA ticket/NVBugs ID/GitHub issue][fix/feat/doc/infra/...] \<summary of this PR\>
77
8-
For example, assume I have a PR hope to support a new feature about cache manager of Jira TRTLLM-1000 ticket, it would be like
8+
For example, assume I have a PR to support a new feature about cache manager for JIRA ticket TRTLLM-1000, it would be like:
99
1010
[TRTLLM-1000][feat] Support a new feature about cache manager
1111
12+
Or I have a PR to fix a Llama3 accuracy issue:
13+
14+
[https://nvbugs/1234567][fix] Fix Llama3 accuracy issue
15+
-->
16+
1217
## Description
1318

19+
<!--
1420
Please explain the issue and the solution in short.
21+
-->
1522

1623
## Test Coverage
1724

benchmarks/cpp/disaggServerBenchmark.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,8 @@ class DisaggExecutorServer
636636
: texec::DecodingMode::Auto(),
637637
benchmarkParams.executorLookaheadConfig, benchmarkParams.medusaChoices));
638638
executorConfig.setExtendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig);
639+
executorConfig.setCacheTransceiverConfig(
640+
texec::CacheTransceiverConfig(texec::CacheTransceiverConfig::BackendType::DEFAULT));
639641
constexpr int maxIterationsForRequestStats = 1000;
640642
if (mEnableCollectKvCacheTransferTime)
641643
{

cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -70,28 +70,20 @@ class BaseCacheTransceiver
7070
class CacheTransceiver : public BaseCacheTransceiver
7171
{
7272
public:
73-
enum class CommType : std::uint8_t
74-
{
75-
UNKNOWN = 0,
76-
MPI = 1,
77-
UCX = 2,
78-
NIXL = 3
79-
};
80-
81-
CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
73+
CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager,
8274
executor::kv_cache::CacheState::ModelConfig const& cacheStateModelCfg, runtime::WorldConfig const& worldConfig,
8375
nvinfer1::DataType dataType,
8476
executor::kv_cache::CacheState::AttentionType attentionType
8577
= executor::kv_cache::CacheState::AttentionType::kDEFAULT,
8678
std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig = std::nullopt);
8779

88-
CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
89-
std::vector<SizeType32> numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
90-
runtime::WorldConfig const& worldConfig, nvinfer1::DataType dataType,
80+
CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, std::vector<SizeType32> numKvHeadsPerLayer,
81+
SizeType32 sizePerHead, SizeType32 tokensPerBlock, runtime::WorldConfig const& worldConfig,
82+
nvinfer1::DataType dataType,
9183
executor::kv_cache::CacheState::AttentionType attentionType
9284
= executor::kv_cache::CacheState::AttentionType::kDEFAULT,
9385
std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig = std::nullopt)
94-
: CacheTransceiver(cacheManager, commType,
86+
: CacheTransceiver(cacheManager,
9587
executor::kv_cache::CacheState::ModelConfig{numKvHeadsPerLayer, sizePerHead, tokensPerBlock}, worldConfig,
9688
dataType, attentionType, cacheTransceiverConfig)
9789
{
@@ -118,7 +110,6 @@ class CacheTransceiver : public BaseCacheTransceiver
118110

119111
void setContextState(LlmRequest* llmRequest);
120112

121-
CommType mCommType;
122113
std::unique_ptr<DataResponder> mDataResponder;
123114
std::unique_ptr<DataRequester> mDataRequester;
124115
std::vector<std::pair<LlmRequest*, std::future<void>>> mResponderFutures;

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1430,18 +1430,29 @@ class LogitsPostProcessorConfig
14301430
class CacheTransceiverConfig
14311431
{
14321432
public:
1433-
explicit CacheTransceiverConfig(std::optional<size_t> maxNumTokens = std::nullopt);
1433+
enum class BackendType : std::uint8_t
1434+
{
1435+
DEFAULT = 0,
1436+
MPI = 1,
1437+
UCX = 2,
1438+
NIXL = 3
1439+
};
1440+
explicit CacheTransceiverConfig(
1441+
std::optional<BackendType> backendType = std::nullopt, std::optional<size_t> maxNumTokens = std::nullopt);
14341442

14351443
bool operator==(CacheTransceiverConfig const& other) const;
1444+
void setBackendType(std::optional<BackendType> backendType);
1445+
void setMaxTokensInBuffer(std::optional<size_t> maxTokensInBuffer);
14361446

1437-
[[nodiscard]] std::optional<size_t> getMaxNumTokens() const;
1438-
void setMaxNumTokens(size_t maxNumTokens);
1447+
[[nodiscard]] std::optional<size_t> getMaxTokensInBuffer() const;
1448+
[[nodiscard]] std::optional<BackendType> getBackendType() const;
14391449

14401450
private:
1451+
std::optional<BackendType> mBackendType;
14411452
/// @brief The maximum number of tokens that the CacheTransceiver's pre-allocated buffer can hold. If the number of
14421453
/// kvCache tokens to be transferred for a single request is greater than this value, the performance of the cache
14431454
/// transfer may be degraded.
1444-
std::optional<size_t> mMaxNumTokens;
1455+
std::optional<size_t> mMaxTokensInBuffer;
14451456
};
14461457

14471458
/// @brief Configuration class for the model executor

cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ CacheTransBufferManager::CacheTransBufferManager(
210210
{
211211
auto poolIdx = mCacheManager->getBlockManager().getLayerPoolIdx(layerId);
212212
auto windowSize = static_cast<size_t>(mCacheManager->getBlockManager().getPoolWindowSize(poolIdx));
213-
auto validTokenNum = windowSize < maxNumTokens.value() ? windowSize : maxNumTokens.value();
213+
auto validTokenNum = (windowSize < maxNumTokens.value() ? windowSize : maxNumTokens.value());
214214
bufferSizeFromMaxNumToken += validTokenNum * kvCacheByteSizePerTokenPerLayer;
215215
}
216216
}
@@ -230,26 +230,37 @@ CacheTransBufferManager::CacheTransBufferManager(
230230
TLLM_LOG_INFO(
231231
"CacheTransBufferManager: mMaxNumTokens:%ld, mRecvBufferCount:%ld, "
232232
"mSendBufferCount:%ld,mTransferBufferSize:%ld, mPreAllocBufferSize:%ld,mOnlyUseDynamicBuffer:%d "
233-
"mUseFabricMemory:%d",
233+
"mUseFabricMemory:%d mDataType:%d",
234234
maxNumTokens.has_value() ? maxNumTokens.value() : 0, mRecvBufferCount, mSendBufferCount, mTransferBufferSize,
235-
mPreAllocBufferSize, mOnlyUseDynamicBuffer, mUseFabricMemory);
236-
bool to_allocate = common::getEnvUseMPIKvCache() || common::getEnvUseUCXKvCache() || common::getEnvUseNixlKvCache();
235+
mPreAllocBufferSize, mOnlyUseDynamicBuffer, mUseFabricMemory, mDataType);
237236

238-
TLLM_CHECK_WITH_INFO(to_allocate, "CacheTransBufferManager: to_allocate is false");
239237
allocateBuffer();
240238
}
241239

242-
size_t CacheTransBufferManager::preAllocBufferSize(std::optional<size_t> maxNumTokens)
240+
size_t CacheTransBufferManager::preAllocBufferSize(
241+
std::map<SizeType32, SizeType32> const& cacheSizeBytesPerTokenPerWindow,
242+
std::optional<executor::CacheTransceiverConfig> const& cacheTransceiverConfig)
243243
{
244-
bool to_allocate = common::getEnvUseMPIKvCache() || common::getEnvUseUCXKvCache() || common::getEnvUseNixlKvCache();
245-
if (!to_allocate)
244+
if (!cacheTransceiverConfig.has_value())
246245
{
247246
return 0;
248247
}
248+
if (!cacheTransceiverConfig->getBackendType().has_value())
249+
{
250+
return 0;
251+
}
252+
auto maxNumTokens = cacheTransceiverConfig->getMaxTokensInBuffer();
249253
size_t TransferBufferSize = common::getEnvMemSizeForKVCacheTransferBuffer();
250254
if (maxNumTokens.has_value())
251255
{
252-
TransferBufferSize = maxNumTokens.value();
256+
TransferBufferSize = 0;
257+
for (auto const& [windowSize, cacheSizeBytesPerToken] : cacheSizeBytesPerTokenPerWindow)
258+
{
259+
auto validTokenNum
260+
= (static_cast<size_t>(windowSize) < maxNumTokens.value() ? static_cast<size_t>(windowSize)
261+
: maxNumTokens.value());
262+
TransferBufferSize += validTokenNum * cacheSizeBytesPerToken;
263+
}
253264
}
254265
bool useFabricMemory = FabricMemory::supportFbaricMemory()
255266
&& (!(common::getEnvKVCacheTransferUseSyncBuffer() || common::getEnvKVCacheTransferUseAsyncBuffer()));
@@ -329,6 +340,14 @@ std::tuple<std::vector<runtime::ITensor::SharedPtr>, size_t, bool> CacheTransBuf
329340
size_t bufferCoverTargetNum = std::min(
330341
static_cast<size_t>(targetNum), mTransferBufferSize / (targetBufferEleSize * common::getDTypeSize(mDataType)));
331342
TLLM_LOG_DEBUG("getOrAllocateBuffers bufferCoverTargetNum:%d", bufferCoverTargetNum);
343+
if (bufferCoverTargetNum < static_cast<size_t>(targetNum))
344+
{
345+
TLLM_LOG_WARNING(
346+
"CacheTransceiver getOrAllocateBuffers: bufferCoverTargetNum:%d < targetNum:%d, may use dynamic buffer, "
347+
"it's better to increase MaxTokensInBuffer in cacheTransceiverConfig, otherwise, the performance may "
348+
"be degraded",
349+
bufferCoverTargetNum, targetNum);
350+
}
332351
if (bufferId.has_value())
333352
{
334353
TLLM_CHECK(static_cast<size_t>(bufferId.value()) < concurrenceResource.mBuffers.size());

cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#pragma once
1919

2020
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
21+
#include "tensorrt_llm/executor/executor.h"
2122
#include "tensorrt_llm/runtime/bufferManager.h"
2223
#include "tensorrt_llm/runtime/iTensor.h"
2324
#include <atomic>
@@ -59,7 +60,8 @@ class CacheTransBufferManager
5960
CacheTransBufferManager(
6061
KVCacheManager::BaseKVCacheManager* cacheManager, std::optional<size_t> maxNumTokens = std::nullopt);
6162

62-
static size_t preAllocBufferSize(std::optional<size_t> maxNumTokens = std::nullopt);
63+
static size_t preAllocBufferSize(std::map<SizeType32, SizeType32> const& cacheSizeBytesPerTokenPerWindow,
64+
std::optional<executor::CacheTransceiverConfig> const& cacheTransceiverConfig = std::nullopt);
6365

6466
std::optional<int> assignBufferIndexForSend();
6567
void freeBufferIndexForSend(std::optional<int> bufferId);

0 commit comments

Comments
 (0)