
Commit b12a7dd

Merge branch 'main' into user/pengyunl/disagg_check
2 parents: 7ddd65c + 1f39a11

120 files changed: +3025 additions, -764 deletions (large commit; only a subset of the file diffs is shown below).


.coderabbit.yaml (0 additions, 1 deletion)

@@ -27,7 +27,6 @@ reviews:
   related_issues: true
   related_prs: true
   suggested_labels: true
-  auto_apply_labels: true
   suggested_reviewers: true
   auto_assign_reviewers: true
   poem: false

.github/workflows/label_issue.yml (47 additions, 0 deletions)

@@ -0,0 +1,47 @@
+name: Label New Issues
+
+on:
+  issues:
+    types: [opened]
+
+permissions:
+  issues: write
+  contents: read
+
+jobs:
+  label-issue:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout private action repository
+        uses: actions/checkout@v4
+        with:
+          repository: poweiw/goggles_action
+          path: ./.github/actions/goggles_action # local path to store the action
+          token: ${{ secrets.GOGGLES_ACTION_REPO_TOKEN}} # token to access poweiw/goggles_action
+          ref: v1.2.1
+
+      - name: AI Label Issue
+        uses: ./.github/actions/goggles_action/actions/llm_label
+        with:
+          ACTION_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          LLM_MODEL_NAME: ${{ secrets.GOGGLES_LLM_MODEL_NAME }}
+          LLM_TOKEN_SERVER_URL: ${{ secrets.GOGGLES_LLM_TOKEN_SERVER_URL }}
+          LLM_TOKEN_CLIENT_ID: ${{ secrets.GOGGLES_LLM_TOKEN_CLIENT_ID }}
+          LLM_TOKEN_CLIENT_SECRET: ${{ secrets.GOGGLES_LLM_TOKEN_CLIENT_SECRET }}
+          LLM_GENERATE_URL: ${{ secrets.GOGGLES_LLM_GENERATE_URL }}
+          LLM_TOKEN_SCOPE: ${{ secrets.GOGGLES_LLM_TOKEN_SCOPE }}
+          REPO_OWNER: ${{ github.repository_owner }}
+          REPO_NAME: ${{ github.event.repository.name }}
+          ISSUE_NUMBER: ${{ github.event.issue.number }}
+          ISSUE_TITLE: ${{ github.event.issue.title }}
+          ISSUE_BODY: ${{ github.event.issue.body }}
+          GITHUB_API_URL: ${{ github.api_url }}
+          ACTIONS_STEP_VERBOSE: false
+          EXCLUDED_LABELS: "bug,Community want to contribute,Community Engagement,duplicate,help wanted,Investigating,need more info,question,roadmap,stale,waiting for feedback,wontfix"
+          LLM_SYSTEM_PROMPT: |
+            You are an expert GitHub issue labeler. Your task is to analyze the provided issue title, issue body, and a list of available labels with their descriptions.
+            Based on this information, select the single most appropriate label from the list that best captures the primary issue or request.
+            Prefer selecting only one label that represents the main topic or problem. Only suggest multiple labels if the issue genuinely spans multiple distinct areas that are equally important.
+            Respond with ONLY the chosen label name (e.g., 'bug', 'feature-request') or comma-separated names if multiple are truly needed.
+            If no labels seem appropriate, respond with 'NONE'.
+            Do not add any other text, explanation, or markdown formatting.

benchmarks/cpp/prepare_dataset.py (18 additions, 9 deletions)

@@ -16,10 +16,8 @@
 from typing import Optional, Tuple

 import click
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, model_validator
 from transformers import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 from utils.prepare_real_data import dataset
 from utils.prepare_synthetic_data import token_norm_dist, token_unif_dist

@@ -30,20 +28,25 @@ class RootArgs(BaseModel):
     random_seed: int
     task_id: int
     std_out: bool
+    trust_remote_code: bool = False
     rand_task_id: Optional[Tuple[int, int]]
     lora_dir: Optional[str] = None

-    @field_validator('tokenizer')
-    def get_tokenizer(cls,
-                      v: str) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
+    @model_validator(mode='after')
+    def validate_tokenizer(self):
         try:
-            tokenizer = AutoTokenizer.from_pretrained(v, padding_side='left')
+            tokenizer = AutoTokenizer.from_pretrained(
+                self.tokenizer,
+                padding_side='left',
+                trust_remote_code=self.trust_remote_code)
         except EnvironmentError as e:
             raise ValueError(
                 f"Cannot find a tokenizer from the given string because of {e}\nPlease set tokenizer to the directory that contains the tokenizer, or set to a model name in HuggingFace."
             )
         tokenizer.pad_token = tokenizer.eos_token
-        return tokenizer
+        self.tokenizer = tokenizer
+
+        return self


 @click.group()
@@ -82,6 +85,11 @@ def get_tokenizer(cls,
               default="info",
               type=click.Choice(['info', 'debug']),
               help="Logging level.")
+@click.option("--trust-remote-code",
+              is_flag=True,
+              default=False,
+              envvar="TRUST_REMOTE_CODE",
+              help="Trust remote code.")
 @click.pass_context
 def cli(ctx, **kwargs):
     """This script generates dataset input for gptManagerBenchmark."""
@@ -98,7 +106,8 @@ def cli(ctx, **kwargs):
         random_seed=kwargs['random_seed'],
         task_id=kwargs['task_id'],
         rand_task_id=kwargs['rand_task_id'],
-        lora_dir=kwargs['lora_dir'])
+        lora_dir=kwargs['lora_dir'],
+        trust_remote_code=kwargs['trust_remote_code'])


 cli.add_command(dataset)

cpp/include/tensorrt_llm/executor/executor.h (8 additions, 1 deletion)

@@ -1484,7 +1484,8 @@ class ExecutorConfig
         std::optional<GuidedDecodingConfig> guidedDecodingConfig = std::nullopt,
         std::optional<std::vector<AdditionalModelOutput>> additionalModelOutputs = std::nullopt,
         std::optional<CacheTransceiverConfig> cacheTransceiverConfig = std::nullopt,
-        bool gatherGenerationLogits = false, bool promptTableOffloading = false, bool enableTrtOverlap = false);
+        bool gatherGenerationLogits = false, bool promptTableOffloading = false, bool enableTrtOverlap = false,
+        bool failFastOnAttentionWindowTooLarge = false);

     [[nodiscard]] SizeType32 getMaxBeamWidth() const;
     [[nodiscard]] SchedulerConfig getSchedulerConfig() const;
@@ -1519,6 +1520,7 @@ class ExecutorConfig
     [[nodiscard]] bool getPromptTableOffloading() const;
     [[nodiscard]] std::optional<CacheTransceiverConfig> getCacheTransceiverConfig() const;
     [[nodiscard]] bool getEnableTrtOverlap() const;
+    [[nodiscard]] bool getFailFastOnAttentionWindowTooLarge() const;

     void setMaxBeamWidth(SizeType32 maxBeamWidth);
     void setMaxBatchSize(SizeType32 maxBatchSize);
@@ -1548,6 +1550,7 @@ class ExecutorConfig
     void setPromptTableOffloading(bool promptTableOffloading);
     void setCacheTransceiverConfig(CacheTransceiverConfig const& cacheTransceiverConfig);
     void setEnableTrtOverlap(bool enableTrtOverlap);
+    void setFailFastOnAttentionWindowTooLarge(bool failFastOnAttentionWindowTooLarge);

private:
    friend class Serialization;
@@ -1634,6 +1637,10 @@ class ExecutorConfig

     /// @brief Controls whether preparation and TRT engine execution should be overlapped.
     bool mEnableTrtOverlap{false};
+
+    /// @brief Controls whether to fail fast when attention window is too large to fit even a single sequence in the KV
+    /// cache.
+    bool mFailFastOnAttentionWindowTooLarge{false};
 };

 struct KVCacheCreatedData
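For reference, a minimal sketch (not part of this commit) of how the new fail-fast option could be toggled through the accessors declared above. It assumes ExecutorConfig stays default-constructible and lives in namespace tensorrt_llm::executor, which is how the rest of this header is conventionally used.

// Sketch only: exercising the new getter/setter pair declared in executor.h.
#include "tensorrt_llm/executor/executor.h"

#include <cassert>

int main()
{
    tensorrt_llm::executor::ExecutorConfig config; // assumed default-constructible

    // The new member defaults to false, matching mFailFastOnAttentionWindowTooLarge{false}.
    assert(!config.getFailFastOnAttentionWindowTooLarge());

    // Opt in to failing fast when the attention window cannot fit even a single sequence in the KV cache.
    config.setFailFastOnAttentionWindowTooLarge(true);
    assert(config.getFailFastOnAttentionWindowTooLarge());
    return 0;
}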

cpp/include/tensorrt_llm/kernels/archCondition.h (16 additions, 7 deletions)

@@ -24,7 +24,22 @@ namespace detail

 #ifdef __CUDA_ARCH__

-#ifdef __CUDA_ARCH_SPECIFIC__
+// __CUDA_ARCH_SPECIFIC__ is only available starting from CUDA 12.9
+#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9))
+#define HAS_CUDA_SPECIFIC_MACRO 1
+
+#if __CUDA_ARCH__ >= 900
+#if !defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)
+#error "Compiling for SM90 or newer architectures must use Arch specific or Arch Family specific target"
+#endif
+#endif
+
+#else
+#define HAS_CUDA_SPECIFIC_MACRO 0
+#endif
+
+// For CUDA < 12.9, we assume that sm90 or newer architectures are always built with arch specific.
+#if defined(__CUDA_ARCH_SPECIFIC__) || (!HAS_CUDA_SPECIFIC_MACRO && __CUDA_ARCH__ >= 900)
 static constexpr bool isArchSpecific = true;
 #else
 static constexpr bool isArchSpecific = false;
@@ -52,12 +67,6 @@ struct arch_info

 #endif

-#if __CUDA_ARCH__ >= 900
-#if !defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)
-#error "Compiling for SM90 or newer architectures must use Arch specific or Arch Family specific target"
-#endif
-#endif
-
 } // namespace detail

 namespace arch
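The toolkit-version gate above, restated in isolation as a quick reference. The defined() guard is an addition here (not part of the diff) that keeps the expression well-formed in host-only translation units, where nvcc's version macros are absent.

// Illustrative restatement of the CUDA 12.9 gate. __CUDACC_VER_MAJOR__ and
// __CUDACC_VER_MINOR__ are defined by nvcc; outside device compilation the
// gate simply evaluates to 0.
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) \
    && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9))
#define HAS_CUDA_SPECIFIC_MACRO 1
#else
#define HAS_CUDA_SPECIFIC_MACRO 0
#endif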

cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp (30 additions, 4 deletions)

@@ -166,6 +166,9 @@ void CacheFormatter::format(TransferSession& session)
     auto const numPools = blockManager.getNumPools();
     // TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...

+    auto lastTokenTime = llmRequest.getPerfMetrics().timingMetrics.lastTokenTime;
+    bool recordDelay = lastTokenTime != std::chrono::steady_clock::time_point();
+
     bool layerWise = common::getEnvDisaggLayerwise() && numPools == 1;
     if (layerWise)
     {
@@ -350,9 +353,14 @@ void CacheFormatter::format(TransferSession& session)
        }

        auto endTime = std::chrono::steady_clock::now();
+        double delay = 0.0;
+        if (recordDelay)
+        {
+            delay = std::chrono::duration<double, std::milli>(startTime - lastTokenTime).count();
+        }
        double cacheTransferTime
            = std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
-        kvCacheMeasureHelper.appendKVCacheTransfer(llmRequest.mRequestId, cacheTransferTime, size);
+        kvCacheMeasureHelper.appendKVCacheTransfer(llmRequest.mRequestId, delay, cacheTransferTime, size);
    };

    if (connections.size() > 1)
@@ -408,16 +416,19 @@ void CacheFormatter::unformat(TransferSession& session)
 {
     NVTX3_SCOPED_RANGE(CacheFormatter_unformat);
     auto const& llmRequest = session.getLlmRequest();
+    auto const ctxReqId = llmRequest.getContextPhaseParams().value().getReqId();
     TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
-        "Start receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId,
-        llmRequest.getContextPhaseParams().value().getReqId());
+        "Start receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId, ctxReqId);
     auto const& connections = session.getConnections();
     auto const& selfConfig = session.getSelfState().getCacheState().value();
     auto const& destConfig = session.getOtherState().getCacheState().value();
     auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
     auto& bufferManager = session.getBufferManager();
     auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest);

+    auto arrivalTime = llmRequest.getPerfMetrics().timingMetrics.arrivalTime;
+    bool recordDelay = arrivalTime != std::chrono::steady_clock::time_point();
+
     auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);

     TLLM_LOG_DEBUG("pickUpConnections size: %d connections size: %d", pickUpConnections.size(), connections.size());
@@ -546,7 +557,7 @@ void CacheFormatter::unformat(TransferSession& session)
        }
        TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
            "End receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId,
-            llmRequest.getContextPhaseParams().value().getReqId());
+            ctxReqId);
        return;
    }
    // legacyPath: context executor rank only send data to one gen executor rank. it sends multiple cache
@@ -634,6 +645,8 @@ void CacheFormatter::unformat(TransferSession& session)
        TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
        TLLM_CHECK(pickUpConnections.size() > processIdx);
        TLLM_CHECK(recvSplitCaches.size() > processIdx);
+        auto startTime = std::chrono::steady_clock::now();
+        size_t size = 0;
        if (legacyPath)
        {
            size_t idx = processIdx * blockNum;
@@ -645,6 +658,7 @@ void CacheFormatter::unformat(TransferSession& session)
                size_t recvBufferIdx = blockIdx * pickUpConnections.size() + commIdx;
                llmRequest.updateKvCacheSize((*recvSplitCaches[recvBufferIdx]).getSizeInBytes());
                auto& buffer = recvSplitCaches.at(recvBufferIdx);
+                size += buffer->getSizeInBytes();
                session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
                idx++;
            }
@@ -655,6 +669,7 @@ void CacheFormatter::unformat(TransferSession& session)
            {
                llmRequest.updateKvCacheSize((*recvSplitCaches.at(processIdx)).getSizeInBytes());
                auto& buffer = recvSplitCaches[processIdx];
+                size = buffer->getSizeInBytes();
                session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
            }
            else if (bufferCoverTargetNum > 0)
@@ -663,6 +678,7 @@ void CacheFormatter::unformat(TransferSession& session)
                    + remainNoCoverTargetNum; // caches.at(recvBufferIdx) is allocated by cudaMalloc
                llmRequest.updateKvCacheSize((*recvSplitCaches.at(recvBufferIdx)).getSizeInBytes());
                auto& buffer = recvSplitCaches.at(recvBufferIdx);
+                size = buffer->getSizeInBytes();
                session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
                bufferManager.copy(*recvSplitCaches.at(recvBufferIdx), *recvSplitCaches[processIdx]);
                bufferManager.getStream().synchronize();
@@ -679,6 +695,7 @@ void CacheFormatter::unformat(TransferSession& session)
                    auto recvSlice = runtime::ITensor::slice(preAllocRecvBuffer, 0, recvSize);
                    auto copySlice = runtime::ITensor::slice(
                        recvSplitCaches[processIdx], targetBufferSize - remainRecvSize, recvSize);
+                    size += recvSlice->getSizeInBytes();
                    llmRequest.updateKvCacheSize((*recvSlice).getSizeInBytes());
                    session.recv(pickUpConnections[processIdx], recvSlice->data(), recvSlice->getSizeInBytes());
                    bufferManager.copy(*recvSlice, *copySlice);
@@ -687,6 +704,15 @@ void CacheFormatter::unformat(TransferSession& session)
                }
            }
        }
+        auto endTime = std::chrono::steady_clock::now();
+        double delay = 0.0;
+        if (recordDelay)
+        {
+            delay = std::chrono::duration<double, std::milli>(startTime - arrivalTime).count();
+        }
+        double cacheTransferTime
+            = std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
+        kvCacheMeasureHelper.appendKVCacheTransfer(ctxReqId, delay, cacheTransferTime, size);
    };
    if (pickUpConnections.size() > 1)
    {
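A self-contained sketch of the timing pattern used in both format() and unformat() above: a default-constructed steady_clock::time_point serves as the "metric not recorded" sentinel, and delay and transfer time are reported in milliseconds as doubles. Names below are illustrative stand-ins, not taken from the TensorRT-LLM sources.

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <thread>

int main()
{
    using Clock = std::chrono::steady_clock;

    // In the real code this comes from llmRequest.getPerfMetrics().timingMetrics;
    // a default-constructed time_point means the metric was never recorded.
    Clock::time_point lastTokenTime = Clock::now();
    bool recordDelay = lastTokenTime != Clock::time_point();

    auto startTime = Clock::now();
    std::this_thread::sleep_for(std::chrono::milliseconds(5)); // stand-in for the cache transfer
    auto endTime = Clock::now();

    double delay = 0.0;
    if (recordDelay)
    {
        delay = std::chrono::duration<double, std::milli>(startTime - lastTokenTime).count();
    }
    double cacheTransferTime = std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
    std::printf("delay=%.3f ms, transfer=%.3f ms\n", delay, cacheTransferTime);
    return 0;
}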

cpp/tensorrt_llm/batch_manager/cacheFormatter.h (9 additions, 1 deletion)

@@ -76,6 +76,15 @@ class BaseCacheFormatter

     /// @brief Destructor.
     virtual ~BaseCacheFormatter() = default;
+
+    // TODO: better way for context/generation tagging
+    void markAsSender(bool isSender)
+    {
+        kvCacheMeasureHelper.markAsSender(isSender);
+    }
+
+protected:
+    KvCacheMeasureHelper kvCacheMeasureHelper{common::getEnvKVCacheTransferOutputPath()};
 };

 // Simple cache block copy. Because it does not involve data splitting or merging, it performs best when the
@@ -115,7 +124,6 @@ class CacheFormatter final : public BaseCacheFormatter
private:
    BaseKVCacheManager* mCacheManager;
    CacheTransBufferManager* mCacheTransBufferManager;
-    KvCacheMeasureHelper kvCacheMeasureHelper{common::getEnvKVCacheTransferOutputPath()};
 };

 std::unique_ptr<BaseCacheFormatter> createCacheFormatter(
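A toy illustration of the refactor above: the measurement helper moves out of the concrete formatter and into the protected section of the base class, so both the context (sender) and generation (receiver) sides can tag the same helper through markAsSender(). All class and member names below are made up for illustration.

#include <iostream>

struct MeasureHelper
{
    void markAsSender(bool isSender) { mIsSender = isSender; }
    bool mIsSender{false};
};

class BaseFormatter
{
public:
    virtual ~BaseFormatter() = default;

    void markAsSender(bool isSender) { mHelper.markAsSender(isSender); }

protected:
    MeasureHelper mHelper; // shared by every derived formatter
};

class ToyCacheFormatter final : public BaseFormatter
{
public:
    void report() const { std::cout << (mHelper.mIsSender ? "sender (context)\n" : "receiver (generation)\n"); }
};

int main()
{
    ToyCacheFormatter formatter;
    formatter.markAsSender(true); // the context executor side sends the KV cache
    formatter.report();
    return 0;
}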
