
Commit 0236a0a

chore: Adjust cache indirection passing to AttentionMetadata
- Moved the cache indirection buffer into AttentionMetadata instead of TrtllmAttentionMetadata.
- Updated PyTorchModelEngine to pass the cache indirection buffer conditionally, based on the attention backend.
- Combined the beam search test cases for overlap scheduling and CUDA graphs.
- Adjusted the size estimation of the cache indirection buffer in model_engine to correctly cover overlap scheduling.

Signed-off-by: Stefan Niebler <[email protected]>
1 parent 974c6c4 commit 0236a0a

4 files changed, +15 −17 lines changed


tensorrt_llm/_torch/attention_backend/interface.py

Lines changed: 3 additions & 0 deletions
@@ -135,6 +135,9 @@ class AttentionMetadata:
     _num_ctx_tokens: int = field(init=False, default=0, repr=False)
     _num_tokens: int = field(init=False, default=0, repr=False)
 
+    # This buffer is currently only used for TrtllmAttentionMetadata.
+    cache_indirection: Optional[torch.Tensor] = None
+
     def __post_init__(self) -> None:
         if self.is_cross:
             assert self.cross is None or self.cross is self, "Cross attention metadata should not have sub metadata"
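
For orientation, here is a minimal standalone sketch (plain dataclasses, not the real tensorrt_llm classes; every name other than cache_indirection is made up) of what hoisting the buffer into the base AttentionMetadata means for callers: the buffer can now be passed to any backend's metadata, and backends that ignore it simply see None.

```python
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class AttentionMetadataSketch:
    """Hypothetical stand-in for the AttentionMetadata base class."""
    max_num_requests: int
    # The beam-search cache indirection buffer now lives on the base class,
    # so it can be passed to any backend's metadata; backends that do not
    # use it simply leave it as None.
    cache_indirection: Optional[torch.Tensor] = None


# Callers construct the metadata the same way regardless of backend.
meta = AttentionMetadataSketch(
    max_num_requests=8,
    cache_indirection=torch.zeros((8, 4, 256), dtype=torch.int32),
)
print(meta.cache_indirection.shape)  # torch.Size([8, 4, 256])
```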

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 1 addition & 2 deletions
@@ -517,10 +517,9 @@ def is_nvfp4_output_kernel_available(
 class TrtllmAttentionMetadata(AttentionMetadata):
     workspace: Optional[torch.Tensor] = None
 
-    # TrtllmAttention needs to know the beam width and access to the cache indirection buffer,
+    # TrtllmAttention needs to know the beam width to access the cache indirection buffer
     # when beam search is enabled.
     beam_width: int = 1
-    cache_indirection: Optional[torch.Tensor] = None
 
     # TrtllmAttention needs to know the max sequence length.
     # Implemented as a property to support no cache mode.
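
As a rough, hypothetical illustration (not the TrtllmAttention kernel itself) of why beam_width and the now-inherited cache_indirection buffer are used together: the buffer records, per (request, beam, position), which beam's KV-cache entry should be read during generation. The shapes and the fork scenario below are invented for the sketch.

```python
import torch

# Hypothetical shapes; the real buffer is allocated by the model engine.
batch_size, beam_width, max_seq_len = 2, 4, 16
cache_indirection = torch.zeros((batch_size, beam_width, max_seq_len),
                                dtype=torch.int32)

# Suppose beam 2 of request 0 was forked from beam 0 at step 5: its earlier
# positions should then be read from beam 0's KV-cache entries.
cache_indirection[0, 2, :5] = 0
cache_indirection[0, 2, 5:] = 2

# Attending over position t for (request b, beam k) reads the KV-cache row
# selected by cache_indirection[b, k, t].
b, k, t = 0, 2, 3
print(int(cache_indirection[b, k, t]))  # 0
```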

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 5 additions & 8 deletions
@@ -426,7 +426,8 @@ def __init__(
         # This way it can also be used for CUDA graphs.
         if self.use_beam_search:
             self.cache_indirection_attention = torch.zeros(
-                (self.batch_size, self.max_beam_width, self.max_seq_len),
+                (self.batch_size, self.max_beam_width, self.max_seq_len +
+                 (0 if self._disable_overlap_scheduler else 1)),
                 device="cuda",
                 dtype=torch.int32)
         else:
@@ -753,11 +754,7 @@ def _set_up_attn_metadata(self, kv_cache_manager: KVCacheManager):
             self.model.model_config.pretrained_config) and (
                 self.attn_runtime_features.cache_reuse
                 or self.attn_runtime_features.chunked_prefill)
-        # Cache indirection is only used for beam search on generation requests with TRTLLM backend.
-        if self.attn_backend.Metadata is TrtllmAttentionMetadata:
-            kwargs = {"cache_indirection": self.cache_indirection_attention}
-        else:
-            kwargs = {}
+        cache_indirection = self.cache_indirection_attention if self.attn_backend.Metadata is TrtllmAttentionMetadata else None
         if kv_cache_manager is None:
             return self.attn_backend.Metadata(
                 max_num_requests=self.batch_size,
@@ -768,7 +765,7 @@ def _set_up_attn_metadata(self, kv_cache_manager: KVCacheManager):
                 runtime_features=self.attn_runtime_features,
                 enable_flash_mla=self.model.model_config.enable_flash_mla,
                 enable_paged_context_mla=enable_paged_context_mla,
-                **kwargs)
+                cache_indirection=cache_indirection)
 
         if self.attn_metadata is not None:
             # This assertion can be relaxed if needed: just create a new metadata
@@ -785,7 +782,7 @@ def _set_up_attn_metadata(self, kv_cache_manager: KVCacheManager):
             runtime_features=self.attn_runtime_features,
             enable_flash_mla=self.model.model_config.enable_flash_mla,
             enable_paged_context_mla=enable_paged_context_mla,
-            **kwargs)
+            cache_indirection=cache_indirection)
 
         return self.attn_metadata
 
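
A condensed sketch of the two model_engine.py changes above, using made-up local values in place of the engine's attributes (batch_size, max_beam_width, max_seq_len, and the backend check are placeholders):

```python
import torch

# Placeholder values standing in for PyTorchModelEngine attributes.
batch_size, max_beam_width, max_seq_len = 8, 4, 256
use_beam_search = True
disable_overlap_scheduler = False
# Stands in for `self.attn_backend.Metadata is TrtllmAttentionMetadata`.
backend_uses_cache_indirection = True

# Size estimation: overlap scheduling prepares step N+1 while step N is still
# in flight, so the buffer needs one extra sequence slot in that mode.
if use_beam_search:
    cache_indirection_attention = torch.zeros(
        (batch_size, max_beam_width,
         max_seq_len + (0 if disable_overlap_scheduler else 1)),
        dtype=torch.int32)  # the real buffer is allocated with device="cuda"
else:
    cache_indirection_attention = None

# Passing: instead of a backend-specific **kwargs dict, the buffer is always
# forwarded as `cache_indirection`, defaulting to None for other backends.
cache_indirection = (cache_indirection_attention
                     if backend_uses_cache_indirection else None)
print(None if cache_indirection is None else tuple(cache_indirection.shape))
```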

tests/unittest/_torch/test_beam_search.py

Lines changed: 6 additions & 7 deletions
@@ -51,7 +51,7 @@ def llm(fixed_params, input_prompts):
 
 
 @pytest.fixture(scope="module")
-def llm_overlap(fixed_params, input_prompts):
+def llm_cuda_graph(fixed_params, input_prompts):
     return LLM(
         model=os.path.join(llm_models_root(), "llama-models-v2",
                            "TinyLlama-1.1B-Chat-v1.0"),
@@ -63,8 +63,7 @@ def llm_overlap(fixed_params, input_prompts):
         enable_trtllm_sampler=True,
         max_beam_width=fixed_params["max_beam_width"],
         disable_overlap_scheduler=False,
-        #TODO: remove this once we have a proper fix for CUDA graph in beam search
-        cuda_graph_config=None,
+        cuda_graph_config=CudaGraphConfig(enabled=True),
     )
 
 
@@ -131,10 +130,10 @@ def test_beam_search_output_shapes(gather_context_logits: bool,
 @pytest.mark.parametrize("num_output_beams", [1, 2])
 @pytest.mark.parametrize("num_prompts", [1, 2])
 @pytest.mark.threadleak(enabled=False)
-def test_beam_search_output_shapes_overlap(
+def test_beam_search_output_shapes_cuda_graph_and_overlap(
         gather_context_logits: bool, gather_generation_logits: bool,
         return_log_probs: bool, num_output_beams: int, num_prompts: int,
-        llm_overlap, fixed_params, input_prompts, expected_outputs):
+        llm_cuda_graph, fixed_params, input_prompts, expected_outputs):
     if return_log_probs and num_prompts > 1:
         pytest.skip(
             "Beam search currently does not support return_log_probs with multiple prompts"
@@ -148,8 +147,8 @@ def test_beam_search_output_shapes_overlap(
         return_generation_logits=gather_generation_logits,
         logprobs=return_log_probs,
     )
-    outputs = llm_overlap.generate(input_prompts[:num_prompts],
-                                   sampling_params=sampling_params)
+    outputs = llm_cuda_graph.generate(input_prompts[:num_prompts],
+                                      sampling_params=sampling_params)
     assert len(outputs) == num_prompts
     for output_idx, output in enumerate(outputs):
         if gather_context_logits:
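
For reference, a hedged usage sketch of the combined configuration the renamed fixture now exercises (overlap scheduling and CUDA graphs together). The import paths, model identifier, and sampling settings here are assumptions, not copied from the test module.

```python
from tensorrt_llm import LLM, SamplingParams     # assumed import path
from tensorrt_llm.llmapi import CudaGraphConfig  # assumed import path

# Assumed model id; the real test loads a local TinyLlama-1.1B-Chat-v1.0 copy.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    enable_trtllm_sampler=True,
    max_beam_width=2,
    disable_overlap_scheduler=False,                  # overlap scheduling on
    cuda_graph_config=CudaGraphConfig(enabled=True),  # CUDA graphs on
)

# Beam search request; the parameter choices are illustrative only.
outputs = llm.generate(
    ["The capital of France is"],
    sampling_params=SamplingParams(max_tokens=8, n=2, use_beam_search=True),
)
print(outputs[0].outputs[0].text)
```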
