Commit cea5dd1

[TRTLLM-5835][feat] Optimized Mamba2Mixer prefill (#5128)
Signed-off-by: Tomer Asida <[email protected]>
1 parent dd29063 commit cea5dd1

4 files changed, +183 -156 lines changed

tensorrt_llm/_torch/models/modeling_nemotron_h.py

Lines changed: 16 additions & 2 deletions
@@ -20,6 +20,8 @@
 from torch.nn import functional as F
 from transformers import AutoConfig, PretrainedConfig

+from tensorrt_llm._torch.modules.mamba.mamba2_metadata import Mamba2Metadata
+
 from ..attention_backend import AttentionMetadata
 from ..model_config import ModelConfig
 from ..modules.attention import Attention
@@ -71,6 +73,7 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         attn_metadata: AttentionMetadata,
+        **kwargs,
     ) -> torch.Tensor:
         return super().forward(hidden_states)

@@ -99,6 +102,7 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         attn_metadata: AttentionMetadata,
+        **kwargs,
     ) -> torch.Tensor:
         return super().forward(position_ids=None,
                                hidden_states=hidden_states,
@@ -153,12 +157,13 @@ def forward(
         position_ids: torch.IntTensor,
         hidden_states: torch.Tensor,
         attn_metadata: AttentionMetadata,
+        **kwargs,
     ) -> torch.Tensor:

         residual = hidden_states

         hidden_states = self.norm(hidden_states)
-        hidden_states = self.mixer(hidden_states, attn_metadata)
+        hidden_states = self.mixer(hidden_states, attn_metadata, **kwargs)
         hidden_states = torch.add(hidden_states, residual)

         return hidden_states
@@ -190,6 +195,8 @@ def __init__(self, model_config: ModelConfig[NemotronHConfig]):
             dtype=config.torch_dtype,
         )

+        self.mamba_metadata: Optional[Mamba2Metadata] = None
+
     def forward(
         self,
         attn_metadata: AttentionMetadata,
@@ -203,13 +210,20 @@ def forward(
                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
             )

+        if self.mamba_metadata is None or self.mamba_metadata.max_batch_size != attn_metadata.max_num_requests:
+            self.mamba_metadata = Mamba2Metadata(attn_metadata.max_num_requests)
+        self.mamba_metadata.prepare(attn_metadata)
+
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)

         hidden_states = inputs_embeds

         for layer in self.layers:
-            hidden_states = layer(position_ids, hidden_states, attn_metadata)
+            hidden_states = layer(position_ids,
+                                  hidden_states,
+                                  attn_metadata,
+                                  mamba_metadata=self.mamba_metadata)

         hidden_states = self.norm_f(hidden_states)
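
For context, here is a minimal usage sketch of the pattern this change introduces: the model keeps a single Mamba2Metadata object, rebuilds it only when the maximum batch size changes, and calls prepare() once per forward pass so that each Mamba2Mixer layer can reuse the same prefill indexing tensors instead of recomputing them. DummyAttentionMetadata below is a hypothetical stand-in (not part of the commit) that carries only the fields the new code reads; it requires a CUDA device.

from dataclasses import dataclass

import torch

from tensorrt_llm._torch.modules.mamba.mamba2_metadata import Mamba2Metadata


@dataclass
class DummyAttentionMetadata:
    # Stand-in exposing only the attributes that Mamba2Metadata.prepare() and
    # the model forward above actually use (assumed shapes/dtypes).
    max_num_requests: int
    num_contexts: int
    seq_lens_cuda: torch.Tensor


attn_metadata = DummyAttentionMetadata(
    max_num_requests=8,
    num_contexts=2,
    seq_lens_cuda=torch.tensor([4, 6], dtype=torch.int, device="cuda"))

mamba_metadata = None
# Rebuild only when the batch-size bound changes; otherwise reuse the buffers.
if mamba_metadata is None or mamba_metadata.max_batch_size != attn_metadata.max_num_requests:
    mamba_metadata = Mamba2Metadata(attn_metadata.max_num_requests)
mamba_metadata.prepare(attn_metadata)

# The same object is then handed to every decoder layer via **kwargs
# (mamba_metadata=...), so each mixer sees identical cu_seqlens / seq_idx.
print(mamba_metadata.cu_seqlens[:3])  # tensor([0, 4, 10], device='cuda:0', dtype=torch.int32)
print(mamba_metadata.seq_idx)         # tensor([[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]], ...)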

tensorrt_llm/_torch/modules/mamba/mamba2_metadata.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from tensorrt_llm._torch.attention_backend.interface import AttentionMetadata
+
+
+class Mamba2Metadata:
+
+    def __init__(self, max_batch_size: int):
+        self.max_batch_size = max_batch_size
+
+        # cumulative sequence lengths for prefill requests [batch_size+1]
+        self.cu_seqlens = torch.zeros(max_batch_size + 1,
+                                      dtype=torch.int,
+                                      device="cuda")
+
+        # sequence index for prefill requests [num_prefill_tokens] - specifies which request each token belongs to
+        self.seq_idx: torch.Tensor = None
+
+    def prepare(self, attn_metadata: AttentionMetadata):
+        num_contexts = attn_metadata.num_contexts
+        context_lens = attn_metadata.seq_lens_cuda[:num_contexts]
+        if num_contexts > 0:
+            torch.cumsum(context_lens,
+                         dim=0,
+                         dtype=torch.int,
+                         out=self.cu_seqlens[1:num_contexts + 1])
+            self.seq_idx = torch.repeat_interleave(
+                torch.arange(num_contexts,
+                             dtype=torch.int,
+                             device=self.cu_seqlens.device),
+                repeats=context_lens,
+                output_size=self.cu_seqlens[num_contexts]).unsqueeze(0)
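
To make the indexing concrete, here is a small worked example (not part of the commit) of what prepare() computes for a toy prefill batch with context lengths [3, 2, 4]: cu_seqlens holds the cumulative token offset of each request within the packed token stream, and seq_idx maps every packed token back to the request it belongs to. The snippet runs on CPU purely for illustration; the class above allocates its buffers on CUDA.

import torch

context_lens = torch.tensor([3, 2, 4], dtype=torch.int)
num_contexts = context_lens.numel()

# Cumulative sequence lengths: one leading zero, then running totals.
cu_seqlens = torch.zeros(num_contexts + 1, dtype=torch.int)
torch.cumsum(context_lens, dim=0, dtype=torch.int, out=cu_seqlens[1:])
print(cu_seqlens)  # tensor([0, 3, 5, 9], dtype=torch.int32)

# Request index of every packed prefill token, shaped [1, num_prefill_tokens].
seq_idx = torch.repeat_interleave(torch.arange(num_contexts, dtype=torch.int),
                                  repeats=context_lens,
                                  output_size=int(cu_seqlens[-1])).unsqueeze(0)
print(seq_idx)  # tensor([[0, 0, 0, 1, 1, 2, 2, 2, 2]], dtype=torch.int32)

Because the packed token layout is identical for every Mamba2 layer in the model, these tensors can be computed once per forward pass and shared across layers, which is the prefill optimization this commit targets.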

0 commit comments