2 files changed: +7 −7 lines

tensorrt_llm/_torch/speculative

@@ -109,8 +109,8 @@ std::tuple<th::Tensor, th::Tensor> mtp_sampling_and_accepted_draft_tokens_op(th:
     TLLM_CHECK(draftTokensSizes[0] == (numGenerationRequest * numMTPModules));

     auto stream = at::cuda::getCurrentCUDAStream(logits.get_device());
-    auto acceptedTokens = torch::empty(
-        {batchSize, numMTPModules + 1}, at::TensorOptions().dtype(torch::kInt32).device(logits.device()));
+    auto acceptedTokens
+        = torch::ones({batchSize, numMTPModules + 1}, at::TensorOptions().dtype(torch::kInt32).device(logits.device()));
     auto numAcceptedTokens = torch::ones({batchSize}, at::TensorOptions().dtype(torch::kInt32).device(logits.device()));

     // Fill params
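The substance of this hunk: the acceptedTokens output buffer switches from torch::empty to torch::ones, matching the numAcceptedTokens allocation on the next line. A plausible reading, inferred from the diff rather than stated in it, is that the downstream sampling kernel writes only the accepted prefix of each row, so an empty buffer leaves the remaining slots as uninitialized memory. A minimal Python sketch of that hazard; every name in it is illustrative:

import torch

# Sketch of the assumed rationale (not the TensorRT-LLM kernel): a kernel
# that fills only the accepted prefix of each row leaves the tail of a
# torch.empty buffer as arbitrary memory, while torch.ones gives every
# unwritten slot a defined value.
batch_size, mtp_num_modules = 2, 3

def fill_accepted_prefix(buf, num_accepted):
    # Stand-in for the sampling kernel: writes only the first n slots per row.
    for b in range(buf.size(0)):
        n = int(num_accepted[b])
        buf[b, :n] = torch.arange(n, dtype=torch.int)

num_accepted = torch.tensor([1, 3])

uninit = torch.empty((batch_size, mtp_num_modules + 1), dtype=torch.int)
inited = torch.ones((batch_size, mtp_num_modules + 1), dtype=torch.int)
fill_accepted_prefix(uninit, num_accepted)
fill_accepted_prefix(inited, num_accepted)

print(uninit)  # tail slots: whatever was in memory
print(inited)  # tail slots: deterministic 1s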
@@ -745,12 +745,13 @@ def sample_and_accept_draft_tokens(
         logits = logits.unsqueeze(0)

         # The return buffer
-        accepted_tokens = torch.empty((batch_size, (mtp_num_modules + 1)),
-                                      dtype=torch.int,
-                                      device=logits.device)
-        num_accepted_tokens = torch.ones(batch_size,
+        if self.spec_config.use_relaxed_acceptance_for_thinking or not self.is_thop:
+            accepted_tokens = torch.ones((batch_size, (mtp_num_modules + 1)),
                                          dtype=torch.int,
                                          device=logits.device)
+        num_accepted_tokens = torch.ones(batch_size,
+                                         dtype=torch.int,
+                                         device=logits.device)
         if self.spec_config.use_relaxed_acceptance_for_thinking:
             mtp_relaxed_delta_pool = spec_metadata.mtp_hidden_states_manager.mtp_relaxed_delta_pool
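Read together with the first hunk, the new conditional appears to avoid a redundant allocation: when self.is_thop is set and relaxed acceptance is off, the C++ op allocates acceptedTokens itself, so only the pure-Python and relaxed-acceptance paths need a Python-side buffer, now pre-filled with ones instead of left uninitialized. A hedged sketch of that control flow; everything except the two flags from the diff is illustrative scaffolding:

import torch

def thop_op_stub(batch_size, mtp_num_modules):
    # Hypothetical stand-in for the C++ op in the first hunk, which
    # allocates its own outputs with torch::ones.
    accepted = torch.ones((batch_size, mtp_num_modules + 1), dtype=torch.int)
    num_accepted = torch.ones(batch_size, dtype=torch.int)
    return accepted, num_accepted

def sample_and_accept_sketch(is_thop, use_relaxed, batch_size, mtp_num_modules):
    accepted_tokens = None
    # Python-side buffer only when the relaxed-acceptance path will
    # post-process it, or when no THOP op exists to allocate it for us.
    if use_relaxed or not is_thop:
        accepted_tokens = torch.ones((batch_size, mtp_num_modules + 1),
                                     dtype=torch.int)
    num_accepted_tokens = torch.ones(batch_size, dtype=torch.int)
    if is_thop:
        accepted_tokens, num_accepted_tokens = thop_op_stub(
            batch_size, mtp_num_modules)
    return accepted_tokens, num_accepted_tokens

print(sample_and_accept_sketch(True, False, 2, 3)[0].shape)   # from the op
print(sample_and_accept_sketch(False, False, 2, 3)[0].shape)  # Python buffer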
@@ -1068,7 +1069,6 @@ class MTPEagleWorker(MTPWorker):
     def __init__(self, spec_config: MTPConfig):
         super().__init__(spec_config)
         self.mtp_num_modules = spec_config.num_nextn_predict_layers
-        self.is_thop = False

     def forward(
         self,
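Dropping the self.is_thop = False override means MTPEagleWorker now takes whatever MTPWorker.__init__ sets; the parent's default is not visible in this diff, so the sketch below simply assumes it enables the THOP path:

# Hypothetical illustration of the inherited flag; the parent default shown
# here is an assumption, not taken from the diff.
class MTPWorkerSketch:
    def __init__(self):
        self.is_thop = True  # assumed parent default

class MTPEagleWorkerSketch(MTPWorkerSketch):
    def __init__(self):
        super().__init__()
        # no `self.is_thop = False` override any more

print(MTPEagleWorkerSketch().is_thop)  # True, inherited from the parent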