Commit aac743e

[https://nvbugs/5534705][fix] Update inference state for draft model
Signed-off-by: ziyixiong-nv <[email protected]>
1 parent: 2db22fb

File tree

1 file changed: +3 −1 lines changed


tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -719,6 +719,7 @@ def release_batch(result: ScheduledRequests | None):
             logger.info(
                 f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"
             )
+            # The draft model has draft_len = 0, so we need to check either draft_len > 0 or is_draft_model.
             self.enable_spec_decode = draft_len > 0 or self.is_draft_model
 
             def _update_draft_inference_state(is_first_draft: bool,
@@ -732,7 +733,8 @@ def _update_draft_inference_state(is_first_draft: bool,
                 # Reset the draft tokens for the first draft inference
                 req.py_draft_tokens = []
 
-            _update_draft_inference_state(draft_len > 0, batch)
+            _update_draft_inference_state(self.enable_spec_decode,
+                                          batch)
 
             self.forward(batch,
                          new_tensors_device=None,
```
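
For context on the fix: during generation-only CUDA graph warmup the draft model runs with `draft_len == 0`, so the old call `_update_draft_inference_state(draft_len > 0, batch)` never treated it as a first draft inference and its stale `py_draft_tokens` were not cleared. Passing `self.enable_spec_decode`, which also checks `is_draft_model`, covers the draft model too. Below is a minimal sketch of that flag logic; the `DummyRequest` class and the simplified function bodies are illustrative assumptions, not the actual `model_engine.py` code:

```python
# Minimal sketch of the corrected warmup flag logic.
# DummyRequest and the simplified bodies below are illustrative,
# not the actual TensorRT-LLM implementation.

class DummyRequest:
    def __init__(self):
        # Stale draft tokens left over from a previous inference pass.
        self.py_draft_tokens = ["stale"]

def _update_draft_inference_state(is_first_draft, batch):
    # Mirrors the hunk above: on the first draft inference, reset the
    # draft tokens for every request in the batch.
    if is_first_draft:
        for req in batch:
            req.py_draft_tokens = []

def warmup(batch, draft_len, is_draft_model):
    # The draft model warms up with draft_len == 0, so the old call
    # `_update_draft_inference_state(draft_len > 0, batch)` skipped the
    # reset for it. Also checking `is_draft_model` covers that case.
    enable_spec_decode = draft_len > 0 or is_draft_model
    _update_draft_inference_state(enable_spec_decode, batch)

batch = [DummyRequest()]
warmup(batch, draft_len=0, is_draft_model=True)
assert batch[0].py_draft_tokens == []  # draft state now reset for the draft model
```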
