File tree Expand file tree Collapse file tree 1 file changed +3
-1
lines changed
tensorrt_llm/_torch/pyexecutor Expand file tree Collapse file tree 1 file changed +3
-1
lines changed Original file line number Diff line number Diff line change @@ -719,6 +719,7 @@ def release_batch(result: ScheduledRequests | None):
719
719
logger .info (
720
720
f"Run generation only CUDA graph warmup for batch size={ bs } , draft_len={ draft_len } "
721
721
)
722
+ # The draft model itself runs with draft_len == 0, so spec decode must be enabled when either draft_len > 0 or is_draft_model is set.
722
723
self .enable_spec_decode = draft_len > 0 or self .is_draft_model
723
724
724
725
def _update_draft_inference_state (is_first_draft : bool ,
@@ -732,7 +733,8 @@ def _update_draft_inference_state(is_first_draft: bool,
732
733
# Reset the draft tokens for the first draft inference
733
734
req .py_draft_tokens = []
734
735
735
- _update_draft_inference_state (draft_len > 0 , batch )
736
+ _update_draft_inference_state (self .enable_spec_decode ,
737
+ batch )
736
738
737
739
self .forward (batch ,
738
740
new_tensors_device = None ,
You can’t perform that action at this time.
0 commit comments