Commit 44866a3

Guard NWOR staging from unexpected graph capture
1 parent d0ac344 · commit 44866a3

2 files changed (+10, -4 lines)


vllm/v1/kv_cache/deferred.py (6 additions, 0 deletions)

@@ -251,6 +251,12 @@ def stage_layer(
         if not self._window_active:
             return False

+        if _in_restricted_context():
+            logger.warning_once(
+                "NWOR: Graph capture detected during staging; skipping staged writes."
+            )
+            return False
+
         if not (_tensor_has_storage(key) and _tensor_has_storage(value)):
             raise ShouldFallback("kv_slice_without_storage")
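For reference, a minimal sketch of what a guard like _in_restricted_context() typically checks: whether a CUDA graph capture or a torch.compile trace is in progress when stage_layer() runs. Only the helper's name appears in the diff; the specific checks below are assumptions, not the code from this commit.

import torch

def _in_restricted_context() -> bool:
    # Assumed sketch, not the vLLM implementation.
    # True while the current CUDA stream is being captured into a CUDA graph;
    # KV writes staged here would be baked into the graph and replayed later.
    if torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
        return True
    # True while torch.compile / Dynamo is tracing the surrounding Python code.
    if torch.compiler.is_compiling():
        return True
    return False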

vllm/v1/worker/gpu_model_runner.py (4 additions, 4 deletions)

@@ -2731,10 +2731,10 @@ def _compute_nwor_acceptance(
         draft_ids = draft_ids.to(dtype=sampled_token_ids.dtype, copy=False)

         if return_mask:
-            mask_work = torch.zeros(total_tokens, dtype=torch.bool, device=work_device)
-        else:
-            mask_work = None
-        accepted_counts = []
+            mask_work = torch.zeros(total_tokens, dtype=torch.bool, device=work_device)
+        else:
+            mask_work = None
+        accepted_counts = []

         if sampled_token_ids.ndim == 0:
             zero_counts = [0 for _ in num_draft_tokens]
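For context, a heavily hedged sketch of the kind of acceptance computation that surrounds the lines above. It assumes draft_ids is a flat tensor of proposed tokens, sampled_token_ids holds one row of target tokens per request, and num_draft_tokens gives each request's draft length; none of these shapes are confirmed by the diff, and the helper below is illustrative only, not vLLM's _compute_nwor_acceptance.

import torch

def compute_acceptance(draft_ids, sampled_token_ids, num_draft_tokens, return_mask=False):
    # Illustrative sketch only, under the assumptions stated above.
    total_tokens = sum(num_draft_tokens)
    work_device = draft_ids.device
    if return_mask:
        mask_work = torch.zeros(total_tokens, dtype=torch.bool, device=work_device)
    else:
        mask_work = None
    accepted_counts = []

    offset = 0
    for req_idx, n in enumerate(num_draft_tokens):
        draft = draft_ids[offset:offset + n]
        target = sampled_token_ids[req_idx, :n]
        matches = draft == target
        # Accept only the longest prefix of draft tokens that matches the target.
        mismatches = (~matches).nonzero()
        accepted = int(mismatches[0].item()) if mismatches.numel() else n
        accepted_counts.append(accepted)
        if mask_work is not None:
            mask_work[offset:offset + accepted] = True
        offset += n
    return accepted_counts, mask_work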
