Commit 73f5fcb
NIXL: re-work send timeout tracking on prefill side
In a prefill instance, we need to free KV blocks that have not been fetched after a timeout. See vllm-project#20139.

In vllm-project#26012, we're trying to deal with corner cases involved in doing this request timeout tracking on the worker side. This PR proposes moving all of it to the scheduler side, hopefully making the logic simpler.

Note the expiry timer is switched back to monotonic time because the timestamp is no longer sent across process boundaries.

Signed-off-by: Mark McLoughlin <[email protected]>
1 parent bb6d430 · commit 73f5fcb
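The scheduler-side bookkeeping this commit describes can be pictured with a small standalone sketch: an insertion-ordered dict mapping each delayed request to a monotonic expiry time and a consumer-notification count. This is a minimal illustration of the technique only; the names (ExpiryTracker, start, notify, expired, timeout_s) are hypothetical and are not the connector's actual API.

    # Hypothetical sketch of scheduler-side expiry tracking; not vLLM code.
    import time


    class ExpiryTracker:
        """Tracks prefill requests whose KV blocks await retrieval by decode workers."""

        def __init__(self, timeout_s: float):
            self.timeout_s = timeout_s
            # req_id -> (monotonic expiry time, consumer notifications seen so far).
            # Plain dicts preserve insertion order, so the oldest entry comes first.
            self._pending: dict[str, tuple[float, int]] = {}

        def start(self, req_id: str) -> None:
            # Called when prefill finishes and freeing the blocks is delayed.
            self._pending[req_id] = (time.monotonic() + self.timeout_s, 0)

        def notify(self, req_id: str, tp_ratio: int) -> bool:
            # Called once per decode worker that finished reading the blocks.
            # Returns True when all tp_ratio consumers are done and blocks can be freed.
            if req_id not in self._pending:
                return False  # already freed or expired
            expiry, count = self._pending[req_id]
            count += 1
            if count < tp_ratio:
                self._pending[req_id] = (expiry, count)
                return False
            del self._pending[req_id]
            return True

        def expired(self) -> list[str]:
            # Oldest entries first; stop at the first one that has not expired yet.
            now = time.monotonic()
            done: list[str] = []
            while self._pending:
                req_id, (expiry, _) = next(iter(self._pending.items()))
                if now < expiry:
                    break
                del self._pending[req_id]
                done.append(req_id)
            return done

Because every entry gets the same timeout, insertion order matches expiry order, so the expiry scan can stop at the first entry that has not yet expired.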

File tree

1 file changed: +74 −70 lines changed


vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py

Lines changed: 74 additions & 70 deletions
@@ -40,6 +40,7 @@
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.outputs import KVConnectorOutput
     from vllm.v1.request import Request
 
 Transfer = tuple[int, float]  # (xfer_handle, start_time)
@@ -107,8 +108,6 @@ class NixlConnectorMetadata(KVConnectorMetadata):
     def __init__(self):
         self.reqs_to_recv: dict[ReqId, ReqMeta] = {}
         self.reqs_to_save: dict[ReqId, ReqMeta] = {}
-        self.reqs_to_send: dict[ReqId, float] = {}
-        self.reqs_in_batch: set[ReqId] = set()
 
     def add_new_req(
         self,
@@ -195,6 +194,14 @@ def build_connector_meta(
         assert self.connector_scheduler is not None
         return self.connector_scheduler.build_connector_meta(scheduler_output)
 
+    def update_connector_output(
+        self,
+        connector_output: "KVConnectorOutput",
+    ):
+        assert self.connector_scheduler is not None
+        return self.connector_scheduler.update_connector_output(
+            connector_output)
+
     def request_finished(
         self,
         request: "Request",
@@ -280,9 +287,14 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         # the scheduler. Used to make metadata passed to Worker.
         self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}
         self._reqs_need_save: dict[ReqId, tuple[Request, list[int]]] = {}
-        # Reqs to send and their expiration time
-        self._reqs_need_send: dict[ReqId, float] = {}
-        self._reqs_in_batch: set[ReqId] = set()
+
+        # Requests that need to be sent for remote decode, along with:
+        # 1. an expiry time to avoid stranded KV blocks if they
+        #    are never fetched
+        # 2. a consumer notification count - with heterogeneous TP, P
+        #    must wait for all assigned D TP workers to finish reading
+        #    before safely freeing the blocks.
+        self._reqs_need_send: dict[ReqId, tuple[float, int]] = {}
 
     def get_num_new_matched_tokens(
         self, request: "Request",
@@ -330,8 +342,6 @@ def update_state_after_alloc(self, request: "Request",
330342
if not params:
331343
return
332344

333-
if params.get("do_remote_decode"):
334-
self._reqs_in_batch.add(request.request_id)
335345
if self.use_host_buffer and params.get("do_remote_decode"):
336346
# NOTE: when accelerator is not directly supported by Nixl,
337347
# prefilled blocks need to be saved to host memory before transfer.
@@ -395,17 +405,56 @@ def build_connector_meta(
                 save_to_host=True,
             )
 
-        meta.reqs_to_send = self._reqs_need_send
-        meta.reqs_in_batch = self._reqs_in_batch
-
         # Clear the list once workers start the transfers
         self._reqs_need_recv.clear()
         self._reqs_need_save.clear()
-        self._reqs_in_batch = set()
-        self._reqs_need_send = {}
 
         return meta
 
+    def update_connector_output(
+        self,
+        connector_output: "KVConnectorOutput",
+    ):
+        finished_sending: set[str] = set()
+
+        # Blocks sent - remove expiry timeout
+        for notif in (connector_output.finished_sending or ()):
+            req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
+            # Sent notifications received after we already timed out
+            if req_id not in self._reqs_need_send:
+                logger.debug(
+                    "Already finished or expired KV transfer for request %s",
+                    req_id)
+                continue
+
+            # Wait all consumers (D) to be done reading before freeing.
+            count = self._reqs_need_send[req_id][1] + 1
+            if count < int(tp_ratio):
+                self._reqs_need_send[req_id] = (
+                    self._reqs_need_send[req_id][0], count)
+                continue
+            logger.debug(
+                "KV transfer finished for request %s after "
+                "retrieval by %d decode worker(s).", req_id, count)
+            del self._reqs_need_send[req_id]
+            finished_sending.add(req_id)
+
+        # Mark as finished if the expiry timeout has passed
+        now = time.monotonic()
+        while self._reqs_need_send:
+            req_id, (expires, count) = next(iter(self._reqs_need_send.items()))
+            # Insertion-ordered dict; oldest first so we can exit early.
+            if now < expires:
+                break
+            logger.warning(
+                "Releasing expired KV blocks for request %s which were "
+                "retrieved by %d decode worker(s) within %d seconds.", req_id,
+                count, envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT)
+            del self._reqs_need_send[req_id]
+            finished_sending.add(req_id)
+
+        connector_output.finished_sending = finished_sending
+
     def request_finished(
         self,
         request: "Request",
@@ -435,8 +484,15 @@ def request_finished(
             params["do_remote_prefill"] = False
             return False, None
 
-        if (not params.get("do_remote_decode")
-                or request.status != RequestStatus.FINISHED_LENGTH_CAPPED):
+        if not params.get("do_remote_decode"):
+            return False, None
+
+        if request.status != RequestStatus.FINISHED_LENGTH_CAPPED:
+            if request.request_id in self._reqs_need_send:
+                # Request aborted after we delayed freeing the blocks
+                logger.debug("Deleting KV transfer timeout for request %s",
+                             request.request_id)
+                del self._reqs_need_send[request.request_id]
             return False, None
 
         # TODO: check whether block_ids actually ever be 0. If not we could
@@ -445,8 +501,8 @@ def request_finished(
 
         if delay_free_blocks:
            # Prefill request on remote. It will be read from D upon completion
-            self._reqs_need_send[request.request_id] = time.perf_counter(
-            ) + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
+            expiry = time.monotonic() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
+            self._reqs_need_send[request.request_id] = (expiry, 0)
 
         return delay_free_blocks, dict(
             do_remote_prefill=True,
@@ -559,10 +615,6 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         # [req_id -> list[handle]]
         self._recving_metadata: dict[ReqId, ReqMeta] = {}
         self._recving_transfers = defaultdict[ReqId, list[Transfer]](list)
-        # Track the expiration time of requests that are waiting to be sent.
-        self._reqs_to_send: dict[ReqId, float] = {}
-        # Set of requests that have been part of a batch, regardless of status.
-        self._reqs_to_process: set[ReqId] = set()
 
         # Background thread for handling new handshake requests.
         self._nixl_handshake_listener_t: Optional[threading.Thread] = None
@@ -601,9 +653,6 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         logger.debug("Detected kv cache layout %s", self.kv_cache_layout)
 
         self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
-        # With heterogeneous TP, P must wait for all assigned D TP workers to
-        # finish reading before safely freeing the blocks.
-        self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
         self.xfer_stats = NixlKVConnectorStats()
 
     @staticmethod
@@ -1113,22 +1162,6 @@ def get_finished(self) -> tuple[set[str], set[str]]:
                 assert meta, f"{req_id} not found in recving_metadata list"
                 self.sync_recved_kv_to_device(req_id, meta)
 
-        # Handle timeout to avoid stranding blocks on remote.
-        now = time.perf_counter()
-        while self._reqs_to_send:
-            req_id, expires = next(iter(self._reqs_to_send.items()))
-            # Sorted dict, oldest requests are put first so we can exit early.
-            if now < expires:
-                break
-            count = self.consumer_notification_counts_by_req.pop(req_id, 0)
-            logger.warning(
-                "Releasing expired KV blocks for request %s which were "
-                "retrieved by %d decode worker(s) within %d seconds.", req_id,
-                count, envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT)
-            self._reqs_to_process.remove(req_id)
-            del self._reqs_to_send[req_id]
-            done_sending.add(req_id)
-
         return done_sending, done_recving
 
     def _get_new_notifs(self) -> set[str]:
@@ -1140,23 +1173,8 @@ def _get_new_notifs(self) -> set[str]:
         notified_req_ids: set[str] = set()
         for notifs in self.nixl_wrapper.get_new_notifs().values():
             for notif in notifs:
-                req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
-                if (req_id not in self._reqs_to_send
-                        and req_id not in self._reqs_to_process):
-                    logger.error(
-                        "Potentially invalid KV blocks for "
-                        "unrecognized request %s were retrieved by "
-                        "a decode worker. They may have expired.", req_id)
-                    continue
-
-                self.consumer_notification_counts_by_req[req_id] += 1
-                # Wait all consumers (D) to be done reading before freeing.
-                if self.consumer_notification_counts_by_req[req_id] == int(
-                        tp_ratio):
-                    notified_req_ids.add(req_id)
-                    del self.consumer_notification_counts_by_req[req_id]
-                    self._reqs_to_process.remove(req_id)
-                    self._reqs_to_send.pop(req_id, None)
+                # Note - this is in req_id:tp_ratio format
+                notified_req_ids.add(notif)
         return notified_req_ids
 
     def _pop_done_transfers(
@@ -1217,20 +1235,6 @@ def start_load_kv(self, metadata: NixlConnectorMetadata):
         while not self._ready_requests.empty():
             self._read_blocks_for_req(*self._ready_requests.get_nowait())
 
-        # Keep around the requests that have been part of a batch. This is
-        # needed because async scheduling pushes the misalignment between the
-        # moment in which requests expiration is set (P side) and the moment in
-        # which blocks are read from D. As P can now more easily lag behind D
-        # while processing the next batch, we make sure to only set an
-        # expiration for requests that have not been read from D yet.
-        for req_id in metadata.reqs_in_batch:
-            self._reqs_to_process.add(req_id)
-
-        # Add to requests that are waiting to be read and track expiration.
-        for req_id, expiration_time in metadata.reqs_to_send.items():
-            if req_id in self._reqs_to_process:
-                self._reqs_to_send[req_id] = expiration_time
-
     def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
         logger.debug(
             "Remote agent %s available, calling _read_blocks for req %s",
