diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index cf85a2135c81..539fbc238d38 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -646,6 +646,16 @@ def get_and_reset_finished_requests_ids(self) -> List[str]:
         self._finished_requests_ids = list()
         return finished_requests_ids
 
+    def get_async_stopped_request_ids(self) -> List[str]:
+        """Return the request IDs of the groups in ``self._async_stopped``.
+
+        Unlike ``get_and_reset_finished_requests_ids``, this only reads the
+        list; it does not clear it.
+        """
+        if not self._async_stopped:
+            return []
+        return [seq_group.request_id for seq_group in self._async_stopped]
+
     def _schedule_running(
         self,
         budget: SchedulingBudget,
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7f9f85e1f93f..0b8ff5e6649f 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -309,6 +309,9 @@ async def step_async(
             finished_requests_ids = self.scheduler[
                 virtual_engine].get_and_reset_finished_requests_ids()
 
+            finished_requests_ids += self.scheduler[
+                virtual_engine].get_async_stopped_request_ids()
+
             # Maybe switch from async mode to sync mode
             if not allow_async_output_proc and len(ctx.output_queue) > 0:
                 self._process_model_outputs(ctx=ctx)
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 54f7b8fb69b5..7fbe21368e6c 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1385,6 +1385,9 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
                 if finished_request_id in self.seq_id_to_seq_group:
                     del self.seq_id_to_seq_group[finished_request_id]
 
+            finished_requests_ids += self.scheduler[
+                virtual_engine].get_async_stopped_request_ids()
+
             # Maybe switch from async mode to sync mode
             if not allow_async_output_proc and len(ctx.output_queue) > 0:
                 self._process_model_outputs(ctx=ctx)