@@ -411,10 +411,21 @@ def _wait_for_engine_startup(self):
411411
412412 # Wait for engine core process(es) to send ready messages.
413413 identities = set (eng .index for eng in self .resources .core_engines )
414+ poller = zmq .Poller ()
415+ poller .register (sync_input_socket , zmq .POLLIN )
416+ for eng in self .resources .core_engines :
417+ poller .register (eng .proc_handle , zmq .POLLIN )
414418 while identities :
415- while not sync_input_socket .poll (timeout = STARTUP_POLL_PERIOD_MS ):
416- logger .info ("Waiting for %d core engine proc(s) to start: %s" ,
417- len (identities ), identities )
419+ events = poller .poll (STARTUP_POLL_PERIOD_MS )
420+ if not events :
421+ logger .debug ("Waiting for %d core engine proc(s) to start: %s" ,
422+ len (identities ), identities )
423+ continue
424+ if len (events ) > 1 or events [0 ][0 ] != sync_input_socket :
425+ # One of the core processes exited.
426+ raise RuntimeError ("Engine core initialization failed. "
427+ "See root cause above." )
428+
418429 eng_id_bytes , msg = sync_input_socket .recv_multipart ()
419430 eng_id = int .from_bytes (eng_id_bytes , byteorder = "little" )
420431 if eng_id not in identities :
@@ -424,12 +435,6 @@ def _wait_for_engine_startup(self):
424435 logger .info ("Core engine process %d ready." , eng_id )
425436 identities .discard (eng_id )
426437
427- # Double check that the process are running.
428- for engine in self .resources .core_engines :
429- proc = engine .proc_handle .proc
430- if proc .exitcode is not None :
431- raise RuntimeError (f"Engine proc { proc .name } not running" )
432-
433438 def _init_core_engines (
434439 self ,
435440 vllm_config : VllmConfig ,
0 commit comments