From 64bd19df6f56935f406fd84b0ec90e616e52b31e Mon Sep 17 00:00:00 2001 From: Sam Lurye Date: Thu, 9 Oct 2025 00:25:02 -0700 Subject: [PATCH] [monarch][supervision] Increase GetState:: default timeout and make it configurable The current timeout of 1 second for `GetState::` for supervision was causing at least one test to fail. This diff makes the value configurable, and also increases the default to 30 seconds. The failing test now passes. Example test failure that shows the supervision timeout: P1984316048 Differential Revision: [D84232284](https://our.internmc.facebook.com/intern/diff/D84232284/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D84232284/)! [ghstack-poisoned] --- hyperactor_mesh/src/v1/proc_mesh.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hyperactor_mesh/src/v1/proc_mesh.rs b/hyperactor_mesh/src/v1/proc_mesh.rs index 333202e58..83fa45874 100644 --- a/hyperactor_mesh/src/v1/proc_mesh.rs +++ b/hyperactor_mesh/src/v1/proc_mesh.rs @@ -72,6 +72,9 @@ declare_attrs! { /// The maximum idle time between updates while spawning actor meshes. @meta(CONFIG_ENV_VAR = "HYPERACTOR_MESH_ACTOR_SPAWN_MAX_IDLE".to_string()) pub attr ACTOR_SPAWN_MAX_IDLE: Duration = Duration::from_secs(30); + + @meta(CONFIG_ENV_VAR = "HYPERACTOR_MESH_GET_ACTOR_STATE_MAX_IDLE".to_string()) + pub attr GET_ACTOR_STATE_MAX_IDLE: Duration = Duration::from_secs(30); } /// A reference to a single [`hyperactor::Proc`]. @@ -533,7 +536,9 @@ impl ProcMeshRef { // the agent will be unresponsive. // We handle this by setting a timeout on the recv, and if we don't get a // message we assume the agent is dead and return a failed state. - let state = RealClock.timeout(Duration::from_secs(1), rx.recv()).await; + let state = RealClock + .timeout(config::global::get(GET_ACTOR_STATE_MAX_IDLE), rx.recv()) + .await; if let Ok(state) = state { // Handle non-timeout receiver error. let state = state?;