diff --git a/src/kbmod_wf/multi_night_workflow.py b/src/kbmod_wf/multi_night_workflow.py
index bf36b1d6..330a9dae 100644
--- a/src/kbmod_wf/multi_night_workflow.py
+++ b/src/kbmod_wf/multi_night_workflow.py
@@ -4,6 +4,7 @@
 import toml
 import parsl
 from parsl import python_app, File
+from parsl.monitoring.errors import MonitoringHubStartError
 import parsl.executors
 
 from kbmod_wf.utilities import (
@@ -153,5 +154,11 @@ def workflow_runner(env=None, runtime_config={}):
     if args.runtime_config is not None and os.path.exists(args.runtime_config):
         with open(args.runtime_config, "r") as toml_runtime_config:
             runtime_config = toml.load(toml_runtime_config)
 
+    max_tries = 5
+    for i in range(max_tries):
+        try:
+            workflow_runner(env=args.env, runtime_config=runtime_config)
+            break
+        except MonitoringHubStartError as msg:
+            print(f"Got MonitoringHubStartError during attempt {i} of {max_tries}. Will try again...")
-    workflow_runner(env=args.env, runtime_config=runtime_config)
diff --git a/src/kbmod_wf/resource_configs/usdf_configuration.py b/src/kbmod_wf/resource_configs/usdf_configuration.py
index dcbf6b28..adc7c228 100644
--- a/src/kbmod_wf/resource_configs/usdf_configuration.py
+++ b/src/kbmod_wf/resource_configs/usdf_configuration.py
@@ -26,7 +26,7 @@
 
 max_ram_dict = {"ada":350, # 351 Gb total, with 5 GPUs total on the one node, leaves 70 Gb per task
     "ampere":952, # 896 per each of the two nodes we can access, each with 4 GPUs
-    "roma":140, # 240 to 140
+    "roma":140, # 240 to 140; to 240 5/20/2025 COC; RAM fixes, back to 140 5/23/2025 COC
     "milano":140 # 240 to 140
 }
 max_block_dict = {"ada":1, "ampere":2}
@@ -75,7 +75,7 @@ def usdf_resource_config():
                 partition=cpu_partition, # or ada?; see resource notes at top
                 account=account_name,
                 min_blocks=0,
-                max_blocks=20, # 12 to 20 4/16/2025 COC
+                max_blocks=10, # 12 to 20 4/16/2025 COC; 20 to 10 while debugging RAM issues 5/20/2025 COC
                 init_blocks=0,
                 parallelism=1,
                 nodes_per_block=1,
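
For reference, a minimal standalone sketch of the retry pattern the first hunk introduces, not part of the diff: the run_with_retries helper, its signature, and the RuntimeError raised on exhaustion are illustrative assumptions, while MonitoringHubStartError is the real exception imported from parsl.monitoring.errors as shown above.

    from parsl.monitoring.errors import MonitoringHubStartError

    def run_with_retries(runner, max_tries=5, **kwargs):
        """Call runner(**kwargs), retrying when Parsl's monitoring hub fails to start."""
        for attempt in range(1, max_tries + 1):
            try:
                runner(**kwargs)
                return
            except MonitoringHubStartError:
                # The monitoring hub can fail to come up transiently; report and retry.
                print(f"MonitoringHubStartError on attempt {attempt} of {max_tries}; retrying...")
        raise RuntimeError(f"Monitoring hub failed to start after {max_tries} attempts")

The diff itself inlines this loop in the __main__ block of multi_night_workflow.py rather than factoring out a helper, and it simply breaks out of the loop on success instead of raising after the final failed attempt.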