Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/kbmod_wf/multi_night_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import toml
import parsl
from parsl import python_app, File
from parsl.monitoring.errors import MonitoringHubStartError
import parsl.executors

from kbmod_wf.utilities import (
Expand Down Expand Up @@ -153,5 +154,11 @@ def workflow_runner(env=None, runtime_config={}):
if args.runtime_config is not None and os.path.exists(args.runtime_config):
with open(args.runtime_config, "r") as toml_runtime_config:
runtime_config = toml.load(toml_runtime_config)
max_tries = 5
for i in range(max_tries):
try:
workflow_runner(env=args.env, runtime_config=runtime_config)
break
except MonitoringHubStartError as msg:
print(f"Got MonitoringHubStartError during attempt {i} of {max_tries}. Will try again...")

workflow_runner(env=args.env, runtime_config=runtime_config)
4 changes: 2 additions & 2 deletions src/kbmod_wf/resource_configs/usdf_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

max_ram_dict = {"ada":350, # 351 Gb total, with 5 GPUs total on the one node, leaves 70 Gb per task
"ampere":952, # 896 per each of the two nodes we can access, each with 4 GPUs
"roma":140, # 240 to 140
"roma":140, # 240 to 140; to 240 5/20/2025 COC; RAM fixes, back to 140 5/23/2025 COC
"milano":140 # 240 to 140
}
max_block_dict = {"ada":1, "ampere":2}
Expand Down Expand Up @@ -75,7 +75,7 @@ def usdf_resource_config():
partition=cpu_partition, # or ada?; see resource notes at top
account=account_name,
min_blocks=0,
max_blocks=20, # 12 to 20 4/16/2025 COC
max_blocks=10, # 12 to 20 4/16/2025 COC; 20 to 10 while debugging RAM issues 5/20/2025 COC
init_blocks=0,
parallelism=1,
nodes_per_block=1,
Expand Down
Loading