Skip to content

Commit 88e708e

Browse files
committed
Add error to point user to slurm resume log
1 parent e45dfd8 commit 88e708e

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

src/slurm_plugin/instance_manager.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -848,7 +848,7 @@ def _best_effort_node_assignment(
848848
# No instances launched at all, e.g. CreateFleet API returns no EC2 instances,
849849
# or no left instances available from a best-effort EC2 launch
850850
logger.info("No launched instances found for nodes %s", print_with_count(nodes_resume_list))
851-
self._update_failed_nodes(set(nodes_resume_list), "InsufficientInstanceCapacity", override=False)
851+
self._update_failed_nodes(set(nodes_resume_list), "InsufficientInstanceCapacity(Check slurm_resume log for ec2 error codes)", override=False)
852852

853853
def _all_or_nothing_node_assignment(
854854
self,
@@ -903,7 +903,7 @@ def _all_or_nothing_node_assignment(
903903
# No instances launched at all, e.g. CreateFleet API returns no EC2 instances,
904904
# or no left instances available from a best-effort EC2 launch
905905
logger.info("No launched instances found for nodes %s", print_with_count(nodes_resume_list))
906-
self._update_failed_nodes(set(nodes_resume_list), "InsufficientInstanceCapacity", override=False)
906+
self._update_failed_nodes(set(nodes_resume_list), "InsufficientInstanceCapacity(Check slurm_resume log for ec2 error codes)", override=False)
907907

908908
def _launch_instances( # noqa: C901
909909
self,
@@ -986,7 +986,7 @@ def _launch_ec2_instances(self, batch_nodes, compute_resource, fleet_manager, in
986986
# queue_2: {cr_3: list[EC2Instance]}
987987
# }
988988
else:
989-
self._update_failed_nodes(set(batch_nodes), "InsufficientInstanceCapacity")
989+
self._update_failed_nodes(set(batch_nodes), "InsufficientInstanceCapacity(Check slurm_resume log for ec2 error codes)")
990990
return launched_ec2_instances
991991

992992
def _get_fleet_manager(self, all_or_nothing_batch, compute_resource, queue):

0 commit comments

Comments
 (0)