
Commit ecaffb8

Refactor logic in _is_node_in_replacement_valid() to account for node.instance being None (#620)
1 parent b5c8bb5 commit ecaffb8

2 files changed, 10 insertions(+), 3 deletions(-)

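Context for the fix (not part of the commit): before this change, the replacement-timeout check dereferenced node.instance.launch_time unconditionally, so a static node still waiting for its EC2 instance (node.instance is None) raised AttributeError. A minimal sketch of that failure mode, using a simplified stand-in for the time_is_up helper rather than the real slurm_plugin code:

# Minimal sketch of the failure the commit addresses; time_is_up below is a
# simplified stand-in for the real helper, not its actual implementation.
from datetime import datetime, timezone


def time_is_up(initial_time, current_time, grace_time):
    # True once more than grace_time seconds have passed since initial_time
    return (current_time - initial_time).total_seconds() > grace_time


instance = None  # a static node in replacement may have no backing instance yet
now = datetime.now(timezone.utc)

try:
    # Pre-fix behavior: launch_time is read without checking for None
    time_is_up(instance.launch_time, now, grace_time=1800)
except AttributeError as err:
    print(err)  # 'NoneType' object has no attribute 'launch_time'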

src/slurm_plugin/clustermgtd.py

Lines changed: 9 additions & 2 deletions
@@ -1207,6 +1207,7 @@ def _is_node_in_replacement_valid(self, node: SlurmNode, check_node_is_valid):
         If check_node_is_valid=True, check whether a node is in replacement,
         If check_node_is_valid=False, check whether a node is replacement timeout.
         """
+        log.debug(f"Checking if node is in replacement {node}")
         if (
             node.is_backing_instance_valid(
                 self._config.ec2_instance_missing_max_count,
@@ -1215,9 +1216,15 @@ def _is_node_in_replacement_valid(self, node: SlurmNode, check_node_is_valid):
             )
             and node.name in self._static_nodes_in_replacement
         ):
-            time_is_expired = time_is_up(
-                node.instance.launch_time, self._current_time, grace_time=self._config.node_replacement_timeout
+            # Set `time_is_expired` to `False` if `node.instance` is `None` since we don't have a launch time yet
+            time_is_expired = (
+                False
+                if not node.instance
+                else time_is_up(
+                    node.instance.launch_time, self._current_time, grace_time=self._config.node_replacement_timeout
+                )
             )
+            log.debug(f"Node {node} is in replacement and timer expired? {time_is_expired}, instance? {node.instance}")
             return not time_is_expired if check_node_is_valid else time_is_expired
         return False
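The refactored expression treats a missing instance as "timer not expired", so a node awaiting launch keeps its in-replacement status instead of crashing clustermgtd. A hedged sketch exercising the new guard with stand-in objects (the instance objects and the 1800-second timeout below are illustrative, not the real classes or config):

# Sketch of the post-fix guard; SimpleNamespace stands in for the real
# EC2 instance object, and time_is_up is a simplified stand-in helper.
from datetime import datetime, timedelta, timezone
from types import SimpleNamespace


def time_is_up(initial_time, current_time, grace_time):
    return (current_time - initial_time).total_seconds() > grace_time


now = datetime.now(timezone.utc)
node_replacement_timeout = 1800  # seconds, illustrative value

cases = {
    "no instance yet": None,
    "fresh instance": SimpleNamespace(launch_time=now - timedelta(seconds=60)),
    "stale instance": SimpleNamespace(launch_time=now - timedelta(seconds=7200)),
}

for label, instance in cases.items():
    # Same shape as the committed expression: None short-circuits to False
    time_is_expired = (
        False
        if not instance
        else time_is_up(instance.launch_time, now, grace_time=node_replacement_timeout)
    )
    print(f"{label}: time_is_expired={time_is_expired}")
# no instance yet: False, fresh instance: False, stale instance: True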

src/slurm_plugin/slurm_resources.py

Lines changed: 1 addition & 1 deletion
@@ -474,7 +474,7 @@ def is_backing_instance_valid(
             if log_warn_if_unhealthy:
                 logger.warning(
                     f"Incrementing missing EC2 instance count for node {self.name} to "
-                    f"{nodes_without_backing_instance_count_map[self.name]}."
+                    f"{nodes_without_backing_instance_count_map[self.name].count}."
                 )
             else:
                 # Remove the slurm node from the map since the instance is healthy
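The one-line change here suggests the values in nodes_without_backing_instance_count_map are counter objects rather than plain integers, so interpolating the map entry directly put the object's repr into the warning instead of the number. A sketch of that distinction; MissingInstanceCount and the node name are hypothetical stand-ins, not the repo's actual type:

# Illustrative only: MissingInstanceCount is a hypothetical stand-in for
# whatever type slurm_resources.py actually stores in the map.
from dataclasses import dataclass


@dataclass
class MissingInstanceCount:
    count: int


nodes_without_backing_instance_count_map = {"queue1-st-cr1-1": MissingInstanceCount(count=2)}
name = "queue1-st-cr1-1"  # hypothetical node name

# Before the fix: the whole object repr lands in the warning text
print(f"Incrementing missing EC2 instance count for node {name} to "
      f"{nodes_without_backing_instance_count_map[name]}.")
# -> ... to MissingInstanceCount(count=2).

# After the fix: only the integer count is logged
print(f"Incrementing missing EC2 instance count for node {name} to "
      f"{nodes_without_backing_instance_count_map[name].count}.")
# -> ... to 2.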
