Skip to content

Commit e52e25b

Browse files
authored
[3.6] Avoid duplication of nodes in ClusterManager._find_active_nodes() (#537)
This is relevant only in cases when the Slurm partition configuration is edited and PC-managed nodes are added to multiple custom partitions. Add unit test for `ClusterManager._find_active_nodes()`. Signed-off-by: Jacopo De Amicis <[email protected]>
1 parent dc340b4 commit e52e25b

File tree

3 files changed

+128
-1
lines changed

3 files changed

+128
-1
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ This file is used to list changes made in each version of the aws-parallelcluste
66
3.6.1
77
------
88

9+
**CHANGES**
10+
- Avoid duplication of nodes seen by ClusterManager if compute nodes are added to multiple Slurm partitions.
11+
912
**BUG FIXES**
1013
- Fix fast insufficient capacity fail-over logic when using Multiple Instance Types and no instances are returned
1114

src/slurm_plugin/clustermgtd.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1127,7 +1127,7 @@ def _find_active_nodes(partitions_name_map):
11271127
for partition in partitions_name_map.values():
11281128
if partition.state != "INACTIVE":
11291129
active_nodes += partition.slurm_nodes
1130-
return active_nodes
1130+
return list(dict.fromkeys(active_nodes))
11311131

11321132
def _is_node_in_replacement_valid(self, node, check_node_is_valid):
11331133
"""

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3729,3 +3729,127 @@ def test_find_unhealthy_slurm_nodes(
37293729
assert_that(unhealthy_dynamic_nodes).is_equal_to(expected_unhealthy_dynamic_nodes)
37303730
assert_that(unhealthy_static_nodes).is_equal_to(expected_unhealthy_static_nodes)
37313731
assert_that(ice_compute_resources_and_nodes_map).is_equal_to(expected_ice_compute_resources_and_nodes_map)
3732+
3733+
3734+
@pytest.mark.parametrize(
3735+
"partitions_name_map, expected_nodelist",
3736+
[
3737+
pytest.param(
3738+
{
3739+
"queue1": SimpleNamespace(
3740+
name="queue1",
3741+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3742+
state="UP",
3743+
slurm_nodes=[
3744+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3745+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3746+
],
3747+
),
3748+
"queue2": SimpleNamespace(
3749+
name="queue2",
3750+
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
3751+
state="UP",
3752+
slurm_nodes=[
3753+
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3754+
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3755+
],
3756+
),
3757+
},
3758+
[
3759+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3760+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3761+
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3762+
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3763+
],
3764+
id="Two non-overlapping partitions",
3765+
),
3766+
pytest.param(
3767+
{
3768+
"queue1": SimpleNamespace(
3769+
name="queue1",
3770+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3771+
state="UP",
3772+
slurm_nodes=[
3773+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3774+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3775+
],
3776+
),
3777+
"custom_partition": SimpleNamespace(
3778+
name="custom_partition",
3779+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3780+
state="UP",
3781+
slurm_nodes=[
3782+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3783+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3784+
],
3785+
),
3786+
},
3787+
[
3788+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3789+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3790+
],
3791+
id="Two overlapping partitions obtained by creating a custom partition that includes nodes from a "
3792+
"PC-managed queue",
3793+
),
3794+
pytest.param(
3795+
{},
3796+
[],
3797+
id="Empty cluster with no partitions",
3798+
),
3799+
pytest.param(
3800+
{
3801+
"queue1": SimpleNamespace(
3802+
name="queue1",
3803+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3804+
state="UP",
3805+
slurm_nodes=[
3806+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3807+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3808+
],
3809+
),
3810+
"queue2": SimpleNamespace(
3811+
name="queue2",
3812+
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
3813+
state="INACTIVE",
3814+
slurm_nodes=[
3815+
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3816+
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3817+
],
3818+
),
3819+
},
3820+
[
3821+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3822+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3823+
],
3824+
id="Two non-overlapping partitions, one active and one inactive",
3825+
),
3826+
pytest.param(
3827+
{
3828+
"queue1": SimpleNamespace(
3829+
name="queue1",
3830+
nodenames="queue1-st-cr1-1,queue1-st-cr1-2",
3831+
state="INACTIVE",
3832+
slurm_nodes=[
3833+
StaticNode(name="queue1-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3834+
StaticNode(name="queue1-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3835+
],
3836+
),
3837+
"queue2": SimpleNamespace(
3838+
name="queue2",
3839+
nodenames="queue2-st-cr1-1,queue2-st-cr1-2",
3840+
state="INACTIVE",
3841+
slurm_nodes=[
3842+
StaticNode(name="queue2-st-cr1-1", nodeaddr="", nodehostname="", state=""),
3843+
StaticNode(name="queue2-st-cr1-2", nodeaddr="", nodehostname="", state=""),
3844+
],
3845+
),
3846+
},
3847+
[],
3848+
id="Two inactive non-overlapping partitions",
3849+
),
3850+
],
3851+
)
3852+
def test_find_active_nodes(partitions_name_map, expected_nodelist):
3853+
"""Unit test for the `ClusterManager._find_active_nodes()` method."""
3854+
result_nodelist = ClusterManager._find_active_nodes(partitions_name_map)
3855+
assert_that(result_nodelist).is_equal_to(expected_nodelist)

0 commit comments

Comments
 (0)