Skip to content

Commit dc340b4

Browse files
NSsirena authored and gmarciani committed
Fix FleetManager behavior when create-fleet returns no instances
In case of ICE, create-fleet doesn't raise an exception and the default behavior isn't triggered. This change forces the failure reason to ICE when create-fleet doesn't return any instance. Signed-off-by: Nicola Sirena <[email protected]>
1 parent e7839ae commit dc340b4

File tree

4 files changed

+71
-11
lines changed

4 files changed

+71
-11
lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ This file is used to list changes made in each version of the aws-parallelcluste
66
3.6.1
77
------
88

9-
**CHANGES**
10-
- There were no changes for this version.
9+
**BUG FIXES**
10+
- Fix fast insufficient capacity fail-over logic when using Multiple Instance Types and no instances are returned
1111

1212
3.6.0
1313
------

src/slurm_plugin/instance_manager.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -167,11 +167,20 @@ def _update_slurm_node_addrs(self, slurm_nodes, launched_instances):
167167
print_with_count(zip(launched_nodes, launched_instances)),
168168
)
169169
if fail_launch_nodes:
170-
logger.warning(
171-
"Failed to launch instances due to limited EC2 capacity for following nodes: %s",
172-
print_with_count(fail_launch_nodes),
173-
)
174-
self._update_failed_nodes(set(fail_launch_nodes), "LimitedInstanceCapacity")
170+
if launched_nodes:
171+
logger.warning(
172+
"Failed to launch instances due to limited EC2 capacity for following nodes: %s",
173+
print_with_count(fail_launch_nodes),
174+
)
175+
self._update_failed_nodes(set(fail_launch_nodes), "LimitedInstanceCapacity")
176+
else:
177+
# EC2 Fleet doesn't trigger any exception in case of ICEs and may return more than one error
178+
# for each request. So when no instances were launched we force the reason to ICE
179+
logger.error(
180+
"Failed to launch instances due to limited EC2 capacity for following nodes: %s",
181+
print_with_count(fail_launch_nodes),
182+
)
183+
self._update_failed_nodes(set(fail_launch_nodes), "InsufficientInstanceCapacity")
175184

176185
return dict(zip(launched_nodes, launched_instances))
177186

tests/slurm_plugin/test_fleet_manager.py

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,32 @@ def test_launch_instances(self, boto3_stubber, launch_params, mocked_boto3_reque
219219

220220
# -------- Ec2CreateFleetManager ------
221221

222+
test_fleet_exception_params = {
223+
"LaunchTemplateConfigs": [
224+
{
225+
"LaunchTemplateSpecification": {"LaunchTemplateName": "hit-queue1-fleet-spot", "Version": "$Latest"},
226+
"Overrides": [
227+
{
228+
"InstanceRequirements": {
229+
"VCpuCount": {"Min": 2},
230+
"MemoryMiB": {"Min": 2048},
231+
"AllowedInstanceTypes": ["inf*"],
232+
"AcceleratorManufacturers": ["nvidia"],
233+
}
234+
}
235+
],
236+
}
237+
],
238+
"SpotOptions": {
239+
"AllocationStrategy": "capacity-optimized",
240+
"SingleInstanceType": False,
241+
"SingleAvailabilityZone": True,
242+
"MinTargetCapacity": 1,
243+
},
244+
"TargetCapacitySpecification": {"TotalTargetCapacity": 5, "DefaultTargetCapacityType": "spot"},
245+
"Type": "instant",
246+
}
247+
222248
test_fleet_spot_params = {
223249
"LaunchTemplateConfigs": [
224250
{
@@ -486,6 +512,26 @@ def test_evaluate_launch_params(
486512
},
487513
],
488514
),
515+
# create-fleet - exception
516+
(
517+
test_fleet_exception_params,
518+
[
519+
MockedBoto3Request(
520+
method="create_fleet",
521+
response={
522+
"Instances": [],
523+
"Errors": [
524+
{"ErrorCode": "InvalidParameterValue", "ErrorMessage": "Insufficient capacity."}
525+
],
526+
"ResponseMetadata": {"RequestId": "1234-abcde"},
527+
},
528+
expected_params=test_fleet_exception_params,
529+
generate_error=True,
530+
error_code="InvalidParameterValue",
531+
),
532+
],
533+
[],
534+
),
489535
# normal - on-demand
490536
(
491537
test_on_demand_params,
@@ -549,7 +595,7 @@ def test_evaluate_launch_params(
549595
],
550596
),
551597
],
552-
ids=["fleet_spot", "fleet_ondemand"],
598+
ids=["fleet_spot", "fleet_exception", "fleet_ondemand"],
553599
)
554600
def test_launch_instances(
555601
self,
@@ -567,8 +613,13 @@ def test_launch_instances(
567613
"hit", "region", "boto3_config", FLEET_CONFIG, "queue2", "fleet-ondemand", False, {}, {}
568614
)
569615

570-
assigned_nodes = fleet_manager._launch_instances(launch_params)
571-
assert_that(assigned_nodes.get("Instances", [])).is_equal_to(expected_assigned_nodes)
616+
if mocked_boto3_request[0].generate_error:
617+
with pytest.raises(Exception) as e:
618+
fleet_manager._launch_instances(launch_params)
619+
assert isinstance(e, ClientError)
620+
else:
621+
assigned_nodes = fleet_manager._launch_instances(launch_params)
622+
assert_that(assigned_nodes.get("Instances", [])).is_equal_to(expected_assigned_nodes)
572623

573624
@pytest.mark.parametrize(
574625
("instance_ids", "mocked_boto3_request", "expected_exception", "expected_error", "expected_result"),

tests/slurm_plugin/test_instance_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -814,7 +814,7 @@ def test_add_instances(
814814
["queue1-st-c5xlarge-1"],
815815
{},
816816
None,
817-
{"LimitedInstanceCapacity": {"queue1-st-c5xlarge-1"}},
817+
{"InsufficientInstanceCapacity": {"queue1-st-c5xlarge-1"}},
818818
False,
819819
"dns.domain",
820820
),

0 commit comments

Comments
 (0)