|
| 1 | +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"). |
| 4 | +# You may not use this file except in compliance with the License. |
| 5 | +# A copy of the License is located at |
| 6 | +# |
| 7 | +# http://aws.amazon.com/apache2.0/ |
| 8 | +# |
| 9 | +# or in the "LICENSE.txt" file accompanying this file. |
| 10 | +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. |
| 11 | +# See the License for the specific language governing permissions and limitations under the License. |
| 12 | +import json |
| 13 | +import logging |
| 14 | +import re |
| 15 | + |
| 16 | +import boto3 |
| 17 | +import pytest |
| 18 | +from assertpy import assert_that |
| 19 | +from remote_command_executor import RemoteCommandExecutor |
| 20 | +from clusters_factory import Cluster |
| 21 | + |
| 22 | +from tests.common.schedulers_common import SlurmCommands |
| 23 | +from tests.common.utils import read_remote_file, terminate_nodes_manually |
| 24 | +from tests.common.assertions import assert_regex_in_file |
| 25 | + |
def submit_job_imex_status(rce: RemoteCommandExecutor, launch_template_id: str, queue_name: str, max_nodes: int = 1):
    """
    Submit a Slurm job that runs `nvidia-imex-ctl` and wait for it to succeed.

    The job runs `nvidia-imex-ctl` against the IMEX config file whose name embeds `launch_template_id`.

    :param rce: remote command executor used to drive Slurm
    :param launch_template_id: identifier embedded in the IMEX config file name
    :param queue_name: Slurm partition the job is submitted to
    :param max_nodes: number of nodes requested for the job
    :return: the id of the submitted job
    """
    logging.info("Submitting job to check IMEX status")
    scheduler = SlurmCommands(rce)
    status_job_args = {
        "command": f"/usr/bin/nvidia-imex-ctl -a -q -c /opt/parallelcluster/shared/nvidia-imex/config_{launch_template_id}.cfg",
        "partition": queue_name,
        "nodes": max_nodes,
    }
    imex_job_id = scheduler.submit_command_and_assert_job_accepted(submit_command_args=status_job_args)
    # The job must both finish and succeed before its id is handed back to the caller.
    scheduler.wait_job_completed(imex_job_id)
    scheduler.assert_job_succeeded(imex_job_id)
    return imex_job_id
| 39 | + |
def assert_imex_node_config(rce: RemoteCommandExecutor, launch_template_id: str, expected_ips: list):
    """
    Assert that the IMEX nodes config file lists exactly the expected IP addresses.

    The file `/opt/parallelcluster/shared/nvidia-imex/nodes_config_<launch_template_id>.cfg` is read through
    `rce` and parsed as one entry per line, with surrounding whitespace stripped from each entry.

    :param rce: remote command executor used to read the file
    :param launch_template_id: identifier embedded in the nodes config file name
    :param expected_ips: IP addresses the file must contain; every entry in the file must be one of them
        and each of them must appear in the file. NOTE(review): must not be empty, since assertpy's
        `contains_only` is expected to reject an empty argument list - confirm against assertpy docs.
    """
    logging.info(f"Checking IMEX nodes config contains the expected nodes: {expected_ips}")
    imex_nodes_config_file = f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{launch_template_id}.cfg"
    imex_config_content = read_remote_file(rce, imex_nodes_config_file)
    actual_ips = [ip.strip() for ip in imex_config_content.strip().split("\n")]
    # contains_only takes the allowed items as varargs. Passing the list itself would compare every
    # entry of the file against one single list object, which can never equal a string IP.
    assert_that(actual_ips).contains_only(*expected_ips)
| 46 | + |
| 47 | + |
def assert_imex_healthy(cluster: Cluster, queue_name: str, compute_resource_name: str, max_nodes: int = 1):
    """
    Assert that IMEX is configured and reports every node of the given compute resource as READY.

    Steps, in order:
    1. Resolve the launch template logical id and the private IPs of the compute nodes.
    2. Run the IMEX status job and check the IMEX nodes config file against those IPs.
    3. Parse the job output (`slurm-<job_id>.out`) as JSON and check its `nodeStatus` entries:
       every reported `hostname` must be one of the IPs, and every IP must have an entry whose
       `message` is `READY`.
    4. Run `assert_regex_in_file` on the IMEX and prolog logs of every compute node.

    :param cluster: cluster under test
    :param queue_name: queue the compute resource belongs to
    :param compute_resource_name: compute resource whose nodes are checked
    :param max_nodes: number of nodes requested for the IMEX status job
    """
    executor = RemoteCommandExecutor(cluster)

    launch_template_id = cluster.get_compute_nodes_launch_template_logical_id(queue_name, compute_resource_name)
    logging.info(
        f"Launch template for compute nodes in queue {queue_name} and compute resource {compute_resource_name}: {launch_template_id}"
    )

    ips = cluster.get_compute_nodes_private_ip(queue_name, compute_resource_name)
    logging.info(
        f"Private IP addresses for compute nodes in queue {queue_name} and compute resource {compute_resource_name}: {ips}"
    )

    job_id = submit_job_imex_status(executor, launch_template_id, queue_name, max_nodes)

    assert_imex_node_config(executor, launch_template_id, ips)

    # The job output file is read with a relative path, as produced by the status job.
    job_stdout = executor.run_remote_command(f"cat slurm-{job_id}.out").stdout
    logging.info(f"IMEX status output: {job_stdout}")
    parsed_status = json.loads(job_stdout)

    logging.info(f"Checking that IMEX only sees the expected nodes: {ips}")
    node_statuses = parsed_status["nodeStatus"]
    sees_only_expected = not any(entry["hostname"] not in ips for entry in node_statuses)
    assert_that(sees_only_expected).is_true()

    def _is_reported_ready(ip):
        # An IP counts as ready as soon as one status entry matches it with message READY.
        for entry in node_statuses:
            if entry["hostname"] == ip and entry["message"] == "READY":
                return True
        return False

    logging.info(f"Checking that IMEX sees every node ready: {ips}")
    assert_that(all(_is_reported_ready(ip) for ip in ips)).is_true()

    # NOTE(review): whether the pattern has to match or must not match is decided by
    # assert_regex_in_file, which is not visible from this file.
    log_files = ("/var/log/nvidia-imex-verbose.log", "/var/log/parallelcluster/nvidia-imex-prolog.log")
    for node_ip in cluster.get_compute_nodes_private_ip(queue_name, compute_resource_name):
        for file_name in log_files:
            logging.info(f"Checking file {file_name} log does not contain any error")
            assert_regex_in_file(cluster, node_ip, file_name, r'^(?!.*(?:err|warn|fail)).*$')
| 82 | + |
def assert_imex_not_configured(cluster: Cluster, queue_name: str, compute_resource_name: str, max_nodes: int = 1):
    """
    Assert that the IMEX nodes config of the given compute resource still holds placeholder addresses.

    The IMEX status job is submitted first and must succeed; afterwards the nodes config file is
    expected to contain the `0.0.0.0` placeholder entries rather than real node IPs.

    :param cluster: cluster under test
    :param queue_name: queue the compute resource belongs to
    :param compute_resource_name: compute resource expected to have no IMEX configuration
    :param max_nodes: number of nodes requested for the IMEX status job
    """
    executor = RemoteCommandExecutor(cluster)

    launch_template_id = cluster.get_compute_nodes_launch_template_logical_id(queue_name, compute_resource_name)
    logging.info(
        f"Launch template for compute nodes in queue {queue_name} and compute resource {compute_resource_name}: {launch_template_id}"
    )

    # The returned job id is not needed here: only the nodes config file is inspected.
    submit_job_imex_status(executor, launch_template_id, queue_name, max_nodes)

    placeholder_ips = ["0.0.0.0", "0.0.0.0"]
    assert_imex_node_config(executor, launch_template_id, placeholder_ips)
| 92 | + |
| 93 | + |
@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
def test_gb200(pcluster_config_reader, file_reader, clusters_factory, test_datadir, s3_bucket_factory, region):
    """
    Test automated configuration of Nvidia IMEX.

    This test creates a cluster with the necessary custom actions to configure NVIDIA IMEX and verifies the following:
    1. On the compute resource supporting IMEX (q1-cr1), the IMEX nodes file is configured by the prolog,
       the IMEX service is healthy and no errors are reported in IMEX's or prolog's logs.
       Also, IMEX gets reconfigured when nodes belonging to the same compute resource get replaced.
    2. On the compute resource not supporting IMEX (q1-cr2), the IMEX nodes file is not configured by the prolog,
       keeping the default values, and IMEX is not started.

    The test prints the full IMEX status in the test log to facilitate troubleshooting.
    The test uses instance type g4dn to simulate a p6e-gb200 instance.
    This is a reasonable approximation for the test because the focus of the test is on IMEX configuration,
    which can be executed on g4dn as well.
    """
    max_queue_size = 2
    queue_name = "q1"
    compute_resource_with_imex = "cr1"
    compute_resource_without_imex = "cr2"

    # Create an S3 bucket for the custom action scripts
    bucket_name = s3_bucket_factory()
    scripts_bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)

    # Upload the prolog first: the rendered head node script receives the prolog key and the bucket name
    headnode_start_filename = "head_node_start.sh"
    prolog_filename = "nvidia-imex.prolog.sh"
    scripts_bucket.upload_file(str(test_datadir / prolog_filename), prolog_filename)
    rendered_head_node_script = file_reader(
        input_file=headnode_start_filename,
        output_file=f"{headnode_start_filename}.rendered",
        bucket_name=bucket_name,
        prolog_key=prolog_filename,
    )
    scripts_bucket.upload_file(rendered_head_node_script, headnode_start_filename)

    # TODO: Remove after testing: BEGIN: added compute custom action to force the configuration of IMEX
    compute_configured_filename = "compute_node_configured.sh"
    scripts_bucket.upload_file(str(test_datadir / compute_configured_filename), compute_configured_filename)
    # TODO: Remove after testing: END

    cluster_config = pcluster_config_reader(
        bucket_name=bucket_name,
        head_node_start_script=headnode_start_filename,
        compute_node_configured_script=compute_configured_filename,
        max_queue_size=max_queue_size,
        queue_name=queue_name,
        compute_resource_with_imex=compute_resource_with_imex,
        compute_resource_without_imex=compute_resource_without_imex,
    )
    cluster = clusters_factory(cluster_config)

    assert_imex_healthy(cluster, queue_name, compute_resource_with_imex, max_queue_size)

    # IMEX is not configured on compute resources that do not support it
    assert_imex_not_configured(cluster, queue_name, compute_resource_without_imex)

    # Forcefully terminate a compute node in the compute resource supporting IMEX
    # to simulate an outage that forces the replacement of the node and consequently the IMEX reconfiguration.
    imex_nodes = cluster.get_compute_nodes(queue_name, compute_resource_with_imex)
    instance_id_to_terminate = imex_nodes[0].get("InstanceId")
    terminate_nodes_manually([instance_id_to_terminate], region)
    assert_imex_healthy(cluster, queue_name, compute_resource_with_imex, max_queue_size)
0 commit comments