
Commit 37f558c

[Test] Add integration tests to validate support for GB200.
1 parent fff826e commit 37f558c

10 files changed (+717, -1 lines)


tests/integration-tests/clusters_factory.py

Lines changed: 46 additions & 0 deletions
@@ -19,6 +19,7 @@
 import boto3
 import yaml
 from framework.credential_providers import run_pcluster_command
+from remote_command_executor import RemoteCommandExecutor
 from retrying import retry
 from time_utils import minutes, seconds
 from utils import (
@@ -34,6 +35,15 @@
     retry_if_subprocess_error,
 )

+from tests.common.utils import read_remote_file
+
+TAG_CLUSTER_NAME = "parallelcluster:cluster-name"
+TAG_NODE_TYPE = "parallelcluster:node-type"
+TAG_QUEUE_NAME = "parallelcluster:queue-name"
+TAG_COMPUTE_RESOURCE_NAME = "parallelcluster:compute-resource-name"
+
+LAUNCH_TEMPLATES_CONFIG_FILE = "/opt/parallelcluster/shared/launch-templates-config.json"
+

 def suppress_and_log_exception(func):
     @functools.wraps(func)
@@ -253,6 +263,42 @@ def describe_cluster_instances(self, node_type=None, queue_name=None):
             logging.error("Failed when getting cluster instances with error:\n%s\nand output:\n%s", e.stderr, e.stdout)
             raise

+    def get_compute_nodes(self, queue_name: str = None, compute_resource_name: str = None, status: str = "running"):
+        """Return the EC2 instance details for compute nodes matching the provided criteria."""
+        ec2 = boto3.client("ec2", region_name=self.region)
+        filters = [
+            {"Name": f"tag:{TAG_CLUSTER_NAME}", "Values": [self.cfn_name]},
+            {"Name": f"tag:{TAG_NODE_TYPE}", "Values": ["Compute"]},
+            {"Name": "instance-state-name", "Values": [status]},
+        ]
+
+        if queue_name:
+            filters.append({"Name": f"tag:{TAG_QUEUE_NAME}", "Values": [queue_name]})
+        if compute_resource_name:
+            filters.append({"Name": f"tag:{TAG_COMPUTE_RESOURCE_NAME}", "Values": [compute_resource_name]})
+
+        return [i for r in ec2.describe_instances(Filters=filters).get("Reservations", []) for i in r.get("Instances", [])]
+
+    def get_compute_nodes_private_ip(
+        self, queue_name: str = None, compute_resource_name: str = None, status: str = "running"
+    ):
+        """Return the private IP address of compute nodes matching the provided criteria."""
+        return [i.get("PrivateIpAddress") for i in self.get_compute_nodes(queue_name, compute_resource_name, status)]
+
+    def get_compute_nodes_launch_template_logical_id(self, queue_name: str, compute_resource_name: str):
+        """Return the launch template logical id of compute nodes matching the provided criteria."""
+        launch_templates_config = json.loads(
+            read_remote_file(RemoteCommandExecutor(self), LAUNCH_TEMPLATES_CONFIG_FILE)
+        )
+        return (
+            launch_templates_config.get("Queues", {})
+            .get(queue_name, {})
+            .get("ComputeResources", {})
+            .get(compute_resource_name, {})
+            .get("LaunchTemplate", {})
+            .get("LogicalId")
+        )
+
     def get_cluster_instance_ids(self, node_type=None, queue_name=None):
         """Run pcluster describe-cluster-instances and collect instance ids."""
         instances = self.describe_cluster_instances(node_type=node_type, queue_name=queue_name)
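Note: the layout of /opt/parallelcluster/shared/launch-templates-config.json is not shown in this commit; the chain of .get() calls in get_compute_nodes_launch_template_logical_id implies a nesting along the lines of the sketch below, where the queue name, compute resource name, and LogicalId value are placeholders.

# Hypothetical shape of launch-templates-config.json implied by the helper above.
launch_templates_config = {
    "Queues": {
        "q1": {
            "ComputeResources": {
                "cr1": {"LaunchTemplate": {"LogicalId": "LaunchTemplateA1b2c3d4e5f6"}},
            },
        },
    },
}
# Same traversal as the helper; returns None if any level is missing.
logical_id = (
    launch_templates_config.get("Queues", {})
    .get("q1", {})
    .get("ComputeResources", {})
    .get("cr1", {})
    .get("LaunchTemplate", {})
    .get("LogicalId")
)
assert logical_id == "LaunchTemplateA1b2c3d4e5f6"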

tests/integration-tests/configs/develop.yaml

Lines changed: 7 additions & 0 deletions
@@ -280,6 +280,13 @@ test-suites:
           instances: [{{ common.instance("instance_type_1") }}]
           oss: [{{ OS_X86_6 }}]
           schedulers: [ "slurm" ]
+  gb200:
+    test_gb200.py::test_gb200:
+      dimensions:
+        - regions: [ "us-east-1" ]
+          instances: [ "g4dn.2xlarge" ]
+          oss: [ "alinux2023" ]
+          schedulers: [ "slurm" ]
   health_checks:
     test_gpu_health_checks.py::test_cluster_with_gpu_health_checks:
       dimensions:

tests/integration-tests/conftest.py

Lines changed: 26 additions & 0 deletions
@@ -579,6 +579,32 @@ def test_datadir(request, datadir):
     return datadir / "{0}/{1}".format(class_name, function_name)


+@pytest.fixture()
+def file_reader(test_datadir, request, vpc_stack):
+    """
+    Define a fixture to render file templates associated with the running test.
+
+    The template file for a given test is a generic file stored in the test_datadir folder.
+    The template can be written using the Jinja2 template engine.
+
+    :return: a _file_renderer(**kwargs) function which gets as input a dictionary of values to replace in the template
+    """
+
+    def _file_renderer(input_file: str = "script.sh", output_file: str = "script_rendered.sh", **kwargs):
+        input_file_path = test_datadir / input_file
+        if not os.path.isfile(input_file_path):
+            raise FileNotFoundError(f"Input file not found in the expected dir {input_file_path}")
+        output_file_path = test_datadir / output_file if output_file else input_file_path
+        default_values = _get_default_template_values(vpc_stack, request)
+        file_loader = FileSystemLoader(str(test_datadir))
+        env = SandboxedEnvironment(loader=file_loader)
+        rendered_template = env.get_template(input_file).render(**{**default_values, **kwargs})
+        output_file_path.write_text(rendered_template)
+        return output_file_path
+
+    return _file_renderer
+
+
 @pytest.fixture()
 def pcluster_config_reader(test_datadir, vpc_stack, request, region, instance, architecture):
     """

tests/integration-tests/tests/common/assertions.py

Lines changed: 9 additions & 1 deletion
@@ -9,12 +9,14 @@
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import re
 import time
 from typing import List, Union

 import boto3
 import pytest
 from assertpy import assert_that, soft_assertions
+from clusters_factory import Cluster
 from constants import NodeType
 from remote_command_executor import RemoteCommandExecutor
 from retrying import RetryError, retry
@@ -28,7 +30,7 @@
 )

 from tests.common.scaling_common import get_compute_nodes_allocation
-from tests.common.utils import get_ddb_item
+from tests.common.utils import get_ddb_item, read_remote_file


 @retry(wait_fixed=seconds(20), stop_max_delay=minutes(6))
@@ -422,3 +424,9 @@ def _assert_build_image_stack_deleted(stack_name, region, timeout_seconds=600, p
         time.sleep(poll_interval)

     pytest.fail(f"Timed-out waiting for stack {stack_name} deletion (last status: {last_status})")
+
+
+def assert_regex_in_file(cluster: Cluster, compute_node_ip: str, file_name: str, pattern: str):
+    rce = RemoteCommandExecutor(cluster, compute_node_ip)
+    file_content = read_remote_file(rce, file_name)
+    assert_that(bool(re.search(pattern, file_content, re.IGNORECASE))).is_true()
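Note: assert_regex_in_file reads a file from the given compute node over SSH and asserts that the pattern matches its content case-insensitively; test_gb200 later in this commit calls it roughly as sketched here, with a placeholder node IP.

assert_regex_in_file(
    cluster,                                            # Cluster object from clusters_factory
    "192.0.2.10",                                       # placeholder compute node private IP
    "/var/log/parallelcluster/nvidia-imex-prolog.log",  # remote file to scan
    r"^(?!.*(?:err|warn|fail)).*$",                     # pattern used by test_gb200 for IMEX logs
)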

tests/integration-tests/tests/common/utils.py

Lines changed: 9 additions & 0 deletions
@@ -536,3 +536,12 @@ def write_file(dirname, filename, content):
         f.write(content)
     logging.info(f"File written: {filepath}")
     return filepath
+
+
+def terminate_nodes_manually(instance_ids, region):
+    ec2_client = boto3.client("ec2", region_name=region)
+    for instance_id in instance_ids:
+        instance_states = ec2_client.terminate_instances(InstanceIds=[instance_id]).get("TerminatingInstances")[0]
+        assert_that(instance_states.get("InstanceId")).is_equal_to(instance_id)
+        assert_that(instance_states.get("CurrentState").get("Name")).is_in("shutting-down", "terminated")
+    logging.info("Terminated nodes: {}".format(instance_ids))
Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
+# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+import json
+import logging
+import re
+
+import boto3
+import pytest
+from assertpy import assert_that
+from clusters_factory import Cluster
+from remote_command_executor import RemoteCommandExecutor
+
+from tests.common.assertions import assert_regex_in_file
+from tests.common.schedulers_common import SlurmCommands
+from tests.common.utils import read_remote_file, terminate_nodes_manually
+
+
+def submit_job_imex_status(rce: RemoteCommandExecutor, launch_template_id: str, queue_name: str, max_nodes: int = 1):
+    logging.info("Submitting job to check IMEX status")
+    slurm_commands = SlurmCommands(rce)
+    job_id = slurm_commands.submit_command_and_assert_job_accepted(
+        submit_command_args={
+            "command": f"/usr/bin/nvidia-imex-ctl -a -q -c /opt/parallelcluster/shared/nvidia-imex/config_{launch_template_id}.cfg",
+            "partition": queue_name,
+            "nodes": max_nodes,
+        }
+    )
+    slurm_commands.wait_job_completed(job_id)
+    slurm_commands.assert_job_succeeded(job_id)
+    return job_id
+
+
+def assert_imex_node_config(rce: RemoteCommandExecutor, launch_template_id: str, expected_ips: list):
+    logging.info(f"Checking IMEX nodes config contains the expected nodes: {expected_ips}")
+    imex_nodes_config_file = f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{launch_template_id}.cfg"
+    imex_config_content = read_remote_file(rce, imex_nodes_config_file)
+    actual_ips = [ip.strip() for ip in imex_config_content.strip().split("\n")]
+    assert_that(actual_ips).contains_only(*expected_ips)
+
+
+def assert_imex_healthy(cluster: Cluster, queue_name: str, compute_resource_name: str, max_nodes: int = 1):
+    rce = RemoteCommandExecutor(cluster)
+
+    launch_template_id = cluster.get_compute_nodes_launch_template_logical_id(queue_name, compute_resource_name)
+    logging.info(
+        f"Launch template for compute nodes in queue {queue_name} and compute resource {compute_resource_name}: {launch_template_id}"
+    )
+
+    ips = cluster.get_compute_nodes_private_ip(queue_name, compute_resource_name)
+    logging.info(
+        f"Private IP addresses for compute nodes in queue {queue_name} and compute resource {compute_resource_name}: {ips}"
+    )
+
+    job_id = submit_job_imex_status(rce, launch_template_id, queue_name, max_nodes)
+
+    assert_imex_node_config(rce, launch_template_id, ips)
+
+    job_stdout = rce.run_remote_command(f"cat slurm-{job_id}.out").stdout
+    logging.info(f"IMEX status output: {job_stdout}")
+    imex_status = json.loads(job_stdout)
+
+    logging.info(f"Checking that IMEX only sees the expected nodes: {ips}")
+    assert_that(all(node["hostname"] in ips for node in imex_status["nodeStatus"])).is_true()
+
+    logging.info(f"Checking that IMEX sees every node ready: {ips}")
+    assert_that(
+        all(
+            any(node["hostname"] == ip and node["message"] == "READY" for node in imex_status["nodeStatus"])
+            for ip in ips
+        )
+    ).is_true()
+
+    for compute_node_ip in cluster.get_compute_nodes_private_ip(queue_name, compute_resource_name):
+        for file_name in ["/var/log/nvidia-imex-verbose.log", "/var/log/parallelcluster/nvidia-imex-prolog.log"]:
+            logging.info(f"Checking that log file {file_name} does not contain errors")
+            assert_regex_in_file(cluster, compute_node_ip, file_name, r"^(?!.*(?:err|warn|fail)).*$")
+
+
+def assert_imex_not_configured(cluster: Cluster, queue_name: str, compute_resource_name: str, max_nodes: int = 1):
+    rce = RemoteCommandExecutor(cluster)
+
+    launch_template_id = cluster.get_compute_nodes_launch_template_logical_id(queue_name, compute_resource_name)
+    logging.info(
+        f"Launch template for compute nodes in queue {queue_name} and compute resource {compute_resource_name}: {launch_template_id}"
+    )
+
+    submit_job_imex_status(rce, launch_template_id, queue_name, max_nodes)
+
+    assert_imex_node_config(rce, launch_template_id, ["0.0.0.0", "0.0.0.0"])
+
+
+@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
+def test_gb200(pcluster_config_reader, file_reader, clusters_factory, test_datadir, s3_bucket_factory, region):
+    """
+    Test the automated configuration of NVIDIA IMEX.
+
+    This test creates a cluster with the custom actions needed to configure NVIDIA IMEX and verifies the following:
+    1. On the compute resource supporting IMEX (q1-cr1), the IMEX nodes file is configured by the prolog,
+       the IMEX service is healthy and no errors are reported in the IMEX or prolog logs.
+       Also, IMEX gets reconfigured when nodes belonging to the same compute resource are replaced.
+    2. On the compute resource not supporting IMEX (q1-cr2), the IMEX nodes file is not configured by the prolog,
+       keeping the default values, and IMEX is not started.
+
+    The test prints the full IMEX status in the test log to facilitate troubleshooting.
+    The test uses the g4dn instance type to simulate a p6e-gb200 instance.
+    This is a reasonable approximation because the focus of the test is IMEX configuration,
+    which can be exercised on g4dn as well.
+    """
+    max_queue_size = 2
+
+    # Create an S3 bucket for custom action scripts
+    bucket_name = s3_bucket_factory()
+    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
+
+    # Upload files to the test bucket
+    headnode_start_filename = "head_node_start.sh"
+    prolog_filename = "nvidia-imex.prolog.sh"
+    bucket.upload_file(str(test_datadir / prolog_filename), prolog_filename)
+    head_node_start_script_rendered = file_reader(
+        input_file=headnode_start_filename,
+        output_file=f"{headnode_start_filename}.rendered",
+        bucket_name=bucket_name,
+        prolog_key=prolog_filename,
+    )
+    bucket.upload_file(str(head_node_start_script_rendered), headnode_start_filename)
+
+    # TODO: Remove after testing: BEGIN: added compute custom action to force the configuration of IMEX
+    compute_configured_filename = "compute_node_configured.sh"
+    bucket.upload_file(str(test_datadir / compute_configured_filename), compute_configured_filename)
+    # TODO: Remove after testing: END
+
+    queue_name = "q1"
+    compute_resource_with_imex = "cr1"
+    compute_resource_without_imex = "cr2"
+
+    cluster_config = pcluster_config_reader(
+        bucket_name=bucket_name,
+        head_node_start_script=headnode_start_filename,
+        compute_node_configured_script=compute_configured_filename,
+        max_queue_size=max_queue_size,
+        queue_name=queue_name,
+        compute_resource_with_imex=compute_resource_with_imex,
+        compute_resource_without_imex=compute_resource_without_imex,
+    )
+    cluster = clusters_factory(cluster_config)
+
+    assert_imex_healthy(cluster, queue_name, compute_resource_with_imex, max_queue_size)
+
+    # IMEX is not configured on compute resources that do not support it
+    assert_imex_not_configured(cluster, queue_name, compute_resource_without_imex)
+
+    # Forcefully terminate a compute node in the compute resource supporting IMEX
+    # to simulate an outage that forces the replacement of the node and consequently the IMEX reconfiguration.
+    terminate_nodes_manually(
+        [cluster.get_compute_nodes(queue_name, compute_resource_with_imex)[0].get("InstanceId")], region
+    )
+    assert_imex_healthy(cluster, queue_name, compute_resource_with_imex, max_queue_size)
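Note: the exact JSON emitted by nvidia-imex-ctl -a -q is not part of this commit; assert_imex_healthy above only relies on nodeStatus entries exposing hostname and message fields, so a minimal payload that would satisfy its assertions looks like the sketch below (placeholder IPs).

imex_status = {
    "nodeStatus": [
        {"hostname": "192.0.2.10", "message": "READY"},
        {"hostname": "192.0.2.11", "message": "READY"},
    ]
}
ips = ["192.0.2.10", "192.0.2.11"]
# Mirrors the two assertions in assert_imex_healthy: every reported node is expected,
# and every expected node is reported as READY.
assert all(node["hostname"] in ips for node in imex_status["nodeStatus"])
assert all(
    any(node["hostname"] == ip and node["message"] == "READY" for node in imex_status["nodeStatus"])
    for ip in ips
)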
