
Commit 281e6d6

Aditi2424 and adishaa authored

Add integ test for training CLI and SDK (#100)

* Add integ test for training cli
* Add integ test for training sdk
* relax pydantic version
* fix pydantic version
* return latest cluster and fix set cluster context test

---------

Co-authored-by: adishaa <[email protected]>

1 parent f27a3cb, commit 281e6d6

File tree

7 files changed: +386 -2 lines changed

.gitignore

Lines changed: 3 additions & 1 deletion

@@ -24,4 +24,6 @@ __pycache__/
 
 # Ignore all contents of result and results directories
 /result/
-/results/
+/results/
+
+.idea/

__init__.py

Whitespace-only changes.

setup.py

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@
         "ruff==0.6.2",
         "hera-workflows==5.16.3",
         "sagemaker-core<2.0.0",
-        "pydantic==2.11.7"
+        "pydantic>=2.10.6,<3.0.0"
     ],
     entry_points={
         "console_scripts": [
Lines changed: 162 additions & 0 deletions

@@ -0,0 +1,162 @@

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import pytest
import time

from sagemaker.hyperpod.cli.utils import setup_logger
from test.integration_tests.utils import execute_command
from test.integration_tests.abstract_integration_tests import AbstractIntegrationTests

logger = setup_logger(__name__)


class TestHypCLICommands(AbstractIntegrationTests):
    """Integration tests for HyperPod CLI using hyp commands."""

    def test_list_clusters(self, cluster_name):
        """Test listing clusters."""
        assert cluster_name

    def test_set_cluster_context(self, cluster_name):
        """Test setting cluster context."""
        result = execute_command([
            "hyp", "set-cluster-context",
            "--cluster-name", cluster_name
        ])
        assert result.returncode == 0
        context_line = result.stdout.strip().splitlines()[-1]
        assert any(text in context_line for text in ["Updated context", "Added new context"])

    def test_get_cluster_context(self, cluster_name):
        """Test getting the current cluster context."""
        result = execute_command(["hyp", "get-cluster-context"])
        assert result.returncode == 0

        context_output = result.stdout.strip()
        assert "Cluster context:" in context_output
        # Just verify we got a valid ARN without checking the specific name
        current_arn = context_output.split("Cluster context:")[-1].strip()
        assert "arn:aws:eks:" in current_arn

    def test_create_job(self, test_job_name, image_uri):
        """Test creating a PyTorch job using the correct CLI parameters."""
        result = execute_command([
            "hyp", "create", "hp-pytorch-job",
            "--version", "1.0",
            "--job-name", test_job_name,
            "--image", image_uri,
            "--pull-policy", "Always",
            "--tasks-per-node", "1",
            "--max-retry", "1"
        ])
        assert result.returncode == 0
        logger.info(f"Created job: {test_job_name}")

        # Wait a moment for the job to be created
        time.sleep(5)

    def test_list_jobs(self, test_job_name):
        """Test listing jobs and verifying the created job is present."""
        list_result = execute_command(["hyp", "list", "hp-pytorch-job"])
        assert list_result.returncode == 0

        # Check that the created job appears in the listing
        assert test_job_name in list_result.stdout
        logger.info("Successfully listed jobs")

    def test_list_pods(self, test_job_name):
        """Test listing pods for a specific job."""
        # Wait a moment to ensure pods are created
        time.sleep(10)

        list_pods_result = execute_command([
            "hyp", "list-pods", "hp-pytorch-job",
            "--job-name", test_job_name
        ])
        assert list_pods_result.returncode == 0

        # Verify the output contains the expected headers and the job name
        output = list_pods_result.stdout.strip()
        assert f"Pods for job: {test_job_name}" in output
        assert "POD NAME" in output
        assert "NAMESPACE" in output

        # Verify at least one pod is listed (pod names embed the job name)
        assert f"{test_job_name}-pod-" in output

        logger.info(f"Successfully listed pods for job: {test_job_name}")

    # @pytest.mark.skip(reason="Skipping since there is ")
    def test_get_logs(self, test_job_name):
        """Test getting logs for a specific pod in a job."""
        # First, get the pod name from the list-pods command
        list_pods_result = execute_command([
            "hyp", "list-pods", "hp-pytorch-job",
            "--job-name", test_job_name
        ])
        assert list_pods_result.returncode == 0

        # Extract the pod name from the output
        output_lines = list_pods_result.stdout.strip().split('\n')
        pod_name = None
        for line in output_lines:
            if f"{test_job_name}-pod-" in line:
                # The pod name is the first column of the line
                pod_name = line.split()[0].strip()
                break

        assert pod_name is not None, f"Could not find pod for job {test_job_name}"
        logger.info(f"Found pod: {pod_name}")

        # Now get logs for this pod
        get_logs_result = execute_command([
            "hyp", "get-logs", "hp-pytorch-job",
            "--job-name", test_job_name,
            "--pod-name", pod_name
        ])
        assert get_logs_result.returncode == 0

        # Verify the output contains the expected header
        logs_output = get_logs_result.stdout.strip()
        assert f"Listing logs for pod: {pod_name}" in logs_output

        logger.info(f"Successfully retrieved logs for pod: {pod_name}")

    def test_describe_job(self, test_job_name):
        """Test describing a specific job and verifying the output."""
        describe_result = execute_command(["hyp", "describe", "hp-pytorch-job", "--job-name", test_job_name])
        assert describe_result.returncode == 0

        # Check that the job name appears in the description output
        assert test_job_name in describe_result.stdout
        logger.info(f"Successfully described job: {test_job_name}")

    @pytest.mark.run(order=99)
    def test_delete_job(self, test_job_name):
        """Test deleting a job and verifying deletion."""
        delete_result = execute_command(["hyp", "delete", "hp-pytorch-job", "--job-name", test_job_name])
        assert delete_result.returncode == 0
        logger.info(f"Successfully deleted job: {test_job_name}")

        # Wait a moment for the job to be deleted
        time.sleep(5)

        # Verify the job is no longer listed
        list_result = execute_command(["hyp", "list", "hp-pytorch-job"])
        assert list_result.returncode == 0

        # The job name should no longer be in the output
        assert test_job_name not in list_result.stdout
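These tests shell out through execute_command, imported from test/integration_tests/utils, which is not part of this diff. A plausible minimal sketch of such a helper, assuming it simply wraps subprocess.run with captured text output (the real implementation may differ):

# Hypothetical stand-in for test/integration_tests/utils.execute_command;
# the real helper is not shown in this commit. Captured text output lets
# the tests assert on result.returncode and result.stdout.
import subprocess
from typing import List


def execute_command(command: List[str]) -> subprocess.CompletedProcess:
    """Run a CLI command, capturing stdout/stderr as text."""
    return subprocess.run(command, capture_output=True, text=True)

Note that @pytest.mark.run(order=99) on test_delete_job assumes an ordering plugin such as pytest-ordering is installed; without one, pytest ignores the marker and the deletion test may not run last.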

test/integration_tests/conftest.py

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@

import uuid
import pytest
import json
from test.integration_tests.utils import execute_command
from sagemaker.hyperpod.training import (
    HyperPodPytorchJob,
    Container,
    ReplicaSpec,
    Resources,
    RunPolicy,
    Spec,
    Template,
)
from sagemaker.hyperpod.common.config import Metadata

@pytest.fixture(scope="class")
def test_job_name():
    """Generate a unique job name for testing."""
    return f"test-pytorch-job-{str(uuid.uuid4())[:8]}"

@pytest.fixture(scope="class")
def image_uri():
    """Return a standard PyTorch image URI for testing."""
    return "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.2.0-cpu-py310-ubuntu20.04-sagemaker"

@pytest.fixture(scope="class")
def cluster_name():
    """Fixture to list clusters once and return the latest cluster name."""
    result = execute_command(["hyp", "list-cluster"])
    assert result.returncode == 0

    try:
        json_start = result.stdout.index('[')
        json_text = result.stdout[json_start:]
        clusters = json.loads(json_text)
    except Exception as e:
        raise AssertionError(f"Failed to parse cluster list JSON: {e}\nRaw Output:\n{result.stdout}")

    assert clusters, "No clusters found in list-cluster output"
    return clusters[-1]["Cluster"]

@pytest.fixture(scope="class")
def pytorch_job(test_job_name, image_uri):
    """Create a HyperPodPytorchJob instance for testing."""
    nproc_per_node = "1"
    replica_specs = [
        ReplicaSpec(
            name="pod",
            template=Template(
                spec=Spec(
                    containers=[
                        Container(
                            name="container-name",
                            image=image_uri,
                            image_pull_policy="Always",
                            resources=Resources(
                                requests={"nvidia.com/gpu": "0"},
                                limits={"nvidia.com/gpu": "0"},
                            ),
                            # command=[]
                        )
                    ]
                )
            ),
        )
    ]
    run_policy = RunPolicy(clean_pod_policy="None")

    pytorch_job = HyperPodPytorchJob(
        metadata=Metadata(name=test_job_name),
        nproc_per_node=nproc_per_node,
        replica_specs=replica_specs,
        run_policy=run_policy,
    )

    return pytorch_job
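For reference, the cluster_name fixture above assumes that hyp list-cluster prints a JSON array, possibly preceded by log lines, with the newest cluster last; this matches the "return latest cluster" fix in the commit message. An illustrative sketch of that parsing contract, where everything except the "Cluster" key is hypothetical:

# Illustrative only: the output shape the cluster_name fixture relies on.
# Only the "Cluster" key and last-is-latest ordering matter to the fixture.
import json

example_stdout = 'Fetching clusters...\n[{"Cluster": "integ-a"}, {"Cluster": "integ-b"}]'

clusters = json.loads(example_stdout[example_stdout.index('['):])
assert clusters[-1]["Cluster"] == "integ-b"  # the latest cluster wins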
Lines changed: 123 additions & 0 deletions

@@ -0,0 +1,123 @@

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import pytest
import time
import yaml

from sagemaker.hyperpod.training import (
    HyperPodPytorchJob,
    Container,
    ReplicaSpec,
    Resources,
    RunPolicy,
    Spec,
    Template,
)
from sagemaker.hyperpod.common.config import Metadata
from sagemaker.hyperpod.cli.utils import setup_logger
from test.integration_tests.abstract_integration_tests import AbstractIntegrationTests

logger = setup_logger(__name__)


class TestHyperPodTrainingSDK(AbstractIntegrationTests):
    """Integration tests for the HyperPod Training SDK."""

    def test_create_job(self, pytorch_job):
        """Test creating a PyTorch job using the SDK."""
        try:
            # The create() method doesn't return anything
            pytorch_job.create()
            logger.info(f"Job creation initiated: {pytorch_job.metadata.name}")

            # Wait for the job to be created and its status to become
            # available, retrying a few times with an increasing delay
            max_attempts = 5
            for attempt in range(1, max_attempts + 1):
                try:
                    logger.info(f"Waiting for job status to be available (attempt {attempt}/{max_attempts})...")
                    time.sleep(attempt * 5)  # 5, 10, 15, 20, 25 seconds

                    # Get the job directly instead of using refresh
                    HyperPodPytorchJob.get(pytorch_job.metadata.name, pytorch_job.metadata.namespace)

                    # If we got here without an exception, the job exists
                    logger.info(f"Job successfully created: {pytorch_job.metadata.name}")
                    return
                except Exception as e:
                    if "status" in str(e) and attempt < max_attempts:
                        logger.info(f"Status not available yet, retrying... ({e})")
                        continue
                    else:
                        raise

            # If we get here, we've exhausted our attempts
            pytest.fail(f"Job was created but status never became available after {max_attempts} attempts")
        except Exception as e:
            logger.error(f"Error creating job: {e}")
            pytest.fail(f"Failed to create job: {e}")

    def test_list_jobs(self, pytorch_job):
        """Test listing jobs and verifying the created job is present."""
        jobs = HyperPodPytorchJob.list()
        assert jobs is not None

        # Check that the created job's name is in the list
        job_names = [job.metadata.name for job in jobs]
        assert pytorch_job.metadata.name in job_names

    def test_refresh_job(self, pytorch_job):
        """Test refreshing a job and reading its status."""
        pytorch_job.refresh()
        time.sleep(15)
        assert pytorch_job.status is not None, "Job status should not be None"
        logger.info(f"Refreshed job status:\n{yaml.dump(pytorch_job.status)}")

    def test_list_pods(self, pytorch_job):
        """Test listing pods for a specific job."""
        pods = pytorch_job.list_pods()
        assert pods is not None

        # Check that at least one pod is listed
        assert len(pods) > 0

        # Store the first pod name for later use
        pytest.pod_name = pods[0]

        logger.info(f"Successfully listed pods: {pods}")

    def test_get_logs(self, pytorch_job):
        """Test getting logs for a specific pod in a job."""
        pod_name = getattr(pytest, "pod_name", None)
        if not pod_name:
            pytest.skip("No pod name available from previous test")

        logs = pytorch_job.get_logs_from_pod(pod_name)
        assert logs is not None

        logger.info(f"Successfully retrieved logs for pod: {pod_name}")

    def test_delete_job(self, pytorch_job):
        """Test deleting a job."""
        pytorch_job.delete()
        logger.info(f"Successfully deleted job: {pytorch_job.metadata.name}")

        # Wait a moment for the job to be deleted
        time.sleep(5)

        # Verify the job is no longer listed
        jobs = HyperPodPytorchJob.list()
        job_names = [job.metadata.name for job in jobs]
        assert pytorch_job.metadata.name not in job_names
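Outside of pytest, the create -> inspect -> delete lifecycle these tests walk through looks roughly like the following condensed sketch. The job name and "default" namespace are assumptions for illustration; the job itself would be constructed as in the pytorch_job fixture in conftest.py:

# Condensed sketch of the SDK flow exercised by the tests above.
# Assumes a job named "demo-pytorch-job" already exists in "default".
import time

from sagemaker.hyperpod.training import HyperPodPytorchJob

job = HyperPodPytorchJob.get("demo-pytorch-job", "default")  # fetch by name
job.refresh()                                                # pull the latest status
print(job.status)

pods = job.list_pods()                                       # pods backing the job
logs = job.get_logs_from_pod(pods[0])                        # logs from the first pod

job.delete()                                                 # tear the job down
time.sleep(5)                                                # give deletion time to settle
assert "demo-pytorch-job" not in [j.metadata.name for j in HyperPodPytorchJob.list()]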
