diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 1a42d3dd..0d32b02d 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,32 +1,30 @@
-# PR Approval Steps
-
-## For Requester
-
-1. Description
-   - [ ] Check the PR title and description for clarity. It should describe the changes made and the reason behind them.
-   - [ ] Ensure that the PR follows the contribution guidelines, if applicable.
-2. Security requirements
-   - [ ] Ensure that a Pull Request (PR) does not expose passwords and other sensitive information by using git-secrets and upload relevant evidence: https://github.com/awslabs/git-secrets
-   - [ ] Ensure commit has GitHub Commit Signature
-3. Manual review
-   1. Click on the Files changed tab to see the code changes. Review the changes thoroughly:
-      - [ ] Code Quality: Check for coding standards, naming conventions, and readability.
-      - [ ] Functionality: Ensure that the changes meet the requirements and that all necessary code paths are tested.
-      - [ ] Security: Check for any security issues or vulnerabilities.
-      - [ ] Documentation: Confirm that any necessary documentation (code comments, README updates, etc.) has been updated.
-4. Check for Merge Conflicts:
-   - [ ] Verify if there are any merge conflicts with the base branch. GitHub will usually highlight this. If there are conflicts, you should resolve them.
-
-## For Reviewer
-
-1. Go through `For Requester` section to double check each item.
-2. Request Changes or Approve the PR:
-   1. If the PR is ready to be merged, click Review changes and select Approve.
-   2. If changes are required, select Request changes and provide feedback. Be constructive and clear in your feedback.
-3. Merging the PR
-   1. Check the Merge Method:
-      1. Decide on the appropriate merge method based on your repository's guidelines (e.g., Squash and merge, Rebase and merge, or Merge).
-   2. Merge the PR:
-      1. Click the Merge pull request button.
-      2. Confirm the merge by clicking Confirm merge.
+## What's changing and why?
+
+
+## Before/After UX
+
+**Before:**
+
+
+**After:**
+
+
+## How was this change tested?
+
+
+
+## Are unit tests added?
+
+
+## Are integration tests added?
+
+
+## Reviewer Guidelines
+
+‼️ **Merge Requirements**: PRs with failing integration tests cannot be merged without justification.
+
+One of the following must be true:
+- [ ] All automated PR checks pass
+- [ ] Failed tests include local run results/screenshots proving they work
+- [ ] Changes are documentation-only
\ No newline at end of file
diff --git a/.github/workflows/codebuild-ci.yml b/.github/workflows/codebuild-ci.yml
index 518d5686..e7929125 100644
--- a/.github/workflows/codebuild-ci.yml
+++ b/.github/workflows/codebuild-ci.yml
@@ -2,8 +2,7 @@ name: PR Checks
on:
  pull_request_target:
    branches:
-      - "master*"
-      - "main*"
+      - "*"

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }}
diff --git a/.github/workflows/security-monitoring.yml b/.github/workflows/security-monitoring.yml
index bc80e244..bf3e1df8 100644
--- a/.github/workflows/security-monitoring.yml
+++ b/.github/workflows/security-monitoring.yml
@@ -73,7 +73,7 @@ jobs:
        uses: aws-actions/configure-aws-credentials@12e3392609eaaceb7ae6191b3f54bbcb85b5002b
        with:
          role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }}
-          aws-region: us-west-2
+          aws-region: us-east-2
      - name: Put Dependabot Alert Metric Data
        run: |
          if [ "${{ needs.check-dependabot-alerts.outputs.dependabot_alert_status }}" == "1" ]; then
diff --git a/.gitignore b/.gitignore
index f72c7e06..46ae4cc6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,14 +16,23 @@ __pycache__/
/.mypy_cache
/doc/_apidoc/
+doc/_build/

/build
/sagemaker-hyperpod/build
/sagemaker-hyperpod/.coverage
/sagemaker-hyperpod/.coverage.*
+/hyperpod-cluster-stack-template/build
+/hyperpod-pytorch-job-template/build
+/hyperpod-custom-inference-template/build
+/hyperpod-jumpstart-inference-template/build
+
# Ignore all contents of result and results directories
/result/
/results/

-.idea/
\ No newline at end of file
+.idea/
+
+.venv*
+venv
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 00000000..7b186f4f
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,20 @@
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.9"
+
+python:
+  install:
+    - method: pip
+      path: .
+    - requirements: doc/requirements.txt
+
+sphinx:
+  configuration: doc/conf.py
+  fail_on_warning: false
+
+formats:
+  - pdf
+  - epub
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8262140d..731b83b9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,23 +1,76 @@
# Changelog

-## v2.0.0 (2024-12-04)
+## v3.3.0 (2025-09-23)

### Features

-- feature: The HyperPod CLI now support ([Hyperpod recipes](https://github.com/aws/sagemaker-hyperpod-recipes.git)). The HyperPod recipes enable customers to get started training and fine-tuning popular publicly-available foundation models like Llama 3.1 405B in minutes. Learn more ([here](https://github.com/aws/sagemaker-hyperpod-recipes.git)).
+  * Init Experience
+    * Init, Validate, and Create JumpStart endpoint, Custom endpoint, and PyTorch Training Job with local configuration
+  * Cluster management
+    * Bug fixes for cluster creation
+
-## v1.0.0 (2024-09-09)
+## v3.2.2 (2025-09-10)

### Features

-- feature: Add support for SageMaker HyperPod CLI
+  * Fix for production canary failures caused by bad training job template.
+  * New version for Health Monitoring Agent (1.0.790.0_1.0.266.0) with minor improvements and bug fixes.
+
+## v3.2.1 (2025-08-27)
+
+### Features
+
+  * Cluster management
+    * Bug fixes with cluster creation
+    * Enable cluster template to be installed with the HyperPod CLI.
+
+## v3.2.0 (2025-08-25)
+
+### Features
+
+  * Cluster management
+    * Creation of cluster stack
+    * Describing and listing a cluster stack
+    * Updating a cluster
+  * Init Experience
+    * Init, Validate, Create with local configurations
+
+
+## v3.1.0 (2025-08-13)
+
+### Features
+  * Task Governance feature for training jobs.
+
+
+## v3.0.2 (2025-07-31)
+### Features
+
+  * Update volume flag to support hostPath and PVC
+  * Add an option to disable the deployment of KubeFlow TrainingOperator
+  * Enable telemetry for CLI

-## v1.0.0] ([2025]-[07]-[10])
+## v3.0.0 (2025-07-10)

### Features
  * Training Job - Create, List , Get
  * Inference Jumpstart - Create , List, Get, Invoke
  * Inference Custom - Create , List, Get, Invoke
- * Observability changes
\ No newline at end of file
+  * Observability changes
+
+## v2.0.0 (2024-12-04)
+
+### Features
+
+- feature: The HyperPod CLI now supports [HyperPod recipes](https://github.com/aws/sagemaker-hyperpod-recipes.git). The HyperPod recipes enable customers to get started training and fine-tuning popular publicly-available foundation models like Llama 3.1 405B in minutes. Learn more [here](https://github.com/aws/sagemaker-hyperpod-recipes.git).
+
+## v1.0.0 (2024-09-09)
+
+### Features
+
+- feature: Add support for SageMaker HyperPod CLI
+
+
diff --git a/README.md b/README.md
index f59a428f..72e1bc6c 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
# SageMaker HyperPod command-line interface

-The Amazon SageMaker HyperPod command-line interface (HyperPod CLI) is a tool that helps manage training jobs on the SageMaker HyperPod clusters orchestrated by Amazon EKS.
+The Amazon SageMaker HyperPod command-line interface (HyperPod CLI) is a tool that helps manage clusters, training jobs, and inference endpoints on the SageMaker HyperPod clusters orchestrated by Amazon EKS.

This documentation serves as a reference for the available HyperPod CLI commands. For a comprehensive user guide, see [Orchestrating SageMaker HyperPod clusters with Amazon EKS](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-eks.html) in the *Amazon SageMaker Developer Guide*.

@@ -14,32 +14,39 @@ Note: Old `hyperpod`CLI V2 has been moved to `release_v2` branch. Please refer [
- [ML Framework Support](#ml-framework-support)
- [Installation](#installation)
- [Usage](#usage)
-  - [Getting Clusters](#getting-cluster-information)
-  - [Connecting to a Cluster](#connecting-to-a-cluster)
-  - [Getting Cluster Context](#getting-cluster-context)
-  - [Listing Pods](#listing-pods)
-  - [Accessing Logs](#accessing-logs)
-  - [CLI](#cli-)
-    - [Training](#training-)
-    - [Inference](#inference-)
-  - [SDK](#sdk-)
-    - [Training](#training-sdk)
-    - [Inference](#inference-sdk)
+  - [Getting Started](#getting-started)
+  - [CLI](#cli)
+    - [Cluster Management](#cluster-management)
+    - [Training](#training)
+    - [Inference](#inference)
+      - [Jumpstart Endpoint](#jumpstart-endpoint-creation)
+      - [Custom Endpoint](#custom-endpoint-creation)
+  - [SDK](#sdk)
+    - [Cluster Management](#cluster-management-sdk)
+    - [Training](#training-sdk)
+    - [Inference](#inference-sdk)
+- [Examples](#examples)

## Overview

The SageMaker HyperPod CLI is a tool that helps create training jobs and inference endpoint deployments to the Amazon SageMaker HyperPod clusters orchestrated by Amazon EKS. It provides a set of commands for managing the full lifecycle of jobs, including create, describe, list, and delete operations, as well as accessing pod and operator logs where applicable.
The CLI is designed to abstract away the complexity of working directly with Kubernetes for these core actions of managing jobs on SageMaker HyperPod clusters orchestrated by Amazon EKS.

-## Prerequisites for Training
+## Prerequisites
+
+### Region Configuration
+
+**Important**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration.
+
+### Prerequisites for Training

- HyperPod CLI currently supports starting PyTorchJobs. To start a job, you need to install Training Operator first.
  - You can follow [pytorch operator doc](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator-install.html) to install it.

-## Prerequisites for Inference
+### Prerequisites for Inference

- HyperPod CLI supports creating Inference Endpoints through jumpstart and through custom Endpoint config
-  - You can follow [inference operator doc](https://github.com/aws/sagemaker-hyperpod-cli/tree/master/helm_chart/HyperPodHelmChart/charts/inference-operator) to install it.
+  - You can follow [inference operator doc](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-model-deployment-setup.html) to install it.

## Platform Support

@@ -54,26 +61,15 @@ SageMaker HyperPod CLI currently supports start training job with:

1. Make sure that your local python version is 3.8, 3.9, 3.10 or 3.11.

-1. Install ```helm```.
+2. Install the sagemaker-hyperpod-cli package.

-   The SageMaker Hyperpod CLI uses Helm to start training jobs. See also the [Helm installation guide](https://helm.sh/docs/intro/install/).
-
-   ```
-   curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
-   chmod 700 get_helm.sh
-   ./get_helm.sh
-   rm -f ./get_helm.sh
-   ```
-
-1. Clone and install the sagemaker-hyperpod-cli package.
-
-   ```
+   ```bash
   pip install sagemaker-hyperpod
   ```

-1. Verify if the installation succeeded by running the following command.
+3. Verify that the installation succeeded by running the following command.

-   ```
+   ```bash
   hyp --help
   ```

@@ -81,85 +77,208 @@ SageMaker HyperPod CLI currently supports start training job with:

The HyperPod CLI provides the following commands:

-- [Getting Clusters](#getting-cluster-information)
-- [Connecting to a Cluster](#connecting-to-a-cluster)
-- [Getting Cluster Context](#getting-cluster-context)
-- [Listing Pods](#listing-pods)
-- [Accessing Logs](#accessing-logs)
-- [CLI](#cli-)
-  - [Training](#training-)
-  - [Inference](#inference-)
-- [SDK](#sdk-)
+- [Getting Started](#getting-started)
+- [CLI](#cli)
+  - [Cluster Management](#cluster-management)
+  - [Training](#training)
+  - [Inference](#inference)
+    - [Jumpstart Endpoint](#jumpstart-endpoint-creation)
+    - [Custom Endpoint](#custom-endpoint-creation)
+- [SDK](#sdk)
+  - [Cluster Management](#cluster-management-sdk)
  - [Training](#training-sdk)
  - [Inference](#inference-sdk)

-### Getting Cluster information
+### Getting Started
+
+#### Getting Cluster information

This command lists the available SageMaker HyperPod clusters and their capacity information.

-```
-hyp list-cluster [--region ] [--namespace ] [--output ]
+```bash
+hyp list-cluster
```

-* `region` (string) - Optional. The region that the SageMaker HyperPod and EKS clusters are located. If not specified, it will be set to the region from the current AWS account credentials.
-* `namespace` (string) - Optional. The namespace that users want to check the quota with. Only the SageMaker managed namespaces are supported.
-* `output` (enum) - Optional. The output format. Available values are `table` and `json`. The default value is `json`.
+| Option | Type | Description |
+|--------|------|-------------|
+| `--region <region>` | Optional | The region where the SageMaker HyperPod and EKS clusters are located. If not specified, it will be set to the region from the current AWS account credentials. |
+| `--namespace <namespace>` | Optional | The namespace that users want to check the quota with. Only the SageMaker managed namespaces are supported. |
+| `--output <output>` | Optional | The output format. Available values are `table` and `json`. The default value is `json`. |
+| `--debug` | Optional | Enable debug mode for detailed logging. |

-### Connecting to a Cluster
+#### Connecting to a Cluster

This command configures the local Kubectl environment to interact with the specified SageMaker HyperPod cluster and namespace.

-```
-hyp set-cluster-context --cluster-name [--namespace ]
+```bash
+hyp set-cluster-context --cluster-name <cluster-name>
```

-* `cluster-name` (string) - Required. The SageMaker HyperPod cluster name to configure with.
-* `namespace` (string) - Optional. The namespace that you want to connect to. If not specified, Hyperpod cli commands will auto discover the accessible namespace.
+| Option | Type | Description |
+|--------|------|-------------|
+| `--cluster-name <cluster-name>` | Required | The SageMaker HyperPod cluster name to configure with. |
+| `--namespace <namespace>` | Optional | The namespace that you want to connect to. If not specified, HyperPod CLI commands will auto-discover the accessible namespace. |
+| `--region <region>` | Optional | The AWS region where the HyperPod cluster resides. |
+| `--debug` | Optional | Enable debug mode for detailed logging. |

-### Getting Cluster Context
+#### Getting Cluster Context

Get all the context related to the current set Cluster

-```
+```bash
hyp get-cluster-context
```

-### Listing Pods
+| Option | Type | Description |
+|--------|------|-------------|
+| `--debug` | Optional | Enable debug mode for detailed logging. |

-This command lists all the pods associated with a specific training job.
+## CLI
+
+### Cluster Management
+
+**Important**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration.
+
+**Cluster stack names must be unique within each AWS region.** If you attempt to create a cluster stack with a name that already exists in the same region, the deployment will fail.
+
+#### Initialize Cluster Configuration
+
+Initialize a new cluster configuration in the current directory:
+
+```bash
+hyp init cluster-stack
```
-hyp list-pods hyp-pytorch-job --job-name
+
+**Important**: The `resource_name_prefix` parameter in the generated `config.yaml` file serves as the primary identifier for all AWS resources created during deployment. Each deployment must use a unique resource name prefix to avoid conflicts. This prefix is automatically appended with a unique identifier during cluster creation to ensure resource uniqueness.
+
+#### Configure Cluster Parameters
+
+Configure cluster parameters interactively or via command line:
+
+```bash
+hyp configure --resource-name-prefix my-cluster --stage prod
```
-* `job-name` (string) - Required. The name of the job to list pods for.

+#### Validate Configuration

-### Accessing Logs
+Validate the configuration file syntax:

-This command retrieves the logs for a specific pod within a training job.
+```bash +hyp validate +``` + +#### Create Cluster Stack +Create the cluster stack using the configured parameters: + +```bash +hyp create --region ``` -hyp get-logs hyp-pytorch-job --pod-name --job-name + +**Note**: The region flag is optional. If not provided, the command will use the default region from your AWS credentials configuration. + +#### List Cluster Stacks + +```bash +hyp list cluster-stack +``` + +| Option | Type | Description | +|--------|------|-------------| +| `--region ` | Optional | The AWS region to list stacks from. | +| `--status "['CREATE_COMPLETE', 'UPDATE_COMPLETE']"` | Optional | Filter by stack status. | +| `--debug` | Optional | Enable debug mode for detailed logging. | + +#### Describe Cluster Stack + +```bash +hyp describe cluster-stack ``` -* `job-name` (string) - Required. The name of the job to get the log for. -* `pod-name` (string) - Required. The name of the pod to get the log from. +| Option | Type | Description | +|--------|------|-------------| +| `--region ` | Optional | The AWS region where the stack exists. | +| `--debug` | Optional | Enable debug mode for detailed logging. | + +#### Delete Cluster Stack + +Delete a HyperPod cluster stack. Removes the specified CloudFormation stack and all associated AWS resources. This operation cannot be undone. + +```bash + hyp delete cluster-stack +``` +| Option | Type | Description | +|--------|------|-------------| +| `--region ` | Required | The AWS region where the stack exists. | +| `--retain-resources S3Bucket-TrainingData,EFSFileSystem-Models` | Optional | Comma-separated list of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks). Resource names are shown in failed deletion output, or use AWS CLI: `aws cloudformation list-stack-resources STACK_NAME --region REGION`. | +| `--debug` | Optional | Enable debug mode for detailed logging. 
| -### CLI + +#### Update Existing Cluster + +```bash +hyp update cluster --cluster-name my-cluster \ + --instance-groups '[{"InstanceCount":2,"InstanceGroupName":"worker-nodes","InstanceType":"ml.m5.large"}]' \ + --node-recovery Automatic +``` + +#### Reset Configuration + +Reset configuration to default values: + +```bash +hyp reset +``` ### Training -#### Creating a Training Job +#### **Option 1**: Create Pytorch job through init experience +#### Initialize Pytorch Job Configuration + +Initialize a new pytorch job configuration in the current directory: + +```bash +hyp init hyp-pytorch-job ``` + +#### Configure Pytorch Job Parameters + +Configure pytorch job parameters interactively or via command line: + +```bash +hyp configure --job-name my-pytorch-job +``` + +#### Validate Configuration + +Validate the configuration file syntax: + +```bash +hyp validate +``` + +#### Create Pytorch Job + +Create the pytorch job using the configured parameters: + +```bash +hyp create +``` + + +#### **Option 2**: Create Pytorch job through create command + +```bash hyp create hyp-pytorch-job \ --version 1.0 \ --job-name test-pytorch-job \ --image pytorch/pytorch:latest \ - --command '["python", "train.py"]' \ - --args '["--epochs", "10", "--batch-size", "32"]' \ + --command '[python, train.py]' \ + --args '[--epochs=10, --batch-size=32]' \ --environment '{"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32"}' \ --pull-policy "IfNotPresent" \ --instance-type ml.p4d.24xlarge \ @@ -170,96 +289,410 @@ hyp create hyp-pytorch-job \ --queue-name "training-queue" \ --priority "high" \ --max-retry 3 \ - --volumes '["data-vol", "model-vol", "checkpoint-vol"]' \ - --persistent-volume-claims '["shared-data-pvc", "model-registry-pvc"]' \ - --output-s3-uri s3://my-bucket/model-artifacts + --accelerators 8 \ + --vcpu 96.0 \ + --memory 1152.0 \ + --accelerators-limit 8 \ + --vcpu-limit 96.0 \ + --memory-limit 1152.0 \ + --preferred-topology "topology.kubernetes.io/zone=us-west-2a" \ + --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ + --volume name=training-output,type=pvc,mount_path=/data2,claim_name=my-pvc,read_only=false ``` -Key required parameters explained: +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Unique name for the training job (1-63 characters, alphanumeric with hyphens) | +| `--image` | TEXT | Yes | Docker image URI containing your training code | +| `--namespace` | TEXT | No | Kubernetes namespace | +| `--command` | ARRAY | No | Command to run in the container (array of strings) | +| `--args` | ARRAY | No | Arguments for the entry script (array of strings) | +| `--environment` | OBJECT | No | Environment variables as key-value pairs | +| `--pull-policy` | TEXT | No | Image pull policy (Always, Never, IfNotPresent) | +| `--instance-type` | TEXT | No | Instance type for training | +| `--node-count` | INTEGER | No | Number of nodes (minimum: 1) | +| `--tasks-per-node` | INTEGER | No | Number of tasks per node (minimum: 1) | +| `--label-selector` | OBJECT | No | Node label selector as key-value pairs | +| `--deep-health-check-passed-nodes-only` | BOOLEAN | No | Schedule pods only on nodes that passed deep health check (default: false) | +| `--scheduler-type` | TEXT | No | Scheduler type | +| `--queue-name` | TEXT | No | Queue name for job scheduling (1-63 characters, alphanumeric with hyphens) | +| `--priority` | TEXT | No | Priority class for job scheduling | +| `--max-retry` | INTEGER | No | Maximum number of 
job retries (minimum: 0) | +| `--volume` | ARRAY | No | List of volume configurations (Refer [Volume Configuration](#volume-configuration) for detailed parameter info) | +| `--service-account-name` | TEXT | No | Service account name | +| `--accelerators` | INTEGER | No | Number of accelerators a.k.a GPUs or Trainium Chips | +| `--vcpu` | FLOAT | No | Number of vCPUs | +| `--memory` | FLOAT | No | Amount of memory in GiB | +| `--accelerators-limit` | INTEGER | No | Limit for the number of accelerators a.k.a GPUs or Trainium Chips | +| `--vcpu-limit` | FLOAT | No | Limit for the number of vCPUs | +| `--memory-limit` | FLOAT | No | Limit for the amount of memory in GiB | +| `--preferred-topology` | TEXT | No | Preferred topology annotation for scheduling | +| `--required-topology` | TEXT | No | Required topology annotation for scheduling | +| `--debug` | FLAG | No | Enable debug mode (default: false) | + +#### List Training Jobs + +```bash +hyp list hyp-pytorch-job +``` + +#### Describe a Training Job + +```bash +hyp describe hyp-pytorch-job --job-name +```` + +#### Listing Pods + +This command lists all the pods associated with a specific training job. + +```bash +hyp list-pods hyp-pytorch-job --job-name +``` + +* `job-name` (string) - Required. The name of the job to list pods for. + +#### Accessing Logs + +This command retrieves the logs for a specific pod within a training job. + +```bash +hyp get-logs hyp-pytorch-job --pod-name --job-name +``` - --job-name: Unique identifier for your training job +| Parameter | Required | Description | +|--------|------|-------------| +| `--job-name` | Yes | The name of the job to get the log for. | +| `--pod-name` | Yes | The name of the pod to get the log from. | +| `--namespace` | No | The namespace of the job. Defaults to 'default'. | +| `--container` | No | The container name to get logs from. | - --image: Docker image containing your training environment +#### Get Operator Logs -This command starts a training job named test-pytorch-job. The --output-s3-uri specifies where the trained model artifacts will be stored, for example, s3://my-bucket/model-artifacts. Note this location, as you’ll need it for deploying the custom model. 
+```bash +hyp get-operator-logs hyp-pytorch-job --since-hours 0.5 +``` + +#### Delete a Training Job + +```bash +hyp delete hyp-pytorch-job --job-name +``` ### Inference -#### Creating a JumpstartModel Endpoint +### Jumpstart Endpoint Creation -Pre-trained Jumpstart models can be gotten from https://sagemaker.readthedocs.io/en/v2.82.0/doc_utils/jumpstart.html and fed into the call for creating the endpoint +#### **Option 1**: Create jumpstart endpoint through init experience + +#### Initialize Jumpstart Endpoint Configuration +Initialize a new jumpstart endpoint configuration in the current directory: + +```bash +hyp init hyp-jumpstart-endpoint ``` + +#### Configure Jumpstart Endpoint Parameters + +Configure jumpstart endpoint parameters interactively or via command line: + +```bash +hyp configure --endpoint-name my-jumpstart-endpoint +``` + +#### Validate Configuration + +Validate the configuration file syntax: + +```bash +hyp validate +``` + +#### Create Jumpstart Endpoint + +Create the jumpstart endpoint using the configured parameters: + +```bash +hyp create +``` + + +#### **Option 2**: Create jumpstart endpoint through create command +Pre-trained Jumpstart models can be gotten from https://sagemaker.readthedocs.io/en/v2.82.0/doc_utils/jumpstart.html and fed into the call for creating the endpoint + +```bash hyp create hyp-jumpstart-endpoint \ --version 1.0 \ --model-id jumpstart-model-id\ --instance-type ml.g5.8xlarge \ - --endpoint-name endpoint-jumpstart \ - --tls-output-s3-uri s3://sample-bucket + --endpoint-name endpoint-jumpstart ``` +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--model-id` | TEXT | Yes | JumpStart model identifier (1-63 characters, alphanumeric with hyphens) | +| `--instance-type` | TEXT | Yes | EC2 instance type for inference (must start with "ml.") | +| `--namespace` | TEXT | No | Kubernetes namespace | +| `--metadata-name` | TEXT | No | Name of the jumpstart endpoint object | +| `--accept-eula` | BOOLEAN | No | Whether model terms of use have been accepted (default: false) | +| `--model-version` | TEXT | No | Semantic version of the model (e.g., "1.0.0", 5-14 characters) | +| `--endpoint-name` | TEXT | No | Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) | +| `--tls-certificate-output-s3-uri` | TEXT | No | S3 URI to write the TLS certificate (optional) | +| `--debug` | FLAG | No | Enable debug mode (default: false) | + #### Invoke a JumpstartModel Endpoint -``` +```bash hyp invoke hyp-jumpstart-endpoint \ --endpoint-name endpoint-jumpstart \ --body '{"inputs":"What is the capital of USA?"}' ``` + #### Managing an Endpoint -``` +```bash hyp list hyp-jumpstart-endpoint -hyp get hyp-jumpstart-endpoint --name endpoint-jumpstart +hyp describe hyp-jumpstart-endpoint --name endpoint-jumpstart +``` + +#### List Pods + +```bash +hyp list-pods hyp-jumpstart-endpoint ``` -#### Creating a Custom Inference Endpoint +#### Get Logs + +```bash +hyp get-logs hyp-jumpstart-endpoint --pod-name +``` + +#### Get Operator Logs + +```bash +hyp get-operator-logs hyp-jumpstart-endpoint --since-hours 0.5 +``` + +#### Deleting an Endpoint + +```bash +hyp delete hyp-jumpstart-endpoint --name endpoint-jumpstart +``` + + +### Custom Endpoint Creation +#### **Option 1**: Create custom endpoint through init experience + +#### Initialize Custom Endpoint Configuration + +Initialize a new custom endpoint configuration in the current directory: + +```bash +hyp init hyp-custom-endpoint +``` + +#### Configure Custom 
Endpoint Parameters + +Configure custom endpoint parameters interactively or via command line: +```bash +hyp configure --endpoint-name my-custom-endpoint ``` + +#### Validate Configuration + +Validate the configuration file syntax: + +```bash +hyp validate +``` + +#### Create Custom Endpoint + +Create the custom endpoint using the configured parameters: + +```bash +hyp create +``` + + +#### **Option 2**: Create custom endpoint through create command +```bash hyp create hyp-custom-endpoint \ --version 1.0 \ - --endpoint-name my-custom-endpoint \ + --endpoint-name endpoint-custom \ --model-name my-pytorch-model \ --model-source-type s3 \ - --model-location my-pytorch-training/model.tar.gz \ + --model-location my-pytorch-training \ + --model-volume-mount-name test-volume \ --s3-bucket-name your-bucket \ --s3-region us-east-1 \ --instance-type ml.g5.8xlarge \ --image-uri 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:latest \ --container-port 8080 - ``` +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--instance-type` | TEXT | Yes | EC2 instance type for inference (must start with "ml.") | +| `--model-name` | TEXT | Yes | Name of model to create on SageMaker (1-63 characters, alphanumeric with hyphens) | +| `--model-source-type` | TEXT | Yes | Model source type ("s3" or "fsx") | +| `--image-uri` | TEXT | Yes | Docker image URI for inference | +| `--container-port` | INTEGER | Yes | Port on which model server listens (1-65535) | +| `--model-volume-mount-name` | TEXT | Yes | Name of the model volume mount | +| `--namespace` | TEXT | No | Kubernetes namespace | +| `--metadata-name` | TEXT | No | Name of the custom endpoint object | +| `--endpoint-name` | TEXT | No | Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) | +| `--env` | OBJECT | No | Environment variables as key-value pairs | +| `--metrics-enabled` | BOOLEAN | No | Enable metrics collection (default: false) | +| `--model-version` | TEXT | No | Version of the model (semantic version format) | +| `--model-location` | TEXT | No | Specific model data location | +| `--prefetch-enabled` | BOOLEAN | No | Whether to pre-fetch model data (default: false) | +| `--tls-certificate-output-s3-uri` | TEXT | No | S3 URI for TLS certificate output | +| `--fsx-dns-name` | TEXT | No | FSx File System DNS Name | +| `--fsx-file-system-id` | TEXT | No | FSx File System ID | +| `--fsx-mount-name` | TEXT | No | FSx File System Mount Name | +| `--s3-bucket-name` | TEXT | No | S3 bucket location | +| `--s3-region` | TEXT | No | S3 bucket region | +| `--model-volume-mount-path` | TEXT | No | Path inside container for model volume (default: "/opt/ml/model") | +| `--resources-limits` | OBJECT | No | Resource limits for the worker | +| `--resources-requests` | OBJECT | No | Resource requests for the worker | +| `--dimensions` | OBJECT | No | CloudWatch Metric dimensions as key-value pairs | +| `--metric-collection-period` | INTEGER | No | Period for CloudWatch query (default: 300) | +| `--metric-collection-start-time` | INTEGER | No | StartTime for CloudWatch query (default: 300) | +| `--metric-name` | TEXT | No | Metric name to query for CloudWatch trigger | +| `--metric-stat` | TEXT | No | Statistics metric for CloudWatch (default: "Average") | +| `--metric-type` | TEXT | No | Type of metric for HPA ("Value" or "Average", default: "Average") | +| `--min-value` | NUMBER | No | Minimum metric value for empty CloudWatch response (default: 0) | +| `--cloud-watch-trigger-name` | TEXT 
| No | Name for the CloudWatch trigger | +| `--cloud-watch-trigger-namespace` | TEXT | No | AWS CloudWatch namespace for the metric | +| `--target-value` | NUMBER | No | Target value for the CloudWatch metric | +| `--use-cached-metrics` | BOOLEAN | No | Enable caching of metric values (default: true) | +| `--invocation-endpoint` | TEXT | No | Invocation endpoint path (default: "invocations") | +| `--debug` | FLAG | No | Enable debug mode (default: false) | + + #### Invoke a Custom Inference Endpoint -``` +```bash hyp invoke hyp-custom-endpoint \ --endpoint-name endpoint-custom-pytorch \ --body '{"inputs":"What is the capital of USA?"}' - ``` -#### Deleting an Endpoint +#### Managing an Endpoint +```bash +hyp list hyp-custom-endpoint +hyp describe hyp-custom-endpoint --name endpoint-custom ``` -hyp delete hyp-jumpstart-endpoint --name endpoint-jumpstart + +#### List Pods + +```bash +hyp list-pods hyp-custom-endpoint ``` +#### Get Logs + +```bash +hyp get-logs hyp-custom-endpoint --pod-name +``` + +#### Get Operator Logs + +```bash +hyp get-operator-logs hyp-custom-endpoint --since-hours 0.5 +``` + +#### Deleting an Endpoint + +```bash +hyp delete hyp-custom-endpoint --name endpoint-custom +``` ## SDK -Along with the CLI, we also have SDKs available that can perform the training and inference functionalities that the CLI performs +Along with the CLI, we also have SDKs available that can perform the cluster management, training and inference functionalities that the CLI performs -### Training SDK +### Cluster Management SDK -#### Creating a Training Job +#### Creating a Cluster Stack + +```python +from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack +# Initialize cluster stack configuration +cluster_stack = HpClusterStack( + stage="prod", + resource_name_prefix="my-hyperpod", + hyperpod_cluster_name="my-hyperpod-cluster", + eks_cluster_name="my-hyperpod-eks", + + # Infrastructure components + create_vpc_stack=True, + create_eks_cluster_stack=True, + create_hyperpod_cluster_stack=True, + + # Network configuration + vpc_cidr="10.192.0.0/16", + availability_zone_ids=["use2-az1", "use2-az2"], + + # Instance group configuration + instance_group_settings=[ + { + "InstanceCount": 1, + "InstanceGroupName": "controller-group", + "InstanceType": "ml.t3.medium", + "TargetAvailabilityZoneId": "use2-az2" + } + ] +) + +# Create the cluster stack +response = cluster_stack.create(region="us-east-2") +``` + +#### Listing Cluster Stacks + +```python +# List all cluster stacks +stacks = HpClusterStack.list(region="us-east-2") +print(f"Found {len(stacks['StackSummaries'])} stacks") +``` + +#### Describing a Cluster Stack + +```python +# Describe a specific cluster stack +stack_info = HpClusterStack.describe("my-stack-name", region="us-east-2") +print(f"Stack status: {stack_info['Stacks'][0]['StackStatus']}") ``` -from sagemaker.hyperpod import HyperPodPytorchJob -from sagemaker.hyperpod.job -import ReplicaSpec, Template, Spec, Container, Resources, RunPolicy, Metadata +#### Monitoring Cluster Status + +```python +from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack + +stack = HpClusterStack() +response = stack.create(region="us-west-2") +status = stack.get_status(region="us-west-2") +print(status) +``` + +### Training SDK + +#### Creating a Training Job + +```python +from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob +from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import ( + ReplicaSpec, Template, 
Spec, Containers, Resources, RunPolicy +) +from sagemaker.hyperpod.common.config.metadata import Metadata # Define job specifications nproc_per_node = "1" # Number of processes per node @@ -274,7 +707,7 @@ replica_specs = ( containers = [ - Container + Containers ( # Container name name="container-name", @@ -315,16 +748,68 @@ pytorch_job = HyperPodPytorchJob replica_specs = replica_specs, # Run policy run_policy = run_policy, - # S3 location for artifacts - output_s3_uri="s3://my-bucket/model-artifacts" ) # Launch the job pytorch_job.create() - - ``` +#### List Training Jobs +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob +import yaml + +# List all PyTorch jobs +jobs = HyperPodPytorchJob.list() +print(yaml.dump(jobs)) +``` + +#### Describe a Training Job +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") + +print(job) +``` + +#### List Pods for a Training Job +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# List Pods for an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") +print(job.list_pods()) +``` + +#### Get Logs from a Pod +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get pod logs for a job +job = HyperPodPytorchJob.get(name="my-pytorch-job") +print(job.get_logs_from_pod("pod-name")) +``` + +#### Get Training Operator Logs +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get training operator logs +job = HyperPodPytorchJob.get(name="my-pytorch-job") +print(job.get_operator_logs(since_hours=0.1)) +``` + +#### Delete a Training Job +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob +# Get an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") + +# Delete the job +job.delete() +``` ### Inference SDK @@ -332,128 +817,219 @@ pytorch_job.create() Pre-trained Jumpstart models can be gotten from https://sagemaker.readthedocs.io/en/v2.82.0/doc_utils/jumpstart.html and fed into the call for creating the endpoint -``` +```python from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint -model = Model( - model_id="deepseek-llm-r1-distill-qwen-1-5b", - model_version="2.0.4" +model=Model( + model_id='deepseek-llm-r1-distill-qwen-1-5b' ) - -server = Server( - instance_type="ml.g5.8xlarge" +server=Server( + instance_type='ml.g5.8xlarge', ) +endpoint_name=SageMakerEndpoint(name='') -endpoint_name = SageMakerEndpoint(name="endpoint-jumpstart") - -tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") - -js_endpoint = HPJumpStartEndpoint( +js_endpoint=HPJumpStartEndpoint( model=model, server=server, - sage_maker_endpoint=endpoint_name, - tls_config=tls_config + sage_maker_endpoint=endpoint_name ) js_endpoint.create() ``` +#### Creating a Custom Inference Endpoint (with S3) -#### Invoke a JumpstartModel Endpoint +```python +from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, AutoScalingSpec, Metrics, S3Storage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint -``` -data = '{"inputs":"What is the capital of USA?"}' -response = js_endpoint.invoke(body=data).body.read() -print(response) -``` +model_source_config = 
ModelSourceConfig( + model_source_type='s3', + model_location="", + s3_storage=S3Storage( + bucket_name='', + region='us-east-2', + ), +) +environment_variables = [ + EnvironmentVariables(name="HF_MODEL_ID", value="/opt/ml/model"), + EnvironmentVariables(name="SAGEMAKER_PROGRAM", value="inference.py"), + EnvironmentVariables(name="SAGEMAKER_SUBMIT_DIRECTORY", value="/opt/ml/model/code"), + EnvironmentVariables(name="MODEL_CACHE_ROOT", value="/opt/ml/model"), + EnvironmentVariables(name="SAGEMAKER_ENV", value="1"), +] -#### Creating a Custom Inference Endpoint +worker = Worker( + image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0', + model_volume_mount=ModelVolumeMount( + name='model-weights', + ), + model_invocation_port=ModelInvocationPort(container_port=8080), + resources=Resources( + requests={"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"}, + limits={"nvidia.com/gpu": 1} + ), + environment_variables=environment_variables, +) -``` -from sagemaker.hyperpod.inference.config.hp_custom_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig, EnvironmentVariables -from sagemaker.hyperpod.inference.hp_custom_endpoint import HPCustomEndpoint +tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://') -model = Model( - model_source_type="s3", - model_location="test-pytorch-job/model.tar.gz", - s3_bucket_name="my-bucket", - s3_region="us-east-2", - prefetch_enabled=True +custom_endpoint = HPEndpoint( + endpoint_name='', + instance_type='ml.g5.8xlarge', + model_name='deepseek15b-test-model-name', + tls_config=tls_config, + model_source_config=model_source_config, + worker=worker, ) -server = Server( - instance_type="ml.g5.8xlarge", - image_uri="763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0", - container_port=8080, - model_volume_mount_name="model-weights" -) +custom_endpoint.create() +``` -resources = { - "requests": {"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"}, - "limits": {"nvidia.com/gpu": 1} -} - -env = EnvironmentVariables( - HF_MODEL_ID="/opt/ml/model", - SAGEMAKER_PROGRAM="inference.py", - SAGEMAKER_SUBMIT_DIRECTORY="/opt/ml/model/code", - MODEL_CACHE_ROOT="/opt/ml/model", - SAGEMAKER_ENV="1" -) -endpoint_name = SageMakerEndpoint(name="endpoint-custom-pytorch") +#### List Endpoints -tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint -custom_endpoint = HPCustomEndpoint( - model=model, - server=server, - resources=resources, - environment=env, - sage_maker_endpoint=endpoint_name, - tls_config=tls_config, -) +# List JumpStart endpoints +jumpstart_endpoints = HPJumpStartEndpoint.list() +print(jumpstart_endpoints) -custom_endpoint.create() +# List custom endpoints +custom_endpoints = HPEndpoint.list() +print(custom_endpoints) ``` -#### Invoke a Custom Inference Endpoint +#### Describe an Endpoint +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# Get JumpStart endpoint details +jumpstart_endpoint = HPJumpStartEndpoint.get(name="js-endpoint-name", namespace="test") +print(jumpstart_endpoint) + +# Get custom endpoint details +custom_endpoint = HPEndpoint.get(name="endpoint-custom") +print(custom_endpoint) ``` 
+
+#### Invoke an Endpoint
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+
data = '{"inputs":"What is the capital of USA?"}'
+jumpstart_endpoint = HPJumpStartEndpoint.get(name="endpoint-jumpstart")
+response = jumpstart_endpoint.invoke(body=data).body.read()
+print(response)
+
+custom_endpoint = HPEndpoint.get(name="endpoint-custom")
response = custom_endpoint.invoke(body=data).body.read()
print(response)
```

-#### Managing an Endpoint
+#### List Pods
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+# List pods
+js_pods = HPJumpStartEndpoint.list_pods()
+print(js_pods)
+
+c_pods = HPEndpoint.list_pods()
+print(c_pods)
```
-endpoint_iterator = HPJumpStartEndpoint.list()
-for endpoint in endpoint_iterator:
-    print(endpoint.name, endpoint.status)
-logs = js_endpoint.get_logs()
-print(logs)
+#### Get Logs
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+
+# Get logs from pod
+js_logs = HPJumpStartEndpoint.get_logs(pod=<pod-name>)
+print(js_logs)
+c_logs = HPEndpoint.get_logs(pod=<pod-name>)
+print(c_logs)
```
-#### Deleting an Endpoint
+#### Get Operator Logs
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+# Get operator logs for the JumpStart endpoint
+print(HPJumpStartEndpoint.get_operator_logs(since_hours=0.1))
+
+# Get operator logs for the custom endpoint
+print(HPEndpoint.get_operator_logs(since_hours=0.1))
```
-js_endpoint.delete()
+#### Delete an Endpoint
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+
+# Delete JumpStart endpoint
+jumpstart_endpoint = HPJumpStartEndpoint.get(name="endpoint-jumpstart")
+jumpstart_endpoint.delete()
+
+# Delete custom endpoint
+custom_endpoint = HPEndpoint.get(name="endpoint-custom")
+custom_endpoint.delete()
```
+
#### Observability - Getting Monitoring Information

-```
-from sagemaker.hyperpod.utils import get_monitoring_config,
+```python
+from sagemaker.hyperpod.observability.utils import get_monitoring_config

monitor_config = get_monitoring_config()
-monitor_config.grafanaURL
-monitor_config.prometheusURL
```
+
+## Examples
+#### Cluster Management Example Notebooks
+
+[CLI Cluster Management Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/cluster_management/cluster_creation_init_experience.ipynb)
+
+[SDK Cluster Management Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/cluster_management/cluster_creation_sdk_experience.ipynb)
+
+#### Training Example Notebooks
+
+[CLI Training Init Experience Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/CLI/training-init-experience.ipynb)
+
+[CLI Training Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/CLI/training-e2e-cli.ipynb)
+
+[SDK Training Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/SDK/training_sdk_example.ipynb)
+
+#### Inference Example Notebooks
+
+##### CLI
+[CLI Inference Jumpstart Model Init Experience
Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/inference/CLI/inference-jumpstart-init-experience.ipynb) + +[CLI Inference JumpStart Model Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb) + +[CLI Inference FSX Model Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb) + +[CLI Inference S3 Model Init Experience Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/inference/CLI/inference-s3-model-init-experience.ipynb) + +[CLI Inference S3 Model Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb) + +##### SDK + +[SDK Inference JumpStart Model Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/inference/SDK/inference-jumpstart-e2e.ipynb) + +[SDK Inference FSX Model Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/inference/SDK/inference-fsx-model-e2e.ipynb) + +[SDK Inference S3 Model Example](https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/inference/SDK/inference-s3-model-e2e.ipynb) + + ## Disclaimer * This CLI and SDK requires access to the user's file system to set and get context and function properly. diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 00000000..c8d71c96 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = python3 -msphinx +SPHINXPROJ = sagemaker +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/doc/_static/custom.css b/doc/_static/custom.css new file mode 100644 index 00000000..c37521b6 --- /dev/null +++ b/doc/_static/custom.css @@ -0,0 +1,184 @@ +/* Custom styles for SageMaker HyperPod documentation */ + +/* Adjust logo size and alignment */ +.navbar-brand img { + max-height: 40px; + width: auto; + margin-right: 10px; + vertical-align: middle; +} + +.navbar-brand .title { + font-weight: 800; + color: #111827; +} + +/* Ensure logo container doesn't force wrapping */ +.navbar-brand-box { + width: auto; + flex-shrink: 0; +} + +/* Header styling */ +header { + background-color: white; + + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); + position: sticky; + top: 0; + z-index: 50; +} + +h1 { + font-size: 1.875rem; + font-weight: 700; + color: #111827; +} + +h2 { + font-size: 1.5rem; + font-weight: 700; + color: #111827; +} + +h3 { + font-size: 1.25rem; + font-weight: 500; + color: #111827; +} + +p { + font-size: 1.0rem; + color: #4b5563; +} + +html[data-theme="dark"] .navbar-brand .title { + color: #f8fafc !important; +} + +html[data-theme="dark"] p { + color: #d1d5db !important; +} + +.current.active>a { + background-color: aliceblue !important; +} + +.bd-sidebar-primary li.has-children .caption, +.bd-sidebar-primary li.has-children>.reference { + margin-right: inherit; +} + +nav.bd-links li>a { + margin-right: inherit; +} + +.table tbody tr:hover { + background: none !important; +} + +.wy-table-responsive table td, +.wy-table-responsive table th { + white-space: normal; +} + +.wy-table-responsive { + margin-bottom: 24px; + max-width: 100%; + overflow: visible; +} + +.pagination { + display: inline-block; +} + +.pagination a { + color: black; + float: left; + padding: 8px 16px; + text-decoration: none; +} + +.pagination a.active { + background-color: #2a80b9; + color: white; +} + +.pagination a:hover:not(.active) { + background-color: #ddd; +} + + +dl.py.class.dt.sig.sig-object.py { + overflow: auto; + margin: 6px 0; + font-size: 90%; + line-height: normal; + background: #e7f2fa !important; + color: #2980b9 !important; + border-top: 3px solid #6ab0de !important; + padding: 6px; + position: relative; +} + +.bd-article { + overflow: auto; +} + +.sig-prename.descclassname { + color: #000; +} + +.field-list { + display: grid !important; + grid-template-columns: 0.5fr 2fr !important; +} + +.field-list dt { + background: transparent !important; + word-break: normal !important; +} + +.py.class dl { + margin: 1rem 0 !important; +} + +.page-toc.tocsection.onthispage svg { + margin-right: 0.5rem; +} + +.sidebar-secondary-items { + display: block !important; + padding: 0.5rem 0 !important; +} + +.table { + border-radius: 4px !important; + border: 1px solid #e1e5e9 !important; + border-collapse: separate !important; + border-spacing: 0 !important; + overflow: hidden !important; +} + +.table tbody tr { + background: none !important; +} + +.table tbody tr:hover { + background: none !important; +} + +.table td, +.table th { + border: none !important; + border-bottom: 1px solid #e1e5e9 !important; +} + +.table tr:last-child td { + border-bottom: none !important; +} + +.bd-toc code { + background: transparent !important; + border: none; +} \ No newline at end of file diff --git a/doc/_static/image.png b/doc/_static/image.png new file mode 100644 index 00000000..c90c4cd2 Binary files /dev/null and b/doc/_static/image.png differ diff --git a/doc/_static/image_dark.png 
b/doc/_static/image_dark.png new file mode 100644 index 00000000..ebcadd94 Binary files /dev/null and b/doc/_static/image_dark.png differ diff --git a/doc/_static/image_light.svg b/doc/_static/image_light.svg new file mode 100644 index 00000000..2aed204d --- /dev/null +++ b/doc/_static/image_light.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/_static/search_accessories.css b/doc/_static/search_accessories.css new file mode 100644 index 00000000..c7e09e1f --- /dev/null +++ b/doc/_static/search_accessories.css @@ -0,0 +1,29 @@ +.example-badge { + background-color: #c63340; + color: white; + padding: 0.25rem 0.5rem; + text-align: center; + border-radius: 5px; + font-size: 0.8rem; + display: inline-block; +} + +.aws-doc-badge { + background-color: #e18b50; + color: white; + padding: 0.25rem 0.5rem; + text-align: center; + border-radius: 5px; + font-size: 0.8rem; + display: inline-block; +} + +.sdk-doc-badge { + background-color: #4c968f; + color: white; + padding: 0.25rem 0.5rem; + text-align: center; + border-radius: 5px; + font-size: 0.8rem; + display: inline-block; +} \ No newline at end of file diff --git a/doc/advanced_resources.md b/doc/advanced_resources.md new file mode 100644 index 00000000..d3e2cc2c --- /dev/null +++ b/doc/advanced_resources.md @@ -0,0 +1,54 @@ +(advanced_resources)= + +# Advanced Resources + +```{toctree} +:hidden: +:maxdepth: 2 + +examples +AWS SageMaker HyperPod Docs +HyperPod Developer Guide +SageMaker HyperPod Workshop + +``` + +## Advanced Resources + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} Github +:link: examples +:link-type: ref +:class-card: sd-border-secondary + +**Example Notebooks** - Ready-to-use implementation guides +::: + +:::{grid-item-card} AWS SageMaker HyperPod Docs +:link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html +:link-type: url +:class-card: sd-border-secondary + +**HyperPod Documentation** - Know more about HyperPod +::: + +:::{grid-item-card} HyperPod Developer Guide +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url +:class-card: sd-border-secondary + +**Developer Guide** - Refer to this practical development guide +::: + +:::{grid-item-card} SageMaker HyperPod Workshop +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url +:class-card: sd-border-secondary + +**Practical Guide** - Refer to the workshop for detailed follow-through steps +::: + + +:::: diff --git a/doc/api/metadata.rst b/doc/api/metadata.rst new file mode 100644 index 00000000..6ae5472d --- /dev/null +++ b/doc/api/metadata.rst @@ -0,0 +1,7 @@ +Metadata +------------ + +.. automodule:: sagemaker.hyperpod.common.config.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/cli/cli_index.rst b/doc/cli/cli_index.rst new file mode 100644 index 00000000..3d3885a3 --- /dev/null +++ b/doc/cli/cli_index.rst @@ -0,0 +1,38 @@ +CLI Reference +============= + +Complete reference for the SageMaker HyperPod Command Line Interface. + +.. toctree:: + :hidden: + :maxdepth: 2 + + cluster_management/cli_cluster_management + training/cli_training + inference/cli_inference + +.. container:: + + .. grid:: 1 1 3 3 + :gutter: 3 + + .. grid-item-card:: Cluster Management CLI + :link: cluster_management/cli_cluster_management + :link-type: doc + :class-card: sd-border-secondary + + Cluster stack management commands, options and parameters. + + .. 
grid-item-card:: Training CLI + :link: training/cli_training + :link-type: doc + :class-card: sd-border-secondary + + Training CLI commands, options and parameters. + + .. grid-item-card:: Inference CLI + :link: inference/cli_inference + :link-type: doc + :class-card: sd-border-secondary + + Inference CLI commands, options and parameters. \ No newline at end of file diff --git a/doc/cli/cli_reference.md b/doc/cli/cli_reference.md new file mode 100644 index 00000000..6ae3af58 --- /dev/null +++ b/doc/cli/cli_reference.md @@ -0,0 +1,45 @@ +(cli_reference)= + +# CLI Reference + +```{toctree} +:hidden: +:maxdepth: 2 + +cli_training +cli_inference +cli_cluster_management +``` + +Complete reference for the SageMaker HyperPod Command Line Interface. + +::::{container} +::::{grid} 1 1 3 3 +:gutter: 3 + +:::{grid-item-card} Training CLI +:link: cli_training +:link-type: ref +:class-card: sd-border-secondary + +Training CLI commands, options and parameters. +::: + +:::{grid-item-card} Inference CLI +:link: cli_inference +:link-type: ref +:class-card: sd-border-secondary + +Inference CLI commands, options and parameters. +::: + +:::{grid-item-card} Cluster Management CLI +:link: cli_cluster_management +:link-type: ref +:class-card: sd-border-secondary + +Cluster stack management commands, options and parameters. +::: + +:::: +:::: \ No newline at end of file diff --git a/doc/cli/cluster_management/cli_cluster_management.md b/doc/cli/cluster_management/cli_cluster_management.md new file mode 100644 index 00000000..dcf3fc8a --- /dev/null +++ b/doc/cli/cluster_management/cli_cluster_management.md @@ -0,0 +1,429 @@ +(cli_cluster_management)= + +# Cluster Management + +Complete reference for SageMaker HyperPod cluster management parameters and configuration options. + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. +``` + +* [Initialize Configuration](#hyp-init) +* [Create Cluster Stack](#hyp-create) +* [Update Cluster](#hyp-update-cluster) +* [List Cluster Stacks](#hyp-list-cluster-stack) +* [Describe Cluster Stack](#hyp-describe-cluster-stack) +* [List HyperPod Clusters](#hyp-list-cluster) +* [Set Cluster Context](#hyp-set-cluster-context) +* [Get Cluster Context](#hyp-get-cluster-context) +* [Get Monitoring](#hyp-get-monitoring) + +* [Configure Parameters](#hyp-configure) +* [Validate Configuration](#hyp-validate) +* [Reset Configuration](#hyp-reset) + +## hyp init + +Initialize a template scaffold in the current directory. + +#### Syntax + +```bash +hyp init TEMPLATE [DIRECTORY] [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `TEMPLATE` | CHOICE | Yes | Template type (cluster-stack, hyp-pytorch-job, hyp-custom-endpoint, hyp-jumpstart-endpoint) | +| `DIRECTORY` | PATH | No | Target directory (default: current directory) | +| `--version` | TEXT | No | Schema version to use | + +```{important} +The `resource_name_prefix` parameter in the generated `config.yaml` file serves as the primary identifier for all AWS resources created during deployment. Each deployment must use a unique resource name prefix to avoid conflicts. This prefix is automatically appended with a unique identifier during cluster creation to ensure resource uniqueness. 
+ +**Cluster stack names must be unique within each AWS region.** If you attempt to create a cluster stack with a name that already exists in the same region, the deployment will fail. +``` + +## hyp create + +Create a new HyperPod cluster stack using the provided configuration. + +#### Syntax + +```bash +hyp create [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--region` | TEXT | No | AWS region where the cluster stack will be created | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp update cluster + +Update an existing HyperPod cluster configuration. + +```{important} +**Runtime vs Configuration Commands**: This command modifies an **existing, deployed cluster's** runtime settings (instance groups, node recovery). This is different from `hyp configure`, which only modifies local configuration files before cluster creation. +``` + +#### Syntax + +```bash +hyp update cluster [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--cluster-name` | TEXT | Yes | Name of the cluster to update | +| `--instance-groups` | TEXT | No | JSON string of instance group configurations | +| `--instance-groups-to-delete` | TEXT | No | JSON string of instance groups to delete | +| `--region` | TEXT | No | AWS region of the cluster | +| `--node-recovery` | TEXT | No | Node recovery setting (Automatic or None) | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp list cluster-stack + +List all HyperPod cluster stacks (CloudFormation stacks). + +#### Syntax + +```bash +hyp list cluster-stack [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--region` | TEXT | No | AWS region to list stacks from | +| `--status` | TEXT | No | Filter by stack status. Format: "['CREATE_COMPLETE', 'UPDATE_COMPLETE']" | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp describe cluster-stack + +Describe a specific HyperPod cluster stack. + +```{note} +**Region-Specific Stack Names**: Cluster stack names are unique within each AWS region. When describing a stack, ensure you specify the correct region where the stack was created, or the command will fail to find the stack. +``` + +#### Syntax + +```bash +hyp describe cluster-stack STACK-NAME [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `STACK-NAME` | TEXT | Yes | Name of the CloudFormation stack to describe | +| `--region` | TEXT | No | AWS region of the stack | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp list-cluster + +List SageMaker HyperPod clusters with capacity information. + +#### Syntax + +```bash +hyp list-cluster [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--region` | TEXT | No | AWS region to list clusters from | +| `--output` | TEXT | No | Output format ("table" or "json", default: "json") | +| `--clusters` | TEXT | No | Comma-separated list of specific cluster names | +| `--namespace` | TEXT | No | Namespace to check capacity for (can be used multiple times) | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp set-cluster-context + +Connect to a HyperPod EKS cluster and set kubectl context. 
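+
+For example, connecting to a cluster and pointing kubectl at a specific team namespace might look like the sketch below (the cluster name and namespace are placeholders); the full syntax and parameters follow.
+
+```bash
+# Update the local kubeconfig so kubectl targets the chosen HyperPod EKS cluster
+hyp set-cluster-context --cluster-name my-hyperpod-cluster --namespace ml-team
+```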
+ +#### Syntax + +```bash +hyp set-cluster-context [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--cluster-name` | TEXT | Yes | Name of the HyperPod cluster to connect to | +| `--region` | TEXT | No | AWS region of the cluster | +| `--namespace` | TEXT | No | Kubernetes namespace to connect to | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp get-cluster-context + +Get context information for the currently connected cluster. + +#### Syntax + +```bash +hyp get-cluster-context [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--debug` | FLAG | No | Enable debug logging | + +## hyp get-monitoring + +Get monitoring configurations for the HyperPod cluster. + +#### Syntax + +```bash +hyp get-monitoring [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--grafana` | FLAG | No | Return Grafana dashboard URL | +| `--prometheus` | FLAG | No | Return Prometheus workspace URL | +| `--list` | FLAG | No | Return list of available metrics | + +## hyp configure + +Configure cluster parameters interactively or via command line. + +```{important} +**Pre-Deployment Configuration**: This command modifies local `config.yaml` files **before** cluster creation. For updating **existing, deployed clusters**, use `hyp update cluster` instead. +``` + +#### Syntax + +```bash +hyp configure [OPTIONS] +``` + +#### Parameters + +This command dynamically supports all configuration parameters available in the current template's schema. Common parameters include: + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--resource-name-prefix` | TEXT | No | Prefix for all AWS resources | +| `--create-hyperpod-cluster-stack` | BOOLEAN | No | Create HyperPod Cluster Stack | +| `--hyperpod-cluster-name` | TEXT | No | Name of SageMaker HyperPod Cluster | +| `--create-eks-cluster-stack` | BOOLEAN | No | Create EKS Cluster Stack | +| `--kubernetes-version` | TEXT | No | Kubernetes version | +| `--eks-cluster-name` | TEXT | No | Name of the EKS cluster | +| `--create-helm-chart-stack` | BOOLEAN | No | Create Helm Chart Stack | +| `--namespace` | TEXT | No | Namespace to deploy HyperPod Helm chart | +| `--node-provisioning-mode` | TEXT | No | Continuous provisioning mode | +| `--node-recovery` | TEXT | No | Node recovery setting ("Automatic" or "None") | +| `--create-vpc-stack` | BOOLEAN | No | Create VPC Stack | +| `--vpc-id` | TEXT | No | Existing VPC ID | +| `--vpc-cidr` | TEXT | No | VPC CIDR block | +| `--create-security-group-stack` | BOOLEAN | No | Create Security Group Stack | +| `--enable-hp-inference-feature` | BOOLEAN | No | Enable inference operator | +| `--stage` | TEXT | No | Deployment stage ("gamma" or "prod") | +| `--create-fsx-stack` | BOOLEAN | No | Create FSx Stack | +| `--storage-capacity` | INTEGER | No | FSx storage capacity in GiB | +| `--tags` | JSON | No | Resource tags as JSON object | + +**Note:** The exact parameters available depend on your current template type and version. Run `hyp configure --help` to see all available options for your specific configuration. + +## hyp validate + +Validate the current directory's configuration file syntax and structure. + +#### Syntax + +```bash +hyp validate +``` + +#### Parameters + +No parameters required. 
+ +```{note} +This command performs **syntactic validation only** of the `config.yaml` file against the appropriate schema. It checks: + +- **YAML syntax**: Ensures file is valid YAML +- **Required fields**: Verifies all mandatory fields are present +- **Data types**: Confirms field values match expected types (string, number, boolean, array) +- **Schema structure**: Validates against the template's defined structure + +This command performs syntactic validation only and does **not** verify the actual validity of values (e.g., whether AWS regions exist, instance types are available, or resources can be created). + +**Prerequisites** + +- Must be run in a directory where `hyp init` has created configuration files +- A `config.yaml` file must exist in the current directory + +**Output** + +- **Success**: Displays confirmation message if syntax is valid +- **Errors**: Lists specific syntax errors with field names and descriptions +``` + + +#### Syntax + +```bash +# Validate current configuration syntax +hyp validate + +# Example output on success +✔️ config.yaml is valid! + +# Example output with syntax errors +❌ Config validation errors: + – kubernetes_version: Field is required + – vpc_cidr: Expected string, got number +``` + +## hyp reset + +Reset the current directory's config.yaml to default values. + +#### Syntax + +```bash +hyp reset +``` + +#### Parameters + +No parameters required. + + + +## Parameter Reference + +### Common Parameters Across Commands + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `--region` | TEXT | AWS region | Current AWS profile region | +| `--help` | FLAG | Show command help | - | +| `--verbose` | FLAG | Enable verbose output | false | + +### Configuration File Parameters + +The `config.yaml` file supports the following parameters: + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `resource_name_prefix` | TEXT | Prefix for all AWS resources (4-digit UUID added during submission) | "hyp-eks-stack" | +| `create_hyperpod_cluster_stack` | BOOLEAN | Create HyperPod Cluster Stack | true | +| `hyperpod_cluster_name` | TEXT | Name of SageMaker HyperPod Cluster | "hyperpod-cluster" | +| `create_eks_cluster_stack` | BOOLEAN | Create EKS Cluster Stack | true | +| `kubernetes_version` | TEXT | Kubernetes version | "1.31" | +| `eks_cluster_name` | TEXT | Name of the EKS cluster | "eks-cluster" | +| `create_helm_chart_stack` | BOOLEAN | Create Helm Chart Stack | true | +| `namespace` | TEXT | Namespace to deploy HyperPod Helm chart | "kube-system" | +| `helm_repo_url` | TEXT | URL of Helm repo containing HyperPod Helm chart | "https://github.com/aws/sagemaker-hyperpod-cli.git" | +| `helm_repo_path` | TEXT | Path to HyperPod Helm chart in repo | "helm_chart/HyperPodHelmChart" | +| `helm_operators` | TEXT | Configuration of HyperPod Helm chart | "mlflow.enabled=true,trainingOperators.enabled=true,..." 
| +| `helm_release` | TEXT | Name for Helm chart release | "dependencies" | +| `node_provisioning_mode` | TEXT | Continuous provisioning mode ("Continuous" or empty) | "Continuous" | +| `node_recovery` | TEXT | Automatic node recovery ("Automatic" or "None") | "Automatic" | +| `instance_group_settings` | ARRAY | List of instance group configurations | [Default controller group] | +| `rig_settings` | ARRAY | Restricted instance group configurations | null | +| `rig_s3_bucket_name` | TEXT | S3 bucket for RIG resources | null | +| `tags` | ARRAY | Custom tags for SageMaker HyperPod cluster | null | +| `create_vpc_stack` | BOOLEAN | Create VPC Stack | true | +| `vpc_id` | TEXT | Existing VPC ID (if not creating new) | null | +| `vpc_cidr` | TEXT | IP range for VPC | "10.192.0.0/16" | +| `availability_zone_ids` | ARRAY | List of AZs to deploy subnets | null | +| `create_security_group_stack` | BOOLEAN | Create Security Group Stack | true | +| `security_group_id` | TEXT | Existing security group ID | null | +| `security_group_ids` | ARRAY | Security groups for HyperPod cluster | null | +| `private_subnet_ids` | ARRAY | Private subnet IDs for HyperPod cluster | null | +| `eks_private_subnet_ids` | ARRAY | Private subnet IDs for EKS cluster | null | +| `nat_gateway_ids` | ARRAY | NAT Gateway IDs for internet routing | null | +| `private_route_table_ids` | ARRAY | Private route table IDs | null | +| `create_s3_endpoint_stack` | BOOLEAN | Create S3 Endpoint stack | true | +| `enable_hp_inference_feature` | BOOLEAN | Enable inference operator | false | +| `stage` | TEXT | Deployment stage ("gamma" or "prod") | "prod" | +| `custom_bucket_name` | TEXT | Custom S3 bucket name for templates | "" | +| `create_life_cycle_script_stack` | BOOLEAN | Create Life Cycle Script Stack | true | +| `create_s3_bucket_stack` | BOOLEAN | Create S3 Bucket Stack | true | +| `s3_bucket_name` | TEXT | S3 bucket for cluster lifecycle scripts | "s3-bucket" | +| `github_raw_url` | TEXT | Raw GitHub URL for lifecycle script | "https://raw.githubusercontent.com/aws-samples/..." | +| `on_create_path` | TEXT | File name of lifecycle script | "sagemaker-hyperpod-eks-bucket" | +| `create_sagemaker_iam_role_stack` | BOOLEAN | Create SageMaker IAM Role Stack | true | +| `sagemaker_iam_role_name` | TEXT | IAM role name for SageMaker cluster creation | "create-cluster-role" | +| `create_fsx_stack` | BOOLEAN | Create FSx Stack | true | +| `fsx_subnet_id` | TEXT | Subnet ID for FSx creation | "" | +| `fsx_availability_zone_id` | TEXT | Availability zone for FSx subnet | "" | +| `per_unit_storage_throughput` | INTEGER | Per unit storage throughput | 250 | +| `data_compression_type` | TEXT | Data compression type ("NONE" or "LZ4") | "NONE" | +| `file_system_type_version` | FLOAT | File system type version | 2.15 | +| `storage_capacity` | INTEGER | Storage capacity in GiB | 1200 | +| `fsx_file_system_id` | TEXT | Existing FSx file system ID | "" | + +**Note:** The actual available configuration parameters depend on the specific template schema version. Use `hyp init cluster-stack` to see all available parameters for your version. 
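+
+As a rough illustration of how these file parameters map onto `hyp configure` flags, the sketch below overrides a handful of them from the command line (all values are placeholders; the exact set of flags depends on your template type and version):
+
+```bash
+hyp configure \
+  --resource-name-prefix my-team-hyperpod \
+  --kubernetes-version 1.31 \
+  --vpc-cidr 10.192.0.0/16 \
+  --node-recovery Automatic \
+  --storage-capacity 1200
+```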
+ +## Examples + +### Basic Cluster Stack Creation + +```bash +# Start with a clean directory +mkdir my-hyperpod-cluster +cd my-hyperpod-cluster + +# Initialize cluster configuration +hyp init cluster-stack + +# Configure basic parameters +hyp configure --resource-name-prefix my-cluster --stage prod + +# Validate configuration +hyp validate + +# Create cluster stack +hyp create --region us-west-2 +``` + +### Update Existing Cluster + +```bash +# Update instance groups +hyp update cluster \ + --cluster-name my-cluster \ + --instance-groups '[{"InstanceCount":2,"InstanceGroupName":"worker-nodes","InstanceType":"ml.m5.large"}]' \ + --region us-west-2 +``` + +### List and Describe + +```bash +# List all cluster stacks +hyp list cluster-stack --region us-west-2 + +# Describe specific cluster stack +hyp describe cluster-stack my-stack-name --region us-west-2 + +# List HyperPod clusters with capacity info +hyp list-cluster --region us-west-2 --output table + +# Connect to cluster +hyp set-cluster-context --cluster-name my-cluster --region us-west-2 + +# Get current context +hyp get-cluster-context +``` \ No newline at end of file diff --git a/doc/cli/cluster_management/cli_cluster_management_autogen.rst b/doc/cli/cluster_management/cli_cluster_management_autogen.rst new file mode 100644 index 00000000..c6dee4e0 --- /dev/null +++ b/doc/cli/cluster_management/cli_cluster_management_autogen.rst @@ -0,0 +1,16 @@ +.. Just kept as placeholder for autodoc gen, this file is not referenced in the actual docs. + +.. Cluster Management +.. ======================================== + +.. .. .. click:: sagemaker.hyperpod.cli.commands.cluster_stack:create_cluster_stack +.. .. :prog: hyp create cluster-stack + +.. .. click:: sagemaker.hyperpod.cli.commands.cluster_stack:describe_cluster_stack +.. :prog: hyp describe cluster-stack + +.. .. click:: sagemaker.hyperpod.cli.commands.cluster_stack:list_cluster_stacks +.. :prog: hyp list cluster-stack + +.. .. click:: sagemaker.hyperpod.cli.commands.cluster_stack:update_cluster +.. :prog: hyp update cluster \ No newline at end of file diff --git a/doc/cli/inference/cli_inference.md b/doc/cli/inference/cli_inference.md new file mode 100644 index 00000000..5460d62c --- /dev/null +++ b/doc/cli/inference/cli_inference.md @@ -0,0 +1,358 @@ +(cli_inference)= + +# Inference + +Complete reference for SageMaker HyperPod inference parameters and configuration options. + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. 
+``` + +* [Create JumpStart Endpoint](#hyp-create-hyp-jumpstart-endpoint) +* [Create Custom Endpoint](#hyp-create-hyp-custom-endpoint) + +* [List JumpStart Endpoints](#hyp-list-hyp-jumpstart-endpoint) +* [List Custom Endpoints](#hyp-list-hyp-custom-endpoint) +* [Describe JumpStart Endpoint](#hyp-describe-hyp-jumpstart-endpoint) +* [Describe Custom Endpoint](#hyp-describe-hyp-custom-endpoint) +* [Invoke JumpStart Endpoint](#hyp-invoke-hyp-jumpstart-endpoint) +* [Invoke Custom Endpoint](#hyp-invoke-hyp-custom-endpoint) +* [Delete JumpStart Endpoint](#hyp-delete-hyp-jumpstart-endpoint) +* [Delete Custom Endpoint](#hyp-delete-hyp-custom-endpoint) + +* [List JumpStart Pods](#hyp-list-pods-hyp-jumpstart-endpoint) +* [List Custom Pods](#hyp-list-pods-hyp-custom-endpoint) +* [Get JumpStart Logs](#hyp-get-logs-hyp-jumpstart-endpoint) +* [Get Custom Logs](#hyp-get-logs-hyp-custom-endpoint) +* [Get JumpStart Operator Logs](#hyp-get-operator-logs-hyp-jumpstart-endpoint) +* [Get Custom Operator Logs](#hyp-get-operator-logs-hyp-custom-endpoint) + + + +## hyp create hyp-jumpstart-endpoint + +Deploy pre-trained models from SageMaker JumpStart. + +#### Syntax + +```bash +hyp create hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--model-id` | TEXT | Yes | JumpStart model identifier (1-63 characters, alphanumeric with hyphens) | +| `--instance-type` | TEXT | Yes | EC2 instance type for inference (must start with "ml.") | +| `--namespace` | TEXT | No | Kubernetes namespace | +| `--metadata-name` | TEXT | No | Name of the jumpstart endpoint object | +| `--accept-eula` | BOOLEAN | No | Whether model terms of use have been accepted (default: false) | +| `--model-version` | TEXT | No | Semantic version of the model (e.g., "1.0.0", 5-14 characters) | +| `--endpoint-name` | TEXT | No | Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) | +| `--tls-certificate-output-s3-uri` | TEXT | No | S3 URI to write the TLS certificate (optional) | +| `--debug` | FLAG | No | Enable debug mode (default: false) | + + +### hyp create hyp-custom-endpoint + +Deploy custom models with your own inference code. 
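+
+As a quick orientation, a minimal invocation for an S3-hosted model might look like the sketch below (endpoint name, model name, image URI, and bucket are illustrative placeholders); the full syntax and parameter reference follow.
+
+```bash
+hyp create hyp-custom-endpoint \
+  --endpoint-name my-custom-endpoint \
+  --model-name my-model \
+  --model-source-type s3 \
+  --s3-bucket-name my-model-bucket \
+  --s3-region us-east-2 \
+  --instance-type ml.g5.8xlarge \
+  --image-uri <inference-image-uri> \
+  --container-port 8080 \
+  --model-volume-mount-name model-weights
+```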
+ +#### Syntax + +```bash +hyp create hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--instance-type` | TEXT | Yes | EC2 instance type for inference (must start with "ml.") | +| `--model-name` | TEXT | Yes | Name of model to create on SageMaker (1-63 characters, alphanumeric with hyphens) | +| `--model-source-type` | TEXT | Yes | Model source type ("s3" or "fsx") | +| `--image-uri` | TEXT | Yes | Docker image URI for inference | +| `--container-port` | INTEGER | Yes | Port on which model server listens (1-65535) | +| `--model-volume-mount-name` | TEXT | Yes | Name of the model volume mount | +| `--namespace` | TEXT | No | Kubernetes namespace | +| `--metadata-name` | TEXT | No | Name of the custom endpoint object | +| `--endpoint-name` | TEXT | No | Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) | +| `--env` | OBJECT | No | Environment variables as key-value pairs | +| `--metrics-enabled` | BOOLEAN | No | Enable metrics collection (default: false) | +| `--model-version` | TEXT | No | Version of the model (semantic version format) | +| `--model-location` | TEXT | No | Specific model data location | +| `--prefetch-enabled` | BOOLEAN | No | Whether to pre-fetch model data (default: false) | +| `--tls-certificate-output-s3-uri` | TEXT | No | S3 URI for TLS certificate output | +| `--fsx-dns-name` | TEXT | No | FSx File System DNS Name | +| `--fsx-file-system-id` | TEXT | No | FSx File System ID | +| `--fsx-mount-name` | TEXT | No | FSx File System Mount Name | +| `--s3-bucket-name` | TEXT | No | S3 bucket location | +| `--s3-region` | TEXT | No | S3 bucket region | +| `--model-volume-mount-path` | TEXT | No | Path inside container for model volume (default: "/opt/ml/model") | +| `--resources-limits` | OBJECT | No | Resource limits for the worker | +| `--resources-requests` | OBJECT | No | Resource requests for the worker | +| `--dimensions` | OBJECT | No | CloudWatch Metric dimensions as key-value pairs | +| `--metric-collection-period` | INTEGER | No | Period for CloudWatch query (default: 300) | +| `--metric-collection-start-time` | INTEGER | No | StartTime for CloudWatch query (default: 300) | +| `--metric-name` | TEXT | No | Metric name to query for CloudWatch trigger | +| `--metric-stat` | TEXT | No | Statistics metric for CloudWatch (default: "Average") | +| `--metric-type` | TEXT | No | Type of metric for HPA ("Value" or "Average", default: "Average") | +| `--min-value` | NUMBER | No | Minimum metric value for empty CloudWatch response (default: 0) | +| `--cloud-watch-trigger-name` | TEXT | No | Name for the CloudWatch trigger | +| `--cloud-watch-trigger-namespace` | TEXT | No | AWS CloudWatch namespace for the metric | +| `--target-value` | NUMBER | No | Target value for the CloudWatch metric | +| `--use-cached-metrics` | BOOLEAN | No | Enable caching of metric values (default: true) | +| `--invocation-endpoint` | TEXT | No | Invocation endpoint path (default: "invocations") | +| `--debug` | FLAG | No | Enable debug mode (default: false) | + + +## Inference Endpoint Management Commands + +Commands for managing inference endpoints. + +### hyp list hyp-jumpstart-endpoint + +List JumpStart model endpoints. 
+ +#### Syntax + +```bash +hyp list hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace` | TEXT | No | Namespace to list endpoints from (default: "default") | + +### hyp list hyp-custom-endpoint + +List custom model endpoints. + +#### Syntax + +```bash +hyp list hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace` | TEXT | No | Namespace to list endpoints from (default: "default") | + +### hyp describe hyp-jumpstart-endpoint + +Describe a JumpStart model endpoint. + +#### Syntax + +```bash +hyp describe hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--name` | TEXT | Yes | Name of the endpoint to describe | +| `--namespace` | TEXT | No | Namespace of the endpoint (default: "default") | +| `--full` | FLAG | No | Display full JSON output | + +### hyp describe hyp-custom-endpoint + +Describe a custom model endpoint. + +#### Syntax + +```bash +hyp describe hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--name` | TEXT | Yes | Name of the endpoint to describe | +| `--namespace` | TEXT | No | Namespace of the endpoint (default: "default") | +| `--full` | FLAG | No | Display full JSON output | + +### hyp invoke hyp-jumpstart-endpoint + +Invoke a JumpStart model endpoint. + +#### Syntax + +```bash +hyp invoke hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--endpoint-name` | TEXT | Yes | Name of the endpoint to invoke | +| `--body` | TEXT | Yes | Request body (JSON format) | +| `--content-type` | TEXT | No | Content type of the request (default: "application/json") | + +### hyp invoke hyp-custom-endpoint + +Invoke a custom model endpoint. + +#### Syntax + +```bash +hyp invoke hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--endpoint-name` | TEXT | Yes | Name of the endpoint to invoke | +| `--body` | TEXT | Yes | Request body (JSON format) | +| `--content-type` | TEXT | No | Content type of the request (default: "application/json") | + +### hyp delete hyp-jumpstart-endpoint + +Delete a JumpStart model endpoint. + +#### Syntax + +```bash +hyp delete hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--name` | TEXT | Yes | Name of the endpoint to delete | +| `--namespace` | TEXT | No | Namespace of the endpoint (default: "default") | + +### hyp delete hyp-custom-endpoint + +Delete a custom model endpoint. + +#### Syntax + +```bash +hyp delete hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--name` | TEXT | Yes | Name of the endpoint to delete | +| `--namespace` | TEXT | No | Namespace of the endpoint (default: "default") | + +### hyp list-pods hyp-jumpstart-endpoint + +List pods for JumpStart endpoints. 
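+
+A common debugging flow starts here: list the endpoint's pods first, then pull logs from the pod you are interested in with `hyp get-logs` (described below). For example, with a placeholder pod name:
+
+```bash
+hyp list-pods hyp-jumpstart-endpoint --namespace default
+hyp get-logs hyp-jumpstart-endpoint --pod-name <pod-name> --namespace default
+```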
+ +#### Syntax + +```bash +hyp list-pods hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace` | TEXT | No | Namespace to list pods from (default: "default") | + +### hyp list-pods hyp-custom-endpoint + +List pods for custom endpoints. + +#### Syntax + +```bash +hyp list-pods hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace` | TEXT | No | Namespace to list pods from (default: "default") | + +### hyp get-logs hyp-jumpstart-endpoint + +Get logs from JumpStart endpoint pods. + +#### Syntax + +```bash +hyp get-logs hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--pod-name` | TEXT | Yes | Name of the pod to get logs from | +| `--container` | TEXT | No | Container name to get logs from | +| `--namespace` | TEXT | No | Namespace of the pod (default: "default") | + +### hyp get-logs hyp-custom-endpoint + +Get logs from custom endpoint pods. + +#### Syntax + +```bash +hyp get-logs hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--pod-name` | TEXT | Yes | Name of the pod to get logs from | +| `--container` | TEXT | No | Container name to get logs from | +| `--namespace` | TEXT | No | Namespace of the pod (default: "default") | + +### hyp get-operator-logs hyp-jumpstart-endpoint + +Get operator logs for JumpStart endpoints. + +#### Syntax + +```bash +hyp get-operator-logs hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--since-hours` | FLOAT | Yes | Time frame to get logs for (in hours) | + +### hyp get-operator-logs hyp-custom-endpoint + +Get operator logs for custom endpoints. + +#### Syntax + +```bash +hyp get-operator-logs hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--since-hours` | FLOAT | Yes | Time frame to get logs for (in hours) | + +## Parameter Reference + +### Common Parameters Across Commands + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `--namespace` | TEXT | Kubernetes namespace | Current context | +| `--help` | FLAG | Show command help | - | diff --git a/doc/cli/training/cli_training.md b/doc/cli/training/cli_training.md new file mode 100644 index 00000000..dc89d221 --- /dev/null +++ b/doc/cli/training/cli_training.md @@ -0,0 +1,182 @@ +(cli_training)= + + +# Training + +Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. +``` + +* [Create PyTorch Job](#hyp-create-hyp-pytorch-job) +* [List Jobs](#hyp-list-hyp-pytorch-job) +* [Describe Job](#hyp-describe-hyp-pytorch-job) +* [Delete Job](#hyp-delete-hyp-pytorch-job) +* [List Pods](#hyp-list-pods-hyp-pytorch-job) +* [Get Logs](#hyp-get-logs-hyp-pytorch-job) + + +## hyp create hyp-pytorch-job + +Create distributed PyTorch training jobs on SageMaker HyperPod clusters. 
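+
+As a quick orientation, a minimal two-node job submission might look like the sketch below (job name, image, and instance type are illustrative placeholders); the full syntax and parameter reference follow.
+
+```bash
+hyp create hyp-pytorch-job \
+  --job-name demo-pytorch-job \
+  --image <training-image-uri> \
+  --instance-type ml.g5.8xlarge \
+  --node-count 2 \
+  --tasks-per-node 1
+```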
+ +### Syntax + +```bash +hyp create hyp-pytorch-job [OPTIONS] +``` + +### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Unique name for the training job (1-63 characters, alphanumeric with hyphens) | +| `--image` | TEXT | Yes | Docker image URI containing your training code | +| `--namespace` | TEXT | No | Kubernetes namespace | +| `--command` | ARRAY | No | Command to run in the container (array of strings) | +| `--args` | ARRAY | No | Arguments for the entry script (array of strings) | +| `--environment` | OBJECT | No | Environment variables as key-value pairs | +| `--pull-policy` | TEXT | No | Image pull policy (Always, Never, IfNotPresent) | +| `--instance-type` | TEXT | No | Instance type for training | +| `--node-count` | INTEGER | No | Number of nodes (minimum: 1) | +| `--tasks-per-node` | INTEGER | No | Number of tasks per node (minimum: 1) | +| `--label-selector` | OBJECT | No | Node label selector as key-value pairs | +| `--deep-health-check-passed-nodes-only` | BOOLEAN | No | Schedule pods only on nodes that passed deep health check (default: false) | +| `--scheduler-type` | TEXT | No | Scheduler type | +| `--queue-name` | TEXT | No | Queue name for job scheduling (1-63 characters, alphanumeric with hyphens) | +| `--priority` | TEXT | No | Priority class for job scheduling | +| `--max-retry` | INTEGER | No | Maximum number of job retries (minimum: 0) | +| `--volume` | ARRAY | No | List of volume configurations (Refer [Volume Configuration](#volume-configuration) for detailed parameter info) | +| `--service-account-name` | TEXT | No | Service account name | +| `--accelerators` | INTEGER | No | Number of accelerators a.k.a GPUs or Trainium Chips | +| `--vcpu` | FLOAT | No | Number of vCPUs | +| `--memory` | FLOAT | No | Amount of memory in GiB | +| `--accelerators-limit` | INTEGER | No | Limit for the number of accelerators a.k.a GPUs or Trainium Chips | +| `--vcpu-limit` | FLOAT | No | Limit for the number of vCPUs | +| `--memory-limit` | FLOAT | No | Limit for the amount of memory in GiB | +| `--preferred-topology` | TEXT | No | Preferred topology annotation for scheduling | +| `--required-topology` | TEXT | No | Required topology annotation for scheduling | +| `--debug` | FLAG | No | Enable debug mode (default: false) | + +### Volume Configuration + +The `--volume` parameter supports mounting different types of storage to your training containers. + +### Volume Syntax + +```bash +--volume name=,type=,mount_path=[,additional_options] +``` + +### Volume Types + +**hostPath Volume** +```bash +--volume name=model-data,type=hostPath,mount_path=/data,path=/host/data +``` + +**Persistent Volume Claim (PVC)** +```bash +--volume name=training-output,type=pvc,mount_path=/output,claim_name=training-pvc,read_only=false +``` + +### Volume Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `name` | TEXT | Yes | Volume name | +| `type` | TEXT | Yes | Volume type (`hostPath` or `pvc`) | +| `mount_path` | TEXT | Yes | Mount path in container | +| `path` | TEXT | For hostPath | Host path for hostPath volumes | +| `claim_name` | TEXT | For pvc | PVC claim name for pvc volumes | +| `read_only` | BOOLEAN | No | Read-only flag for pvc volumes | + +## Training Job Management Commands + +Commands for managing PyTorch training jobs. + +### hyp list hyp-pytorch-job + +List all HyperPod PyTorch jobs in a namespace. 
+ +#### Syntax + +```bash +hyp list hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace, -n` | TEXT | No | Namespace to list jobs from (default: "default") | + +### hyp describe hyp-pytorch-job + +Describe a specific HyperPod PyTorch job. + +#### Syntax + +```bash +hyp describe hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Name of the job to describe | +| `--namespace, -n` | TEXT | No | Namespace of the job (default: "default") | + +### hyp delete hyp-pytorch-job + +Delete a HyperPod PyTorch job. + +#### Syntax + +```bash +hyp delete hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Name of the job to delete | +| `--namespace, -n` | TEXT | No | Namespace of the job (default: "default") | + +### hyp list-pods hyp-pytorch-job + +List all pods associated with a PyTorch job. + +#### Syntax + +```bash +hyp list-pods hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Name of the job to list pods for | +| `--namespace, -n` | TEXT | No | Namespace of the job (default: "default") | + +### hyp get-logs hyp-pytorch-job + +Get logs from a specific pod in a PyTorch job. + +#### Syntax + +```bash +hyp get-logs hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Name of the job | +| `--pod-name` | TEXT | Yes | Name of the pod to get logs from | +| `--namespace, -n` | TEXT | No | Namespace of the job (default: "default") | diff --git a/doc/conf.py b/doc/conf.py index 68bf9c75..3bcc39e0 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,48 +1,64 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. """Sphinx configuration.""" import datetime import os import shutil +import sys +import re +import json +from pathlib import Path +from typing import Dict, List, Any, Optional, ClassVar +# Mock kubernetes.config before adding source path to prevent import errors +from unittest.mock import MagicMock +import types +kubernetes_config = types.ModuleType('kubernetes.config') +kubernetes_config.KUBE_CONFIG_DEFAULT_LOCATION = "~/.kube/config" +sys.modules['kubernetes.config'] = kubernetes_config -def run_apidoc(app): - """Generate doc stubs using sphinx-apidoc.""" - module_dir = os.path.join(app.srcdir, "../src/") - output_dir = os.path.join(app.srcdir, "_apidoc") - excludes = [] +# Add the source directory to Python path +sys.path.insert(0, str(Path(__file__).parent.parent / 'src')) - # Ensure that any stale apidoc files are cleaned up first. 
- if os.path.exists(output_dir): - shutil.rmtree(output_dir) - - cmd = [ - "--separate", - "--module-first", - "--doc-project=API Reference", - "-o", - output_dir, - module_dir, - ] - cmd.extend(excludes) +# Get version from setup.py +def get_version(): try: - from sphinx.ext import apidoc # Sphinx >= 1.7 - - apidoc.main(cmd) - except ImportError: - from sphinx import apidoc # Sphinx < 1.7 - - cmd.insert(0, apidoc.__file__) - apidoc.main(cmd) - - -def setup(app): - """Register our sphinx-apidoc hook.""" - app.connect("builder-inited", run_apidoc) + # Find the project root directory (where setup.py is located) + project_root = Path(__file__).parent.parent + setup_py_path = project_root / "setup.py" + + # Read setup.py content + with open(setup_py_path, "r") as f: + setup_py_content = f.read() + + # Extract version using regex + version_match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', setup_py_content) + if version_match: + return version_match.group(1) + else: + print("Warning: Could not find version in setup.py") + return "unknown" + except Exception as e: + print(f"Warning: Could not extract version from setup.py: {e}") + return "unknown" # Sphinx configuration below. project = "SageMaker HyperPod CLI" +version = get_version() +release = version # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {"python": ("http://docs.python.org/", None)} @@ -53,16 +69,124 @@ def setup(app): "sphinx.ext.napoleon", "sphinx.ext.todo", "sphinx.ext.viewcode", + "nbsphinx", + "myst_nb", + "sphinx_design", + "sphinx_tabs.tabs", + "sphinx_copybutton", + "sphinx.ext.autosummary", + "sphinx.ext.autosectionlabel", + "sphinx_design", + "sphinx_click" ] -source_suffix = ".rst" -master_doc = "index" + +autodoc_mock_imports = ["pyspark", "feature_store_pyspark", "py4j", "boto3", "botocore", "kubernetes", "yaml", "sagemaker_core"] + +source_suffix = { + '.rst': 'restructuredtext', + '.ipynb': 'myst-nb', + '.md': 'myst-nb', +} autoclass_content = "class" +autodoc_class_signature = "mixed" +autodoc_default_options = { + "members": True, + "undoc-members": False, + "private-members": False, + "special-members": False, + "show-inheritance": False, +} + +# Don't document class attributes automatically +autodoc_typehints_format = "short" +autodoc_preserve_defaults = True autodoc_member_order = "bysource" default_role = "py:obj" -html_theme = "haiku" -htmlhelp_basename = "{}doc".format(project) +html_theme = "sphinx_book_theme" +html_theme_options = { + "logo": { + "text": "SageMaker HyperPod
CLI and SDK", + "image_light": "_static/image.png", + "image_dark": "_static/image.png", + }, + "repository_url": "https://github.com/aws/sagemaker-hyperpod-cli", + "use_repository_button": True, + "use_issues_button": True, + "use_edit_page_button": True, + "path_to_docs": "doc", + "show_navbar_depth": 2, + "use_fullscreen_button": False, + "use_download_button": False, + "home_page_in_toc": True, + "secondary_sidebar_items": ["edit-this-page", "page-toc"], + "toc_title": "Table of contents", + "show_toc_level": 3, +} + +author = "Amazon Web Services" +copyright = f"{datetime.datetime.now().year}, Amazon Web Services" +htmlhelp_basename = "{}doc".format(project) +html_static_path = ["_static"] +html_css_files = ["custom.css", + "search_accessories.css", + ] napoleon_use_rtype = False +napoleon_use_param = False +napoleon_include_init_with_doc = False +napoleon_use_ivar = True +napoleon_parameter_style = "table" +napoleon_type_aliases = None +napoleon_custom_sections = [('Parameters', 'params_style')] + +viewcode_line_numbers = True + +# nbsphinx configuration +nbsphinx_allow_errors = True +nbsphinx_kernel_name = 'python3' + +# MyST-NB configuration +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "html_image", + "html_admonition", + # "linkify", # Commented out until linkify-it-py is installed + "replacements", + "smartquotes", + "substitution", + "tasklist", + "attrs_inline", +] +myst_heading_anchors = 3 +nb_execution_mode = "off" + +# Make version available to MyST templates +myst_substitutions = { + "version": version, +} + +# Automatically extract typehints when specified and place them in +# descriptions of the relevant function/method. +autodoc_typehints = "signature" + +# Clean documentation without Pydantic boilerplate +# Hide constructor signature and parameters +autodoc_class_signature = "separated" +autodoc_member_order = "bysource" + +def setup(app): + pass + + +# autosummary +autosummary_generate = True +autosummary_ignore_module_all = False + +# autosectionlabel +autosectionlabel_prefix_document = True \ No newline at end of file diff --git a/doc/examples.md b/doc/examples.md new file mode 100644 index 00000000..ff5252b0 --- /dev/null +++ b/doc/examples.md @@ -0,0 +1,73 @@ +(examples)= + +# Example Notebooks + +## Cluster Management Example Notebooks + +For detailed examples of cluster management with HyperPod, see: + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} CLI Cluster Management Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/cluster_management/cluster_creation_init_experience.ipynb +:class-card: sd-border-primary + +**Cluster Management Examples** Refer the Cluster Management CLI Example. +::: + +:::{grid-item-card} SDK Cluster Management Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/cluster_management/cluster_creation_sdk_experience.ipynb +:class-card: sd-border-primary + +**Cluster Management Examples** Refer the Cluster Management SDK Example. +::: + +:::: + +## Training Example Notebooks + +For detailed examples of training with HyperPod, see: + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} CLI Training Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/CLI/training-e2e-cli.ipynb +:class-card: sd-border-primary + +**Training Examples** Refer the Training Example. 
+::: + +:::{grid-item-card} SDK Training Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/SDK/training_sdk_example.ipynb +:class-card: sd-border-primary + +**Training Examples** Refer the Training SDK Example. +::: + +:::: + + +## Inference Example Notebooks + +For detailed examples of inference with HyperPod, see: + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} CLI Inference Examples +- CLI Inference JumpStart Model Example +- CLI Inference FSX Model Example +- CLI Inference S3 Model Example + +::: + +:::{grid-item-card} SDK Inference Example +- SDK Inference JumpStart Model Example +- SDK Inference FSX Model Example +- SDK Inference S3 Model Example + +::: + +:::: \ No newline at end of file diff --git a/doc/getting_started.md b/doc/getting_started.md new file mode 100644 index 00000000..718ab168 --- /dev/null +++ b/doc/getting_started.md @@ -0,0 +1,96 @@ +(getting_started)= + +# Getting Started + +```{toctree} +:hidden: +:maxdepth: 1 + +Cluster Management +Training +Inference + +``` + +This guide will help you get started with the SageMaker HyperPod CLI and SDK to perform basic operations. + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. +``` + +## List Available Clusters + +List all available SageMaker HyperPod clusters in your account: + +`````{tab-set} +````{tab-item} CLI +```bash +hyp list-cluster [--region ] +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod import list_clusters + +list_clusters(region='aws-region') + +``` +```` +````` + +## Connect to a Cluster + +Configure your local kubectl environment to interact with a specific SageMaker HyperPod cluster and namespace: + +`````{tab-set} +````{tab-item} CLI +```bash +hyp set-cluster-context --cluster-name +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod import set_cluster_context + +set_cluster_context('') + +``` +```` +````` + +## Get Current Cluster Context + +View information about the currently configured cluster context: + +`````{tab-set} +````{tab-item} CLI +```bash +hyp get-cluster-context +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod import get_cluster_context + +get_cluster_context() +``` +```` +````` + + +## Next Steps + +After setting up your environment and connecting to a cluster, you can: + +- Create and manage PyTorch training jobs +- Deploy and manage inference endpoints +- Monitor cluster resources and job performance + +For more detailed information on specific commands, use the `--help` flag: + +```bash +hyp --help +``` \ No newline at end of file diff --git a/doc/getting_started/cluster_management.rst b/doc/getting_started/cluster_management.rst new file mode 100644 index 00000000..cf873689 --- /dev/null +++ b/doc/getting_started/cluster_management.rst @@ -0,0 +1,239 @@ +Cluster Management +=============================================== + +This guide will help you create and manage your first HyperPod cluster using the CLI. + +Prerequisites +------------- + +Before you begin, ensure you have: + +- An AWS account with appropriate permissions for SageMaker HyperPod +- AWS CLI configured with your credentials +- HyperPod CLI installed (``pip install sagemaker-hyperpod``) + +.. 
note:: + **Region Configuration**: For commands that accept the ``--region`` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. + + **Cluster stack names must be unique within each AWS region.** If you attempt to create a cluster stack with a name that already exists in the same region, the deployment will fail. + +Creating Your First Cluster +---------------------------- + +1. Start with a Clean Directory +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It's recommended to start with a new and clean directory for each cluster configuration: + +.. code-block:: bash + + mkdir my-hyperpod-cluster + cd my-hyperpod-cluster + +2. Initialize a New Cluster Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp init cluster-stack + +This creates three files: + +- ``config.yaml``: The main configuration file you'll use to customize your cluster +- ``cfn_params.jinja``: A reference template for CloudFormation parameters +- ``README.md``: Usage guide with instructions and examples + +.. important:: + The ``resource_name_prefix`` parameter in the generated ``config.yaml`` file serves as the primary identifier for all AWS resources created during deployment. Each deployment must use a unique resource name prefix to avoid conflicts. This prefix is automatically appended with a unique identifier during cluster creation to ensure resource uniqueness. + +3. Configure Your Cluster +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can configure your cluster in two ways: + +**Option 1: Edit config.yaml directly** + +The config.yaml file contains key parameters like: + +.. code-block:: yaml + + template: cluster-stack + namespace: kube-system + stage: gamma + resource_name_prefix: sagemaker-hyperpod-eks + +**Option 2: Use CLI/SDK commands (Pre-Deployment)** + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp configure --resource-name-prefix your-resource-prefix + +.. note:: + The ``hyp configure`` command only modifies local configuration files. It does not affect existing deployed clusters. + +4. Create the Cluster +~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + **Cluster Stack Name Uniqueness**: Cluster stack names must be unique within each AWS region. Ensure your ``resource_name_prefix`` in ``config.yaml`` generates a unique stack name for the target region to avoid deployment conflicts. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp create --region your-region + +This will: + +- Validate your configuration +- Create a timestamped folder in the ``run`` directory +- Initialize the cluster creation process + +5. Monitor Your Cluster +~~~~~~~~~~~~~~~~~~~~~~~ + +Check the status of your cluster: + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp describe cluster-stack your-cluster-name --region your-region + + .. tab-item:: SDK + + .. code-block:: python + + from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack + + # Describe a specific cluster stack + response = HpClusterStack.describe("your-cluster-name", region="your-region") + print(f"Stack Status: {response['Stacks'][0]['StackStatus']}") + print(f"Stack Name: {response['Stacks'][0]['StackName']}") + +.. note:: + **Region-Specific Stack Names**: Cluster stack names are unique within each AWS region. When describing a stack, ensure you specify the correct region where the stack was created, or the command will fail to find the stack. 
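+
+Stack creation can take several minutes. A simple way to keep an eye on progress is to re-run the describe command periodically (or wrap it in ``watch``, if available on your system) until the stack reaches ``CREATE_COMPLETE``:
+
+.. code-block:: bash
+
+   # Illustrative only: refresh the stack description every 60 seconds
+   watch -n 60 "hyp describe cluster-stack your-cluster-name --region your-region"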
+ + +List all clusters: + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp list cluster-stack --region your-region + + .. tab-item:: SDK + + .. code-block:: python + + from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack + + # List all CloudFormation stacks (including cluster stacks) + stacks = HpClusterStack.list(region="your-region") + for stack in stacks['StackSummaries']: + print(f"Stack: {stack['StackName']}, Status: {stack['StackStatus']}") + + +Common Operations +----------------- + +Update a Cluster +~~~~~~~~~~~~~~~~~ + +.. important:: + **Runtime vs Configuration Commands**: + + - ``hyp update cluster`` modifies **existing, deployed clusters** (runtime settings like instance groups, node recovery) + - ``hyp configure`` modifies local ``config.yaml`` files **before** cluster creation + + Use the appropriate command based on whether your cluster is already deployed or not. + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp update cluster \ + --cluster-name your-cluster-name \ + --instance-groups "[]" \ + --region your-region + +Reset Configuration +~~~~~~~~~~~~~~~~~~~ + +.. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp reset + + +Best Practices +-------------- + +- Always validate your configuration before submission: + + .. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp validate + + .. note:: + This command performs **syntactic validation only** of the ``config.yaml`` file against the appropriate schema. It checks: + + - **YAML syntax**: Ensures file is valid YAML + - **Required fields**: Verifies all mandatory fields are present + - **Data types**: Confirms field values match expected types (string, number, boolean, array) + - **Schema structure**: Validates against the template's defined structure + + This command performs syntactic validation only and does **not** verify the actual validity of values (e.g., whether AWS regions exist, instance types are available, or resources can be created). + +- Use meaningful resource prefixes to easily identify your clusters +- Monitor cluster status regularly after creation +- Keep your configuration files in version control for reproducibility + +Next Steps +---------- + +After creating your cluster, you can: + +- Connect to your cluster: + + .. tab-set:: + + .. tab-item:: CLI + + .. code-block:: bash + + hyp set-cluster-context --cluster-name your-cluster-name + +- Start training jobs with PyTorch +- Deploy inference endpoints +- Monitor cluster resources and performance + +For more detailed information on specific commands, use the ``--help`` flag: + +.. code-block:: bash + + hyp --help \ No newline at end of file diff --git a/doc/getting_started/inference.md b/doc/getting_started/inference.md new file mode 100644 index 00000000..9b53139c --- /dev/null +++ b/doc/getting_started/inference.md @@ -0,0 +1,378 @@ +(inference)= + +# Inference with SageMaker HyperPod + +SageMaker HyperPod provides powerful capabilities for deploying and managing inference endpoints on EKS-hosted clusters. This guide covers how to create, invoke, and manage inference endpoints using both the HyperPod CLI and SDK. 
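+
+Before creating endpoints, make sure your local kubectl context points at the HyperPod cluster you want to deploy to, for example:
+
+```bash
+hyp set-cluster-context --cluster-name <cluster-name>
+```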
+ +## Overview + +SageMaker HyperPod inference endpoints allow you to: + +- Deploy pre-trained JumpStart models +- Deploy custom models with your own inference code +- Configure resource requirements for inference +- Manage endpoint lifecycle +- Invoke endpoints for real-time predictions +- Monitor endpoint performance + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. +``` + +## Creating Inference Endpoints + +You can create inference endpoints using either JumpStart models or custom models: + +### JumpStart Model Endpoints + +`````{tab-set} +````{tab-item} CLI +```bash +hyp create hyp-jumpstart-endpoint \ + --model-id jumpstart-model-id \ + --instance-type ml.g5.8xlarge \ + --endpoint-name endpoint-jumpstart +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint + +model = Model( + model_id="deepseek-llm-r1-distill-qwen-1-5b" +) + +server = Server( + instance_type="ml.g5.8xlarge" +) + +endpoint_name = SageMakerEndpoint(name="endpoint-jumpstart") + +js_endpoint = HPJumpStartEndpoint( + model=model, + server=server, + sage_maker_endpoint=endpoint_name +) + +js_endpoint.create() +``` +```` +````` + +### Custom Model Endpoints + +`````{tab-set} +````{tab-item} CLI +```bash +hyp create hyp-custom-endpoint \ + --version 1.0 \ + --endpoint-name endpoint-s3 \ + --model-name \ + --model-source-type s3 \ + --instance-type \ + --image-uri \ + --container-port 8080 \ + --model-volume-mount-name model-weights +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.config.hp_custom_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig, EnvironmentVariables +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +model = Model( + model_source_type="s3", + model_location="test-pytorch-job", + s3_bucket_name="my-bucket", + s3_region="us-east-2", + prefetch_enabled=True +) + +server = Server( + instance_type="ml.g5.8xlarge", + image_uri="763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0", + container_port=8080, + model_volume_mount_name="model-weights" +) + +resources = { + "requests": {"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"}, + "limits": {"nvidia.com/gpu": 1} +} + +env = EnvironmentVariables( + HF_MODEL_ID="/opt/ml/model", + SAGEMAKER_PROGRAM="inference.py", + SAGEMAKER_SUBMIT_DIRECTORY="/opt/ml/model/code", + MODEL_CACHE_ROOT="/opt/ml/model", + SAGEMAKER_ENV="1" +) + +endpoint_name = SageMakerEndpoint(name="endpoint-custom-pytorch") + +tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") + +custom_endpoint = HPEndpoint( + model=model, + server=server, + resources=resources, + environment=env, + sage_maker_endpoint=endpoint_name, + tls_config=tls_config, +) + +custom_endpoint.create() +``` +```` +````` + +### Key Parameters + +When creating an inference endpoint, you'll need to specify: + +1. 
**Parameters required for Jumpstart Endpoint** + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| **endpoint-name** | TEXT | Yes | Unique identifier for your endpoint | +| **instance-type** | TEXT | Yes | The EC2 instance type to use | +| **model-id** | TEXT | Yes | ID of the pre-trained JumpStart model | + +2. **Parameters required for Custom Endpoint** + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| **endpoint-name** | TEXT | Yes | Unique identifier for your endpoint | +| **instance-type** | TEXT | Yes | The EC2 instance type to use | +| **image-uri** | TEXT | Yes | Docker image containing your inference code | +| **model-name** | TEXT | Yes | Name of model to create on SageMaker | +| **model-source-type** | TEXT | Yes | Source type: fsx or s3 | +| **model-volume-mount-name** | TEXT | Yes | Name of the model volume mount | +| **container-port** | INTEGER | Yes | Port on which the model server listens | + +## Managing Inference Endpoints + +### List Endpoints + +`````{tab-set} +````{tab-item} CLI +```bash +# List JumpStart endpoints +hyp list hyp-jumpstart-endpoint + +# List custom endpoints +hyp list hyp-custom-endpoint +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# List JumpStart endpoints +jumpstart_endpoints = HPJumpStartEndpoint.list() +print(jumpstart_endpoints) + +# List custom endpoints +custom_endpoints = HPEndpoint.list() +print(custom_endpoints) +``` +```` +````` + +### Describe an Endpoint + +`````{tab-set} +````{tab-item} CLI +```bash +# Describe JumpStart endpoint +hyp describe hyp-jumpstart-endpoint --name + +# Describe custom endpoint +hyp describe hyp-custom-endpoint --name +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# Get JumpStart endpoint details +jumpstart_endpoint = HPJumpStartEndpoint.get(name="js-endpoint-name", namespace="test") +print(jumpstart_endpoint) + +# Get custom endpoint details +custom_endpoint = HPEndpoint.get(name="endpoint-custom") +print(custom_endpoint) + +``` +```` +````` + +### Invoke an Endpoint + +`````{tab-set} +````{tab-item} CLI +```bash +# Invoke Jumpstart endpoint +hyp invoke hyp-jumpstart-endpoint \ + --endpoint-name \ + --body '{"inputs":"What is the capital of USA?"}' + +# Invoke custom endpoint +hyp invoke hyp-custom-endpoint \ + --endpoint-name \ + --body '{"inputs": "What is machine learning?"}' +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +data = '{"inputs":"What is the capital of USA?"}' +jumpstart_endpoint = HPJumpStartEndpoint.get(name="endpoint-jumpstart") +response = jumpstart_endpoint.invoke(body=data).body.read() +print(response) + +custom_endpoint = HPEndpoint.get(name="endpoint-custom") +response = custom_endpoint.invoke(body=data).body.read() +print(response) +``` +```` +````` + +### List Pods + +`````{tab-set} +````{tab-item} CLI +```bash +# JumpStart endpoint +hyp list-pods hyp-jumpstart-endpoint + +# Custom endpoint +hyp list-pods hyp-custom-endpoint +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint 
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+
+# List pods
+js_pods = HPJumpStartEndpoint.list_pods()
+print(js_pods)
+
+c_pods = HPEndpoint.list_pods()
+print(c_pods)
+```
+````
+`````
+
+### Get Logs
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# JumpStart endpoint
+hyp get-logs hyp-jumpstart-endpoint --pod-name <pod-name>
+
+# Custom endpoint
+hyp get-logs hyp-custom-endpoint --pod-name <pod-name>
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+
+# Get logs from a pod (replace <pod-name> with a pod returned by list_pods)
+js_logs = HPJumpStartEndpoint.get_logs(pod="<pod-name>")
+print(js_logs)
+
+c_logs = HPEndpoint.get_logs(pod="<pod-name>")
+print(c_logs)
+```
+````
+`````
+
+### Get Operator Logs
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# JumpStart endpoint
+hyp get-operator-logs hyp-jumpstart-endpoint --since-hours 0.5
+
+# Custom endpoint
+hyp get-operator-logs hyp-custom-endpoint --since-hours 0.5
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+
+# Operator logs for JumpStart endpoints
+print(HPJumpStartEndpoint.get_operator_logs(since_hours=0.1))
+
+# Operator logs for custom endpoints
+print(HPEndpoint.get_operator_logs(since_hours=0.1))
+```
+````
+`````
+
+### Delete an Endpoint
+
+`````{tab-set}
+````{tab-item} CLI
+```bash
+# Delete JumpStart endpoint
+hyp delete hyp-jumpstart-endpoint --name <endpoint-name>
+
+# Delete custom endpoint
+hyp delete hyp-custom-endpoint --name <endpoint-name>
+```
+````
+
+````{tab-item} SDK
+```python
+from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
+from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint
+
+# Delete JumpStart endpoint
+jumpstart_endpoint = HPJumpStartEndpoint.get(name="endpoint-jumpstart")
+jumpstart_endpoint.delete()
+
+# Delete custom endpoint
+custom_endpoint = HPEndpoint.get(name="endpoint-custom")
+custom_endpoint.delete()
+```
+````
+`````
+
+## Inference Example Notebooks
+
+For detailed examples of inference with HyperPod, explore these interactive Jupyter notebooks:
+
+CLI Examples:
+- CLI Inference FSX Model Example
+- CLI Inference JumpStart Model Example
+- CLI Inference S3 Model Example
+
+SDK Examples:
+- SDK Inference FSX Model Example
+- SDK Inference JumpStart Model Example
+- SDK Inference S3 Model Example
+
+These Jupyter notebooks demonstrate comprehensive workflows for deploying and managing inference endpoints using different model storage options and both CLI and SDK approaches. You can run these notebooks directly
+in your local environment or SageMaker Studio.
diff --git a/doc/getting_started/training.md b/doc/getting_started/training.md
new file mode 100644
index 00000000..cd26cf46
--- /dev/null
+++ b/doc/getting_started/training.md
@@ -0,0 +1,222 @@
+---
+keywords:
+  - distributed
+  - kubernetes
+  - pytorch
+  - containerized
+  - orchestration
+---
+
+(training)=
+
+# Training with SageMaker HyperPod
+
+SageMaker HyperPod provides powerful capabilities for running distributed training workloads on EKS-orchestrated clusters. This guide covers how to create and manage training jobs using both the HyperPod CLI and SDK.
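+
+Before going command by command, the short sketch below shows how the SDK pieces covered in this guide fit together. It is a minimal sketch, assuming a job named `my-pytorch-job` has already been created (see "Creating Training Jobs" below) and that your kubeconfig already points at the HyperPod cluster; the job and pod names are placeholders.
+
+```python
+from sagemaker.hyperpod.training import HyperPodPytorchJob
+
+# Look up an existing job by name (assumed to have been created already).
+job = HyperPodPytorchJob.get(name="my-pytorch-job")
+
+# Inspect the job and the pods backing it.
+print(job)
+print(job.list_pods())
+
+# Pull logs from one of those pods (replace "pod-name" with a real pod name).
+print(job.get_logs_from_pod("pod-name"))
+```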
+ +## Overview + +SageMaker HyperPod training jobs allow you to: + +- Run distributed PyTorch training workloads +- Specify custom Docker images with your training code +- Configure resource requirements (instance types, GPUs) +- Set up node selection with label selectors +- Manage job scheduling and priorities +- Mount volumes and persistent volume claims + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. +``` + +## Creating Training Jobs + +You can create training jobs using either the CLI or SDK approach: + +`````{tab-set} +````{tab-item} CLI +```bash +hyp create hyp-pytorch-job \ + --job-name test-pytorch-job \ + --image pytorch/pytorch:latest \ +``` +```` +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import ( + HyperPodPytorchJob, + Containers, + ReplicaSpec, + Resources, + RunPolicy, + Spec, + Template, +) +from sagemaker.hyperpod.common.config import Metadata + + +nproc_per_node="1" +replica_specs=[ + ReplicaSpec( + name="pod", + template=Template( + spec=Spec( + containers=[ + Containers( + name="container-name", + image="448049793756.dkr.ecr.us-west-2.amazonaws.com/ptjob:mnist", + image_pull_policy="Always", + resources=Resources( + requests={"nvidia.com/gpu": "0"}, + limits={"nvidia.com/gpu": "0"}, + ), + # command=[] + ) + ] + ) + ), + ) +] +run_policy=RunPolicy(clean_pod_policy="None") + +pytorch_job = HyperPodPytorchJob( + metadata=Metadata(name="demo"), + nproc_per_node="1", + replica_specs=replica_specs, + run_policy=run_policy, +) + +pytorch_job.create() +``` +```` +````` + +### Key Parameters + +When creating a training job, you'll need to specify: + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| **job-name** | TEXT | Yes | Unique identifier for your training job | +| **image** | TEXT | Yes | Docker image containing your training environment | +| **accelerators** | INTEGER | No | Number of accelerators a.k.a GPUs or Trainium Chips | +| **vcpu** | FLOAT | No | Number of vCPUs | +| **memory** | FLOAT | No | Amount of memory in GiB | +| **accelerators-limit** | INTEGER | No | Limit for the number of accelerators a.k.a GPUs or Trainium Chips | +| **vcpu-limit** | FLOAT | No | Limit for the number of vCPUs | +| **memory-limit** | FLOAT | No | Limit for the amount of memory in GiB | +| **preferred-topology** | TEXT | No | Preferred topology annotation for scheduling | +| **required-topology** | TEXT | No | Required topology annotation for scheduling | +| **debug** | FLAG | No | Enable debug mode | + + +## Managing Training Jobs + +### List Training Jobs + +`````{tab-set} +````{tab-item} CLI +```bash +hyp list hyp-pytorch-job +``` +```` +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob +import yaml + +# List all PyTorch jobs +jobs = HyperPodPytorchJob.list() +print(yaml.dump(jobs)) +``` +```` +````` + +### Describe a Training Job + +`````{tab-set} +````{tab-item} CLI +```bash +hyp describe hyp-pytorch-job --job-name +``` +```` +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") + +print(job) +``` +```` +````` + +### List Pods for a Training Job + +`````{tab-set} +````{tab-item} CLI +```bash +hyp list-pods hyp-pytorch-job --job-name +``` +```` + +````{tab-item} SDK +```python +from 
sagemaker.hyperpod.training import HyperPodPytorchJob + +# List Pods for an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") +print(job.list_pods()) +``` +```` +````` + +### Get Logs from a Pod + +`````{tab-set} +````{tab-item} CLI +```bash +hyp get-logs hyp-pytorch-job --pod-name test-pytorch-job-cli-pod-0 --job-name test-pytorch-job-cli +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get pod logs for a job +job = HyperPodPytorchJob.get(name="my-pytorch-job") +print(job.get_logs_from_pod("pod-name")) +``` +```` +````` + +### Delete a Training Job + +`````{tab-set} +````{tab-item} CLI +```bash +hyp delete hyp-pytorch-job --job-name +``` +```` +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") + +# Delete the job +job.delete() +``` +```` +````` + +## Training Example Notebooks + +For detailed examples of training with HyperPod, see: + +- CLI Training Example +- SDK Training Example + +These examples demonstrate end-to-end workflows for creating and managing training jobs using both the CLI and SDK approaches. diff --git a/doc/index.md b/doc/index.md new file mode 100644 index 00000000..39e697c6 --- /dev/null +++ b/doc/index.md @@ -0,0 +1,135 @@ +--- +keywords: + - distributed + - kubernetes + - pytorch + - monitoring + - jumpstart +--- + +(hpcli_docs_mainpage)= + +# Overview + +```{toctree} +:hidden: +:maxdepth: 1 + +Installation +Getting Started +CLI Reference +SDK Reference +Advanced Resources +``` + +Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and SDK. These tools handle infrastructure management complexities, allowing you to focus on model development and innovation. Whether it's scaling your PyTorch training jobs across thousands of GPUs, deploying production-grade inference endpoints or managing multiple clusters efficiently; the intuitive command-line interface and programmatic control enable you to: +- Accelerate development cycles and reduce operational overhead +- Automate ML workflows while maintaining operational visibility +- Optimize computing resources across your AI/ML projects + + +```{note} +Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI and SDK v3.0.0. +``` + + +```{admonition} What's New +:class: important + +🚀 We are excited to announce general availability of Amazon SageMaker HyperPod CLI and SDK! + + +**Major Updates**: +- **Distributed Training**: Scale PyTorch jobs across multiple nodes and GPUs with simplified management and automatic fault tolerance. +- **Model Inference**: Deploy pre-trained models from SageMaker JumpStart and host custom auto-scaling inference endpoints. +- **Observability**: Connect to and manage multiple HyperPod clusters with enhanced monitoring capabilities. +- **Usability Improvements**: Intuitive CLI for quick experimentation and cluster management, granular SDK control over workload configurations and easy access to system logs and observability dashboards for efficient debugging + +``` + +## Quick Start + + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} Installation +:link: installation +:link-type: ref +:class-card: sd-border-primary + +**New to HyperPod?** Install the CLI/ SDK in minutes. 
+::: + +:::{grid-item-card} Getting Started +:link: getting_started +:link-type: ref +:class-card: sd-border-secondary + +**Ready to explore?** Connect to your cluster before running ML workflows. +::: + +:::{grid-item-card} Training +:link: training +:link-type: ref +:class-card: sd-border-secondary + +**Scale Your ML Models!** Get started with training +::: + +:::{grid-item-card} Inference +:link: inference +:link-type: ref +:class-card: sd-border-secondary + +**Deploy Your ML Model!** Get started with inference +::: + +:::: + +## Advanced Resources + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} API reference +:link: sdk/sdk_index.html +:class-card: sd-border-primary + +**Explore APIs** - Checkout API Documentation +::: + +:::{grid-item-card} Github +:link: examples +:link-type: ref +:class-card: sd-border-secondary + +**Example Notebooks** - Ready-to-use implementation guides +::: + +:::{grid-item-card} AWS SageMaker HyperPod Docs +:link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html +:link-type: url +:class-card: sd-border-secondary + +**HyperPod Documentation** - Know more about HyperPod +::: + +:::{grid-item-card} HyperPod Developer Guide +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url +:class-card: sd-border-secondary + +**Developer Guide** - Refer to this practical development guide +::: + +:::{grid-item-card} SageMaker HyperPod Workshop +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url +:class-card: sd-border-secondary + +**Practical Guide** - Refer to the workshop for detailed follow-through steps +::: + + +:::: diff --git a/doc/index.rst b/doc/index.rst deleted file mode 100644 index 0f5525de..00000000 --- a/doc/index.rst +++ /dev/null @@ -1,16 +0,0 @@ -HyperpodCLI -======================= - -Please replace this text with a short description of your package. - -.. toctree:: - - _apidoc/modules - - -Indices and tables -__________________ - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/doc/installation.md b/doc/installation.md new file mode 100644 index 00000000..2b4766d0 --- /dev/null +++ b/doc/installation.md @@ -0,0 +1,62 @@ +(installation)= +# Get Started +This guide provides installation instructions for the SageMaker HyperPod CLI and SDK. + +## System Requirements + +### Supported Platforms +- Linux +- macOS + +```{note} + Windows is not supported at this time. +``` + +### Supported ML Frameworks for Training +- PyTorch (version ≥ 1.10) + +### Supported Python Versions +- 3.9 and above + +## Prerequisites + +### For Training +SageMaker HyperPod CLI currently supports `HyperPodPytorchJob` training workloads. +To run these jobs, install the **SageMaker Training Operator**. + +[Install the SageMaker Training Operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator-install.html) + +### For Inference +The CLI supports creating inference endpoints using JumpStart models or custom models. +To enable this, install the **SageMaker Inference Operator**. 
+ +[Install the SageMaker Inference Operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-model-deployment-setup.html) + +## Installation Options + +### Install from PyPI + +It's recommended to install the SageMaker HyperPod CLI and SDK in a Python virtual environment to avoid conflicts with other packages: +```bash +# Create a virtual environment +python -m venv {venv-name} + +# Activate the virtual environment +source {venv-name}/bin/activate +``` +```{note} +Remember to activate your virtual environment (source {venv-name}/bin/activate) each time you want to use the HyperPod CLI and SDK if you chose the virtual environment installation method. +``` +You can install the SageMaker HyperPod CLI and SDK directly using `pip`: + +```bash +# Install from PyPI +pip install sagemaker-hyperpod +``` + +To verify that the installation was successful, run: + +```bash +# Verify CLI installation +hyp --help +``` diff --git a/doc/requirements.txt b/doc/requirements.txt new file mode 100644 index 00000000..98058a3c --- /dev/null +++ b/doc/requirements.txt @@ -0,0 +1,12 @@ +sphinx>=4.0.0,<8.0.0 +nbsphinx>=0.8.8 +myst-nb>=0.17.1 +ipykernel>=6.0.0 +jupyter>=1.0.0 +sphinx-book-theme>=1.0.0 +linkify-it-py>=2.0.0 +sphinx-design>=0.5.0 +sphinx-tabs>=3.4.1 +sphinx-copybutton +autodoc-pydantic>=2.0.0 +sphinx-click>=6.0.0 diff --git a/doc/sdk/cluster_management/hp_cluster_stack.rst b/doc/sdk/cluster_management/hp_cluster_stack.rst new file mode 100644 index 00000000..354c38d1 --- /dev/null +++ b/doc/sdk/cluster_management/hp_cluster_stack.rst @@ -0,0 +1,76 @@ +Cluster Management +================================ + +.. automodule:: sagemaker.hyperpod.cluster_management.hp_cluster_stack + :exclude-members: model_config, __init__ + :no-undoc-members: + :no-show-inheritance: + + + +SageMaker Core Cluster Update Method +==================================== + +The cluster management also supports updating cluster properties using the SageMaker Core Cluster update method from ``sagemaker_core.main.resources``: + +.. py:method:: Cluster.update(instance_groups=None, restricted_instance_groups=None, node_recovery=None, instance_groups_to_delete=None) + + Update a SageMaker Core Cluster resource. + + **Parameters:** + + .. list-table:: + :header-rows: 1 + :widths: 25 20 55 + + * - Parameter + - Type + - Description + * - instance_groups + - List[ClusterInstanceGroupSpecification] + - List of instance group specifications to update + * - restricted_instance_groups + - List[ClusterRestrictedInstanceGroupSpecification] + - List of restricted instance group specifications + * - node_recovery + - str + - Node recovery setting ("Automatic" or "None") + * - instance_groups_to_delete + - List[str] + - List of instance group names to delete + + **Returns:** + + The updated Cluster resource + + **Raises:** + + - ``botocore.exceptions.ClientError``: AWS service related errors + - ``ConflictException``: Conflict when modifying SageMaker entity + - ``ResourceLimitExceeded``: SageMaker resource limit exceeded + - ``ResourceNotFound``: Resource being accessed is not found + + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: python + + from sagemaker_core.main.resources import Cluster + from sagemaker_core.main.shapes import ClusterInstanceGroupSpecification + + # Get existing cluster + cluster = Cluster.get(cluster_name="my-cluster") + + # Update cluster with new instance groups and node recovery + cluster.update( + instance_groups=[ + ClusterInstanceGroupSpecification( + InstanceCount=2, + InstanceGroupName="worker-nodes", + InstanceType="ml.m5.large" + ) + ], + node_recovery="Automatic", + instance_groups_to_delete=["old-group-name"] + ) \ No newline at end of file diff --git a/doc/sdk/inference/hp_endpoint.rst b/doc/sdk/inference/hp_endpoint.rst new file mode 100644 index 00000000..7fb1fb08 --- /dev/null +++ b/doc/sdk/inference/hp_endpoint.rst @@ -0,0 +1,25 @@ +Inference +=========== + +* `HPEndpointBase`_ +* `HPEndpoint`_ +* `HPJumpStartEndpoint`_ +* `HPEndpoint Configs`_ + + +.. automodule:: sagemaker.hyperpod.inference.hp_endpoint_base + :exclude-members: is_kubeconfig_loaded, get_logger, verify_kube_config + :no-undoc-members: + :no-show-inheritance: + +.. automodule:: sagemaker.hyperpod.inference.hp_endpoint + :no-undoc-members: + +.. automodule:: sagemaker.hyperpod.inference.hp_jumpstart_endpoint + :no-undoc-members: + +.. automodule:: sagemaker.hyperpod.inference.config.hp_endpoint_config + :no-undoc-members: + +.. automodule:: sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config + :no-undoc-members: diff --git a/doc/sdk/metadata.rst b/doc/sdk/metadata.rst new file mode 100644 index 00000000..6ae5472d --- /dev/null +++ b/doc/sdk/metadata.rst @@ -0,0 +1,7 @@ +Metadata +------------ + +.. automodule:: sagemaker.hyperpod.common.config.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/sdk/sdk_index.rst b/doc/sdk/sdk_index.rst new file mode 100644 index 00000000..7bdad56b --- /dev/null +++ b/doc/sdk/sdk_index.rst @@ -0,0 +1,41 @@ +############# +SDK Reference +############# + +.. toctree:: + :hidden: + :maxdepth: 2 + + cluster_management/hp_cluster_stack + training/hyperpod_pytorch_job + inference/hp_endpoint + +Complete reference for the SageMaker HyperPod SDK. + +.. container:: + + .. grid:: 1 1 3 3 + :gutter: 3 + + .. grid-item-card:: Cluster Management SDK + :link: cluster_management/hp_cluster_stack + :link-type: doc + :class-card: sd-border-secondary + + Cluster Management SDK classes, methods and parameters. + + .. grid-item-card:: Training SDK + :link: training/hyperpod_pytorch_job + :link-type: doc + :class-card: sd-border-secondary + + Training SDK classes, methods and parameters. + + .. grid-item-card:: Inference SDK + :link: inference/hp_endpoint + :link-type: doc + :class-card: sd-border-secondary + + Inference SDK classes, methods and parameters. + + diff --git a/doc/sdk/training/hyperpod_pytorch_job.rst b/doc/sdk/training/hyperpod_pytorch_job.rst new file mode 100644 index 00000000..779bc85e --- /dev/null +++ b/doc/sdk/training/hyperpod_pytorch_job.rst @@ -0,0 +1,21 @@ +Training +=========== + +* `HyperPodPytorchJob`_ +* `HyperPodPytorchJob Configs`_ + + +HyperPodPytorchJob +------------------- + +.. autoclass:: sagemaker.hyperpod.training.hyperpod_pytorch_job.HyperPodPytorchJob + :exclude-members: is_kubeconfig_loaded, model_config, metadata, status, get_logger, verify_kube_config + :show-inheritance: + + +HyperPodPytorchJob Configs +--------------------------- + +.. 
automodule:: sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config + :members: _HyperPodPytorchJob + :show-inheritance: diff --git a/examples/cluster_management/cluster_creation_init_experience.ipynb b/examples/cluster_management/cluster_creation_init_experience.ipynb new file mode 100644 index 00000000..db01dcc6 --- /dev/null +++ b/examples/cluster_management/cluster_creation_init_experience.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker HyperPod Cluster Creation - Init Experience\n", + "\n", + "This notebook demonstrates the complete end-to-end workflow for creating a SageMaker HyperPod cluster using the HyperPod CLI. The init experience provides a guided approach to cluster creation with validation and configuration management.\n", + "\n", + "## Prerequisites\n", + "\n", + "- AWS CLI configured with appropriate permissions\n", + "- SageMaker HyperPod CLI installed (`pip install sagemaker-hyperpod`)\n", + "- Helm installed (required for cluster operations)\n", + "- Python 3.8+ environment\n", + "\n", + "## Workflow Overview\n", + "\n", + "1. **Initialize** - Create initial cluster configuration\n", + "2. **Configure** - Customize cluster settings and tags\n", + "3. **Validate** - Verify configuration before deployment\n", + "4. **Create** - Deploy the cluster infrastructure\n", + "5. **Monitor** - Check cluster status and manage lifecycle\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Initialize Cluster Configuration\n", + "\n", + "The `hyp init cluster-stack` command creates a new cluster configuration template with default settings. This generates a `config.yaml` file that serves as the foundation for your cluster deployment.\n", + "\n", + "**What this does:**\n", + "- Creates a new `config.yaml` with default cluster settings\n", + "- Sets up basic infrastructure components (VPC, EKS, S3, etc.)\n", + "- Generates unique resource names to avoid conflicts\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Initialize a new cluster stack configuration\n", + "!hyp init cluster-stack" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Configure Cluster Settings\n", + "\n", + "The `hyp configure` command allows you to customize your cluster configuration. 
You can add tags for resource management, modify instance types, adjust networking settings, and more.\n", + "\n", + "**Key configuration options:**\n", + "- **Tags**: For resource organization and cost tracking\n", + "- **Instance Groups**: Define compute resources and their specifications\n", + "- **Networking**: VPC, subnets, and security group settings\n", + "- **Storage**: FSx and EBS volume configurations\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Configure cluster with custom tags for resource management\n", + "# Tags help with cost tracking, resource organization, and compliance\n", + "!hyp configure --tags '[{\"Key\": \"Environment\", \"Value\": \"Development\"}, {\"Key\": \"Project\", \"Value\": \"MLTraining\"}, {\"Key\": \"Owner\", \"Value\": \"DataScienceTeam\"}, {\"Key\": \"CostCenter\", \"Value\": \"ML-Research\"}]'" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Current Configuration\n", + "\n", + "Let's examine the generated configuration to understand what will be deployed:" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Display the current configuration\n", + "!cat config.yaml | head -50" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Validate Configuration\n", + "\n", + "The `hyp validate` command performs comprehensive validation of your cluster configuration before deployment. This helps catch configuration errors early and ensures all prerequisites are met.\n", + "\n", + "**Validation checks include:**\n", + "- AWS credentials and permissions\n", + "- Resource quotas and limits\n", + "- Configuration syntax and values\n", + "- Network and security settings\n", + "- Instance type availability in target regions\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Validate the cluster configuration\n", + "# This checks for potential issues before deployment\n", + "!hyp validate" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Reset Configuration (Optional)\n", + "\n", + "The `hyp reset` command allows you to reset your configuration to defaults or clean up any partial deployments. This is useful when you want to start fresh or if validation reveals issues that require a clean slate.\n", + "\n", + "**Use cases for reset:**\n", + "- Starting over with a clean configuration\n", + "- Cleaning up after failed deployments\n", + "- Switching between different cluster configurations\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Reset configuration if needed (uncomment to use)\n", + "# !hyp reset\n", + "\n", + "print(\"Reset command available if configuration changes are needed\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Create the Cluster\n", + "\n", + "The `hyp create` command deploys your HyperPod cluster infrastructure. 
This process creates all the necessary AWS resources including VPC, EKS cluster, IAM roles, S3 buckets, and the HyperPod cluster itself.\n", + "\n", + "**Deployment includes:**\n", + "- VPC and networking infrastructure\n", + "- EKS cluster with managed node groups\n", + "- SageMaker HyperPod cluster\n", + "- IAM roles and policies\n", + "- S3 buckets for artifacts\n", + "- FSx file system (if configured)\n", + "\n", + "**Note:** This process typically takes 15-30 minutes to complete.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Create the HyperPod cluster\n", + "# This will deploy all infrastructure components\n", + "!hyp create" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Monitor Cluster Creation\n", + "\n", + "While the cluster is being created, you can monitor its progress using the describe and list commands. These provide real-time status updates on the deployment process.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Check cluster creation status\n", + "import time\n", + "\n", + "print(\"Monitoring cluster creation progress...\")\n", + "for i in range(5):\n", + " print(f\"\\n--- Status Check {i+1} ---\")\n", + " !hyp describe cluster-stack \n", + " time.sleep(30) # Wait 30 seconds between checks" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Describe Cluster Stack\n", + "\n", + "The `hyp describe cluster-stack` command provides detailed information about your deployed cluster, including resource IDs, endpoints, and current status.\n", + "\n", + "**Information provided:**\n", + "- Cluster status and health\n", + "- Resource ARNs and IDs\n", + "- Network configuration details\n", + "- Instance group information\n", + "- Storage configuration\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Get detailed information about the cluster stack\n", + "!hyp describe cluster-stack " + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: List All Cluster Stacks\n", + "\n", + "The `hyp list cluster-stack` command shows all HyperPod cluster stacks in your account. This is useful for managing multiple clusters and getting an overview of your infrastructure.\n", + "\n", + "**Displays:**\n", + "- All cluster stacks in the current region\n", + "- Stack names and creation timestamps\n", + "- Current status of each stack\n", + "- Resource counts and types\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# List all cluster stacks in your account\n", + "!hyp list cluster-stack" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 9: Update Cluster Configuration\n", + "\n", + "The `hyp update cluster` command allows you to modify your existing cluster configuration. 
You can add or remove instance groups, update tags, or modify other cluster settings.\n", + "\n", + "**Common update scenarios:**\n", + "- Scaling instance groups up or down\n", + "- Adding new instance types\n", + "- Updating cluster tags\n", + "- Modifying storage configurations\n", + "\n", + "**Note:** Some changes may require cluster restart or recreation.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Update cluster configuration (example: adding more tags)\n", + "# Uncomment and modify as needed\n", + "# !hyp update cluster --add-tags '[{\"Key\": \"UpdatedBy\", \"Value\": \"NotebookExample\"}]'\n", + "\n", + "print(\"Update command available for cluster modifications\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 10: Verify Cluster Connectivity\n", + "\n", + "Once your cluster is created, verify that you can connect to it and that all components are functioning properly.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Set cluster context for kubectl operations\n", + "# Replace 'your-cluster-name' with your actual cluster name\n", + "# !hyp set-cluster-context --cluster-name your-cluster-name\n", + "\n", + "# Get cluster context information\n", + "# !hyp get-cluster-context\n", + "\n", + "print(\"Cluster connectivity commands available after deployment\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "After successfully creating your HyperPod cluster, you can:\n", + "\n", + "1. **Submit Training Jobs**: Use `hyp create hyp-pytorch-job` to run distributed training\n", + "2. **Deploy Inference Endpoints**: Use `hyp create hyp-jumpstart-endpoint` for model serving\n", + "3. **Monitor Resources**: Check pod status with `hyp list-pods`\n", + "4. **Access Logs**: View training logs with `hyp get-logs`\n", + "5. 
**Scale Cluster**: Add or remove instance groups as needed\n", + "\n", + "## Troubleshooting\n", + "\n", + "If you encounter issues during cluster creation:\n", + "\n", + "- Check AWS CloudFormation console for detailed error messages\n", + "- Verify AWS credentials and permissions\n", + "- Ensure resource quotas are sufficient\n", + "- Review the configuration file for syntax errors\n", + "- Use `hyp validate` to identify configuration issues\n", + "\n", + "## Cleanup\n", + "\n", + "To avoid ongoing charges, remember to delete your cluster when no longer needed:\n", + "\n", + "```bash\n", + "hyp delete cluster-stack --stack-name your-stack-name\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook demonstrated the complete HyperPod cluster creation workflow:\n", + "\n", + "✅ **Initialized** cluster configuration with `hyp init cluster-stack` \n", + "✅ **Configured** cluster settings and tags with `hyp configure` \n", + "✅ **Validated** configuration with `hyp validate` \n", + "✅ **Created** cluster infrastructure with `hyp create` \n", + "✅ **Monitored** deployment with `hyp describe cluster-stack` \n", + "✅ **Listed** all clusters with `hyp list cluster-stack` \n", + "✅ **Updated** cluster configuration with `hyp update cluster` \n", + "\n", + "Your HyperPod cluster is now ready for distributed machine learning workloads!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/cluster_management/cluster_creation_sdk_experience.ipynb b/examples/cluster_management/cluster_creation_sdk_experience.ipynb new file mode 100644 index 00000000..4284094a --- /dev/null +++ b/examples/cluster_management/cluster_creation_sdk_experience.ipynb @@ -0,0 +1,683 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker HyperPod Cluster Creation - SDK Experience\n", + "\n", + "This notebook demonstrates the complete end-to-end workflow for creating a SageMaker HyperPod cluster using the HyperPod SDK with the HpClusterStack class. The SDK provides programmatic control over cluster lifecycle management.\n", + "\n", + "## Prerequisites\n", + "\n", + "- AWS CLI configured with appropriate permissions\n", + "- SageMaker HyperPod SDK installed (`pip install sagemaker-hyperpod`)\n", + "- SageMaker Core SDK installed (`pip install sagemaker-core`)\n", + "- Python 3.8+ environment\n", + "\n", + "## Workflow Overview\n", + "\n", + "1. **Initialize** - Create HpClusterStack instance with configuration\n", + "2. **Configure** - Set cluster settings and tags programmatically\n", + "3. **Create** - Deploy the cluster infrastructure\n", + "4. **Monitor** - Check cluster status and manage lifecycle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Import Required Libraries and Initialize Configuration\n", + "\n", + "First, we'll import the necessary SDK components and create an HpClusterStack instance with default settings. 
This is equivalent to `hyp init cluster-stack` in the CLI.\n", + "\n", + "**What this does:**\n", + "- Imports HpClusterStack and related classes\n", + "- Creates cluster configuration with default settings\n", + "- Sets up basic infrastructure components (VPC, EKS, S3, etc.)\n", + "- Generates unique resource names to avoid conflicts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import time\n", + "from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack\n", + "from sagemaker_core.main.resources import Cluster\n", + "\n", + "# Generate unique resource prefix to avoid conflicts\n", + "resource_prefix = f\"hyperpod-sdk-{str(uuid.uuid4())[:8]}\"\n", + "\n", + "# Initialize cluster stack configuration (equivalent to hyp init cluster-stack)\n", + "cluster_stack = HpClusterStack(\n", + " stage=\"prod\",\n", + " resource_name_prefix=resource_prefix,\n", + " hyperpod_cluster_name=f\"{resource_prefix}-cluster\",\n", + " eks_cluster_name=f\"{resource_prefix}-eks\",\n", + " s3_bucket_name=f\"{resource_prefix}-s3-bucket\",\n", + " sagemaker_iam_role_name=f\"{resource_prefix}-iam-role\",\n", + " \n", + " # Infrastructure components to create\n", + " create_vpc_stack=True,\n", + " create_security_group_stack=True,\n", + " create_eks_cluster_stack=True,\n", + " create_s3_bucket_stack=True,\n", + " create_s3_endpoint_stack=True,\n", + " create_life_cycle_script_stack=True,\n", + " create_sagemaker_iam_role_stack=True,\n", + " create_helm_chart_stack=True,\n", + " create_hyperpod_cluster_stack=True,\n", + " create_fsx_stack=True,\n", + " \n", + " # Network configuration\n", + " vpc_cidr=\"10.192.0.0/16\",\n", + " availability_zone_ids=[\"use2-az1\", \"use2-az2\", \"use2-az3\"],\n", + " \n", + " # Kubernetes configuration\n", + " kubernetes_version=\"1.31\",\n", + " node_provisioning_mode=\"Continuous\",\n", + " \n", + " # Instance group configuration\n", + " instance_group_settings=[\n", + " {\n", + " \"InstanceCount\": 1,\n", + " \"InstanceGroupName\": \"default\",\n", + " \"InstanceType\": \"ml.t3.medium\",\n", + " \"TargetAvailabilityZoneId\": \"use2-az2\",\n", + " \"ThreadsPerCore\": 1,\n", + " \"InstanceStorageConfigs\": [\n", + " {\"EbsVolumeConfig\": {\"VolumeSizeInGB\": 500}}\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(f\"Initialized cluster stack with prefix: {resource_prefix}\")\n", + "print(f\"Cluster name: {cluster_stack.hyperpod_cluster_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Configure Cluster Settings and Tags\n", + "\n", + "Configure the cluster with custom tags and additional settings. 
This is equivalent to `hyp configure --tags []` in the CLI.\n", + "\n", + "**Key configuration options:**\n", + "- **Tags**: For resource organization and cost tracking\n", + "- **Instance Groups**: Define compute resources and their specifications\n", + "- **Networking**: VPC, subnets, and security group settings\n", + "- **Storage**: FSx and EBS volume configurations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure cluster with custom tags (equivalent to hyp configure --tags)\n", + "cluster_tags = [\n", + " {\"Key\": \"Environment\", \"Value\": \"Development\"},\n", + " {\"Key\": \"Project\", \"Value\": \"MLTraining\"},\n", + " {\"Key\": \"Owner\", \"Value\": \"DataScienceTeam\"},\n", + " {\"Key\": \"CostCenter\", \"Value\": \"ML-Research\"},\n", + " {\"Key\": \"CreatedBy\", \"Value\": \"SDK-Example\"}\n", + "]\n", + "\n", + "# Update cluster stack with tags\n", + "cluster_stack.tags = cluster_tags\n", + "\n", + "# Additional configuration options\n", + "cluster_stack.node_recovery = \"Automatic\"\n", + "cluster_stack.fsx_availability_zone_id = \"use2-az2\"\n", + "cluster_stack.storage_capacity = 1200\n", + "cluster_stack.per_unit_storage_throughput = 250\n", + "\n", + "print(\"Configured cluster with custom tags:\")\n", + "for tag in cluster_tags:\n", + " print(f\" {tag['Key']}: {tag['Value']}\")\n", + "\n", + "print(f\"\\nNode recovery: {cluster_stack.node_recovery}\")\n", + "print(f\"FSx storage capacity: {cluster_stack.storage_capacity} GiB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Current Configuration\n", + "\n", + "Let's examine the current configuration to understand what will be deployed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display current configuration details\n", + "print(\"=== Cluster Configuration ===\")\n", + "print(f\"Resource Prefix: {cluster_stack.resource_name_prefix}\")\n", + "print(f\"HyperPod Cluster: {cluster_stack.hyperpod_cluster_name}\")\n", + "print(f\"EKS Cluster: {cluster_stack.eks_cluster_name}\")\n", + "print(f\"S3 Bucket: {cluster_stack.s3_bucket_name}\")\n", + "print(f\"VPC CIDR: {cluster_stack.vpc_cidr}\")\n", + "print(f\"Kubernetes Version: {cluster_stack.kubernetes_version}\")\n", + "print(f\"\\nInstance Groups:\")\n", + "for ig in cluster_stack.instance_group_settings:\n", + " print(f\" - {ig['InstanceGroupName']}: {ig['InstanceCount']}x {ig['InstanceType']}\")\n", + "print(f\"\\nInfrastructure Components:\")\n", + "print(f\" VPC Stack: {cluster_stack.create_vpc_stack}\")\n", + "print(f\" EKS Stack: {cluster_stack.create_eks_cluster_stack}\")\n", + "print(f\" HyperPod Stack: {cluster_stack.create_hyperpod_cluster_stack}\")\n", + "print(f\" FSx Stack: {cluster_stack.create_fsx_stack}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Create the Cluster\n", + "\n", + "Deploy the HyperPod cluster infrastructure using the SDK. This is equivalent to `hyp create` in the CLI.\n", + "\n", + "**Deployment includes:**\n", + "- VPC and networking infrastructure\n", + "- EKS cluster with managed node groups\n", + "- SageMaker HyperPod cluster\n", + "- IAM roles and policies\n", + "- S3 buckets for artifacts\n", + "- FSx file system (if configured)\n", + "\n", + "**Note:** This process typically takes 15-30 minutes to complete." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the HyperPod cluster (equivalent to hyp create)\n", + "try:\n", + " print(\"Starting cluster creation...\")\n", + " print(f\"This will create cluster: {cluster_stack.hyperpod_cluster_name}\")\n", + " \n", + " # Deploy the cluster infrastructure\n", + " response = cluster_stack.create(region=\"us-east-2\")\n", + " \n", + " print(\"\\n✅ Cluster creation initiated successfully!\")\n", + " print(f\"Stack Name: {cluster_stack.stack_name}\")\n", + " print(f\"Stack ID: {cluster_stack.stack_id}\")\n", + " \n", + " # Store cluster information for later use\n", + " cluster_name = cluster_stack.hyperpod_cluster_name\n", + " stack_name = cluster_stack.stack_name\n", + " \n", + " print(f\"\\nCluster creation is in progress. This may take 15-30 minutes.\")\n", + " print(f\"Monitor progress in the next steps.\")\n", + " \n", + "except Exception as e:\n", + " print(f\"\\n❌ Cluster creation failed: {str(e)}\")\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Monitor Cluster Creation\n", + "\n", + "Monitor the cluster creation progress using SDK methods. This provides real-time status updates on the deployment process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Monitor cluster creation progress\n", + "def monitor_cluster_creation(stack_name, max_checks=30, interval=120):\n", + " \"\"\"Monitor cluster creation progress\"\"\"\n", + " print(f\"Monitoring cluster creation progress for stack: {stack_name}\")\n", + " \n", + " for i in range(max_checks):\n", + " try:\n", + " print(f\"\\n--- Status Check {i+1}/{max_checks} ---\")\n", + " \n", + " # Check stack status\n", + " status = HpClusterStack.check_status(stack_name, region=\"us-east-2\")\n", + " print(f\"Stack Status: {status}\")\n", + " \n", + " # Check if creation is complete\n", + " if status == \"CREATE_COMPLETE\":\n", + " print(\"\\n🎉 Cluster creation completed successfully!\")\n", + " break\n", + " elif status in [\"CREATE_FAILED\", \"ROLLBACK_COMPLETE\", \"DELETE_COMPLETE\"]:\n", + " print(f\"\\n❌ Cluster creation failed with status: {status}\")\n", + " break\n", + " elif status == \"CREATE_IN_PROGRESS\":\n", + " print(\"⏳ Cluster creation still in progress...\")\n", + " \n", + " if i < max_checks - 1: # Don't sleep on the last iteration\n", + " print(f\"Waiting {interval} seconds before next check...\")\n", + " time.sleep(interval)\n", + " \n", + " except Exception as e:\n", + " print(f\"Error checking status: {str(e)}\")\n", + " break\n", + " \n", + " return status\n", + "\n", + "# Start monitoring (uncomment when cluster creation is initiated)\n", + "# final_status = monitor_cluster_creation(stack_name, max_checks=5, interval=30)\n", + "print(\"Monitoring function ready. Uncomment to start monitoring after cluster creation.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Describe Cluster Stack\n", + "\n", + "Get detailed information about the deployed cluster using SDK methods. 
This is equivalent to `hyp describe cluster-stack` in the CLI.\n", + "\n", + "**Information provided:**\n", + "- Cluster status and health\n", + "- Resource ARNs and IDs\n", + "- Network configuration details\n", + "- Instance group information\n", + "- Storage configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get detailed information about the cluster stack (equivalent to hyp describe cluster-stack)\n", + "def describe_cluster_stack(stack_name, region=\"us-east-2\"):\n", + " \"\"\"Describe cluster stack details\"\"\"\n", + " try:\n", + " print(f\"Describing cluster stack: {stack_name}\")\n", + " \n", + " # Get stack description\n", + " response = HpClusterStack.describe(stack_name, region=region)\n", + " \n", + " if response and 'Stacks' in response and len(response['Stacks']) > 0:\n", + " stack = response['Stacks'][0]\n", + " \n", + " print(\"\\n=== Stack Information ===\")\n", + " print(f\"Stack Name: {stack.get('StackName', 'N/A')}\")\n", + " print(f\"Stack Status: {stack.get('StackStatus', 'N/A')}\")\n", + " print(f\"Creation Time: {stack.get('CreationTime', 'N/A')}\")\n", + " print(f\"Stack ID: {stack.get('StackId', 'N/A')}\")\n", + " \n", + " # Display parameters\n", + " if 'Parameters' in stack:\n", + " print(\"\\n=== Parameters ===\")\n", + " for param in stack['Parameters'][:10]: # Show first 10 parameters\n", + " print(f\" {param['ParameterKey']}: {param['ParameterValue']}\")\n", + " \n", + " # Display outputs\n", + " if 'Outputs' in stack:\n", + " print(\"\\n=== Outputs ===\")\n", + " for output in stack['Outputs'][:10]: # Show first 10 outputs\n", + " print(f\" {output['OutputKey']}: {output['OutputValue']}\")\n", + " \n", + " # Display tags\n", + " if 'Tags' in stack:\n", + " print(\"\\n=== Tags ===\")\n", + " for tag in stack['Tags']:\n", + " print(f\" {tag['Key']}: {tag['Value']}\")\n", + " \n", + " return response\n", + " \n", + " except Exception as e:\n", + " print(f\"Error describing stack: {str(e)}\")\n", + " return None\n", + "\n", + "# Describe the cluster stack (uncomment when stack exists)\n", + "# describe_cluster_stack(stack_name)\n", + "print(\"Describe function ready. Use after cluster creation is complete.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: List All Cluster Stacks\n", + "\n", + "List all HyperPod cluster stacks in your account using SDK methods. 
This is equivalent to `hyp list cluster-stack` in the CLI.\n", + "\n", + "**Displays:**\n", + "- All cluster stacks in the current region\n", + "- Stack names and creation timestamps\n", + "- Current status of each stack\n", + "- Resource counts and types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all cluster stacks (equivalent to hyp list cluster-stack)\n", + "def list_cluster_stacks(region=\"us-east-2\"):\n", + " \"\"\"List all cluster stacks in the account\"\"\"\n", + " try:\n", + " print(f\"Listing cluster stacks in region: {region}\")\n", + " \n", + " # Get list of stacks\n", + " response = HpClusterStack.list(region=region)\n", + " \n", + " if response and 'StackSummaries' in response:\n", + " stacks = response['StackSummaries']\n", + " \n", + " print(f\"\\n=== Found {len(stacks)} Stack(s) ===\")\n", + " \n", + " if stacks:\n", + " print(f\"{'Stack Name':<40} {'Status':<25} {'Creation Time':<20}\")\n", + " print(\"-\" * 85)\n", + " \n", + " for stack in stacks:\n", + " name = stack.get('StackName', 'N/A')[:39]\n", + " status = stack.get('StackStatus', 'N/A')[:24]\n", + " created = str(stack.get('CreationTime', 'N/A'))[:19]\n", + " print(f\"{name:<40} {status:<25} {created:<20}\")\n", + " else:\n", + " print(\"No cluster stacks found.\")\n", + " \n", + " return response\n", + " \n", + " except Exception as e:\n", + " print(f\"Error listing stacks: {str(e)}\")\n", + " return None\n", + "\n", + "# List all cluster stacks\n", + "list_response = list_cluster_stacks()\n", + "\n", + "# Filter for HyperPod-related stacks\n", + "if list_response and 'StackSummaries' in list_response:\n", + " hyperpod_stacks = [\n", + " stack for stack in list_response['StackSummaries']\n", + " if 'hyperpod' in stack.get('StackName', '').lower()\n", + " ]\n", + " \n", + " if hyperpod_stacks:\n", + " print(f\"\\n=== HyperPod Stacks ({len(hyperpod_stacks)}) ===\")\n", + " for stack in hyperpod_stacks:\n", + " print(f\" - {stack['StackName']} ({stack['StackStatus']})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Update Cluster Configuration\n", + "\n", + "Update the existing cluster configuration using sagemaker-core's Cluster class. This is equivalent to `hyp update cluster` in the CLI.\n", + "\n", + "**Common update scenarios:**\n", + "- Scaling instance groups up or down\n", + "- Adding new instance types\n", + "- Updating cluster tags\n", + "- Modifying storage configurations\n", + "\n", + "**Note:** Some changes may require cluster restart or recreation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Update cluster configuration using sagemaker-core Cluster class\n", + "def update_cluster(cluster_name, region=\"us-east-2\"):\n", + " \"\"\"Update cluster configuration (equivalent to hyp update cluster)\"\"\"\n", + " try:\n", + " print(f\"Updating cluster: {cluster_name}\")\n", + " \n", + " # Get existing cluster using sagemaker-core\n", + " cluster = Cluster.get(cluster_name=cluster_name)\n", + " \n", + " print(f\"\\nCurrent cluster status: {cluster.cluster_status}\")\n", + " print(f\"Current instance groups: {len(cluster.instance_groups)}\")\n", + " \n", + " # Display current instance groups\n", + " print(\"\\n=== Current Instance Groups ===\")\n", + " for ig in cluster.instance_groups:\n", + " print(f\" - {ig.instance_group_name}: {ig.current_count}x {ig.instance_type}\")\n", + " \n", + " # Example: Update cluster tags\n", + " updated_tags = [\n", + " {\"Key\": \"Environment\", \"Value\": \"Development\"},\n", + " {\"Key\": \"Project\", \"Value\": \"MLTraining\"},\n", + " {\"Key\": \"Owner\", \"Value\": \"DataScienceTeam\"},\n", + " {\"Key\": \"CostCenter\", \"Value\": \"ML-Research\"},\n", + " {\"Key\": \"UpdatedBy\", \"Value\": \"SDK-Example\"},\n", + " {\"Key\": \"LastUpdated\", \"Value\": str(time.time())}\n", + " ]\n", + " \n", + " # Update cluster with new tags\n", + " cluster.update(tags=updated_tags)\n", + " \n", + " print(\"\\n✅ Cluster updated successfully!\")\n", + " print(\"Updated tags:\")\n", + " for tag in updated_tags:\n", + " print(f\" {tag['Key']}: {tag['Value']}\")\n", + " \n", + " return cluster\n", + " \n", + " except Exception as e:\n", + " print(f\"Error updating cluster: {str(e)}\")\n", + " return None\n", + "\n", + "# Example: Scale instance group\n", + "def scale_instance_group(cluster_name, instance_group_name, target_count, region=\"us-east-2\"):\n", + " \"\"\"Scale an instance group to target count\"\"\"\n", + " try:\n", + " print(f\"Scaling instance group '{instance_group_name}' to {target_count} instances\")\n", + " \n", + " # Get cluster\n", + " cluster = Cluster.get(cluster_name=cluster_name)\n", + " \n", + " # Find the instance group\n", + " target_ig = None\n", + " for ig in cluster.instance_groups:\n", + " if ig.instance_group_name == instance_group_name:\n", + " target_ig = ig\n", + " break\n", + " \n", + " if not target_ig:\n", + " print(f\"Instance group '{instance_group_name}' not found\")\n", + " return None\n", + " \n", + " print(f\"Current count: {target_ig.current_count}\")\n", + " print(f\"Target count: {target_count}\")\n", + " \n", + " # Update instance group count\n", + " target_ig.target_count = target_count\n", + " \n", + " # Apply the update\n", + " cluster.update(instance_groups=[target_ig])\n", + " \n", + " print(f\"\\n✅ Instance group scaling initiated!\")\n", + " \n", + " return cluster\n", + " \n", + " except Exception as e:\n", + " print(f\"Error scaling instance group: {str(e)}\")\n", + " return None\n", + "\n", + "# Update functions ready (uncomment when cluster exists)\n", + "# updated_cluster = update_cluster(cluster_name)\n", + "# scaled_cluster = scale_instance_group(cluster_name, \"controller-group\", 2)\n", + "\n", + "print(\"Update functions ready. 
Use after cluster creation is complete.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: Verify Cluster Status and Health\n", + "\n", + "Verify that the cluster is healthy and ready for workloads using comprehensive status checks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Comprehensive cluster health check\n", + "def check_cluster_health(cluster_name, region=\"us-east-2\"):\n", + " \"\"\"Perform comprehensive cluster health check\"\"\"\n", + " try:\n", + " print(f\"Checking health for cluster: {cluster_name}\")\n", + " \n", + " # Get cluster details\n", + " cluster = Cluster.get(cluster_name=cluster_name)\n", + " \n", + " print(\"\\n=== Cluster Health Summary ===\")\n", + " print(f\"Cluster Name: {cluster.cluster_name}\")\n", + " print(f\"Cluster Status: {cluster.cluster_status}\")\n", + " print(f\"Creation Time: {cluster.creation_time}\")\n", + " print(f\"Cluster ARN: {cluster.cluster_arn}\")\n", + " \n", + " # Check instance groups health\n", + " print(\"\\n=== Instance Groups Health ===\")\n", + " total_instances = 0\n", + " healthy_instances = 0\n", + " \n", + " for ig in cluster.instance_groups:\n", + " print(f\"\\nInstance Group: {ig.instance_group_name}\")\n", + " print(f\" Instance Type: {ig.instance_type}\")\n", + " print(f\" Current Count: {ig.current_count}\")\n", + " print(f\" Target Count: {getattr(ig, 'target_count', 'N/A')}\")\n", + " print(f\" Status: {getattr(ig, 'instance_group_status', 'N/A')}\")\n", + " \n", + " total_instances += ig.current_count\n", + " if getattr(ig, 'instance_group_status', '') == 'InService':\n", + " healthy_instances += ig.current_count\n", + " \n", + " print(f\"\\n=== Overall Health ===\")\n", + " print(f\"Total Instances: {total_instances}\")\n", + " print(f\"Healthy Instances: {healthy_instances}\")\n", + " health_percentage = (healthy_instances / total_instances * 100) if total_instances > 0 else 0\n", + " print(f\"Health Percentage: {health_percentage:.1f}%\")\n", + " \n", + " # Determine overall health status\n", + " if cluster.cluster_status == 'InService' and health_percentage >= 80:\n", + " print(\"\\n🟢 Cluster is HEALTHY and ready for workloads\")\n", + " elif cluster.cluster_status == 'Creating':\n", + " print(\"\\n🟡 Cluster is still CREATING\")\n", + " else:\n", + " print(\"\\n🔴 Cluster may have ISSUES - check individual components\")\n", + " \n", + " return cluster\n", + " \n", + " except Exception as e:\n", + " print(f\"Error checking cluster health: {str(e)}\")\n", + " return None\n", + "\n", + "# Health check function ready (uncomment when cluster exists)\n", + "# cluster_health = check_cluster_health(cluster_name)\n", + "\n", + "print(\"Health check function ready. Use after cluster creation is complete.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "After successfully creating your HyperPod cluster using the SDK, you can:\n", + "\n", + "1. **Submit Training Jobs**: Use HyperPod SDK training classes for distributed training\n", + "2. **Deploy Inference Endpoints**: Use HyperPod SDK inference classes for model serving\n", + "3. **Monitor Resources**: Use SDK methods to check pod and job status\n", + "4. **Access Logs**: Retrieve training and system logs programmatically\n", + "5. 
**Scale Cluster**: Modify instance groups using the Cluster class\n", + "\n", + "## Troubleshooting\n", + "\n", + "If you encounter issues during cluster creation:\n", + "\n", + "- Check AWS CloudFormation console for detailed error messages\n", + "- Verify AWS credentials and permissions using `boto3.Session()`\n", + "- Ensure resource quotas are sufficient\n", + "- Review the cluster configuration parameters\n", + "\n", + "## Cleanup\n", + "\n", + "To avoid ongoing charges, remember to delete your cluster when no longer needed:\n", + "\n", + "```python\n", + "# Delete cluster using sagemaker-core\n", + "cluster = Cluster.get(cluster_name=cluster_name)\n", + "cluster.delete()\n", + "\n", + "# Or delete the entire stack\n", + "import boto3\n", + "cf_client = boto3.client('cloudformation', region_name='us-east-2')\n", + "cf_client.delete_stack(StackName=stack_name)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook demonstrated the complete HyperPod cluster creation workflow using the SDK:\n", + "\n", + "✅ **Initialized** cluster configuration with `HpClusterStack` class \n", + "✅ **Configured** cluster settings and tags programmatically \n", + "✅ **Created** cluster infrastructure with `cluster_stack.create()` \n", + "✅ **Monitored** deployment with `HpClusterStack.check_status()` \n", + "✅ **Listed** all clusters with `HpClusterStack.list()` \n", + "✅ **Updated** cluster configuration with `Cluster.update()` \n", + "✅ **Verified** cluster health with comprehensive checks \n", + "\n", + "Your HyperPod cluster is now ready for distributed machine learning workloads using the SDK!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb b/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb index 8aa6e2fc..05913ec8 100644 --- a/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb +++ b/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb @@ -5,7 +5,7 @@ "id": "2d55c8b9", "metadata": {}, "source": [ - "## Inference Operator CLI E2E Expereience (S3 custom model)" + "## Inference Operator CLI E2E Expereience (FSX custom model)" ] }, { @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp set-cluster-context --cluster-name hp-cluster-for-inf-Beta2try1" + "!hyp set-cluster-context --cluster-name " ] }, { @@ -47,24 +47,19 @@ "source": [ "!hyp create hyp-custom-endpoint \\\n", " --version 1.0 \\\n", - " --env \\\n", - " '{\"HF_MODEL_ID\":\"/opt/ml/model\", \\\n", - " \"SAGEMAKER_PROGRAM\":\"inference.py\", \\\n", - " \"SAGEMAKER_SUBMIT_DIRECTORY\":\"/opt/ml/model/code\", \\\n", - " \"MODEL_CACHE_ROOT\":\"/opt/ml/model\", \\\n", - " \"SAGEMAKER_ENV\":\"1\"}' \\\n", + " --env '{ \"key1\": \"val1\", \"key2\": \"val2\"}' \\\n", " --model-source-type fsx \\\n", - " --model-location deepseek-1-5b \\\n", - " --fsx-file-system-id fs-0e6a92495c35a81f2 \\\n", - " --image-uri 763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0 \\\n", + " --model-location \\\n", + " --fsx-file-system-id \\\n", + " 
--image-uri \\\n", " --model-volume-mount-name model-weights \\\n", " --container-port 8080 \\\n", " --resources-requests '{\"cpu\": \"4\", \"nvidia.com/gpu\": 1, \"memory\": \"32Gi\"}' \\\n", " --resources-limits '{\"nvidia.com/gpu\": 1}' \\\n", - " --tls-certificate-output-s3-uri s3://tls-bucket-inf1-beta2 \\\n", - " --instance-type ml.g5.8xlarge \\\n", - " --endpoint-name endpoint-fsx-test-cli \\\n", - " --model-name deepseek15b-fsx-test-cli" + " --tls-certificate-output-s3-uri s3://sample-bucket \\\n", + " --instance-type \\\n", + " --endpoint-name endpoint-fsx \\\n", + " --model-name " ] }, { @@ -84,7 +79,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp describe hyp-custom-endpoint --name endpoint-fsx-test-cli" + "!hyp describe hyp-custom-endpoint --name endpoint-fsx" ] }, { @@ -94,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp invoke hyp-custom-endpoint --endpoint-name endpoint-fsx-test-cli --body '{\"inputs\":\"What is the capital of USA?\"}'" + "!hyp invoke hyp-custom-endpoint --endpoint-name endpoint-fsx --body '{\"inputs\":\"What is the capital of USA?\"}'" ] }, { @@ -104,7 +99,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp delete hyp-custom-endpoint --name endpoint-fsx-test-cli" + "!hyp delete hyp-custom-endpoint --name endpoint-fsx" ] }, { diff --git a/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb b/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb index efd11840..d524c74c 100644 --- a/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb +++ b/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb @@ -1,10 +1,10 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", - "source": "", - "id": "f28ecfc84cef3505" + "id": "f28ecfc84cef3505", + "metadata": {}, + "source": [] }, { "cell_type": "markdown", @@ -41,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp set-cluster-context --cluster-name hp-cluster-for-inf-Beta2try1" + "!hyp set-cluster-context --cluster-name " ] }, { @@ -53,11 +53,9 @@ "source": [ "!hyp create hyp-jumpstart-endpoint \\\n", " --version 1.0 \\\n", - " --model-id deepseek-llm-r1-distill-qwen-1-5b \\\n", - " --model-version 2.0.4 \\\n", - " --instance-type ml.g5.8xlarge \\\n", - " --endpoint-name endpoint-js-test-cli \\\n", - " --tls-certificate-output-s3-uri s3://tls-bucket-inf1-beta2" + " --model-id \\\n", + " --instance-type \\\n", + " --endpoint-name endpoint-js \\" ] }, { @@ -77,7 +75,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp describe hyp-jumpstart-endpoint --name endpoint-js-test-cli" + "!hyp describe hyp-jumpstart-endpoint --name endpoint-js" ] }, { @@ -87,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp invoke hyp-jumpstart-endpoint --endpoint-name endpoint-js-test-cli --body '{\"inputs\":\"What is the capital of USA?\"}'" + "!hyp invoke hyp-jumpstart-endpoint --endpoint-name endpoint-js --body '{\"inputs\":\"What is the capital of USA?\"}'" ] }, { @@ -97,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp delete hyp-jumpstart-endpoint --name endpoint-js-test-cli" + "!hyp delete hyp-jumpstart-endpoint --name endpoint-js" ] }, { diff --git a/examples/inference/CLI/inference-jumpstart-init-experience.ipynb b/examples/inference/CLI/inference-jumpstart-init-experience.ipynb new file mode 100644 index 00000000..966998e4 --- /dev/null +++ b/examples/inference/CLI/inference-jumpstart-init-experience.ipynb @@ -0,0 +1,323 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker HyperPod Jumpstart Endpoint - Init 
Experience\n", + "\n", + "This notebook demonstrates the complete end-to-end workflow for creating a SageMaker HyperPod Jumpstart Endpoint using the HyperPod CLI. The init experience provides a guided approach to create Hyperpod Jumpstart Endpoint with validation and configuration management.\n", + "\n", + "## Prerequisites\n", + "\n", + "- SageMaker HyperPod CLI installed (`pip install sagemaker-hyperpod`)\n", + "- Hyperpod jumpstart inference template installed (`pip install hyperpod-jumpstart-inference-template`)\n", + "- Hyperpod inference operator installed in your hyperpod cluster\n", + "- Python 3.8+ environment\n", + "\n", + "## Workflow Overview\n", + "\n", + "1. **Initialize** - Create initial jumpstart endpoint configuration\n", + "2. **Configure** - Customize jumpstart endpoint parameters\n", + "3. **Validate** - Verify configuration before deployment\n", + "4. **Create** - Deploy the jumpstart endpoint creation\n", + "5. **Monitor** - Check jumpstart endpoint status and manage lifecycle\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 0: Connect to your Hyperpod cluster\n", + "\n", + "Make sure you have installed hyperpod inference operator in your hyperpod cluster.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all available SageMaker HyperPod clusters in your account\n", + "!hyp list-cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure your local kubectl environment to interact with a specific SageMaker HyperPod cluster (and namespace)\n", + "!hyp set-cluster-context --cluster-name ml-cluster-integ-test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Initialize Jumpstart Endpoint Configuration\n", + "\n", + "The `hyp init hyp-jumpstart-endpoint` command creates a new configuration template with default settings. This generates a `config.yaml` file that serves as the foundation for your deployment.\n", + "\n", + "**What this does:**\n", + "- Creates a `config.yaml` with default jumpstart endpoint settings.\n", + "- Creates a `k8s.jinja` which is a reference to the k8s payload that is going to be submitted with. Users can refer this to understand how the parameters are being used. 
\n", + "- Creates a `README.md` which is a detailed explanation of the init experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize a new jumpstart endpoint configuration in the current directory\n", + "!hyp init hyp-jumpstart-endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Configure Jumpstart Endpoint Settings\n", + "\n", + "The `hyp configure` command allows you to customize your jumpstart endpoint configuration.\n", + "\n", + "**Key configuration options:**\n", + "- **model_id**: Unique identifier of the model within the SageMakerPublicHub\n", + "- **instance_type**: EC2 instance type for the inference server\n", + "- **endpoint_name**: Name of SageMaker endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!hyp configure --endpoint-name my-jumpstart-endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Current Configuration\n", + "\n", + "Let's examine the generated configuration to understand what will be deployed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the current configuration\n", + "!cat config.yaml | head -50" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Validate Configuration\n", + "\n", + "The `hyp validate` command performs syntax validation of your jumpstart endpoint configuration before deployment. This helps catch configuration errors early and ensures all prerequisites are met.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Validate the jumpstart endpoint configuration\n", + "# This checks for potential issues before deployment\n", + "!hyp validate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Reset Configuration (Optional)\n", + "\n", + "The `hyp reset` command allows you to reset your configuration to defaults or clean up any partial deployments. This is useful when you want to start fresh or if validation reveals issues that require a clean slate.\n", + "\n", + "**Use cases for reset:**\n", + "- Starting over with a clean configuration\n", + "- Cleaning up after failed deployments\n", + "- Switching between different jumpstart endpoint configurations\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Reset configuration if needed (uncomment to use)\n", + "# !hyp reset\n", + "\n", + "print(\"Reset command available if configuration changes are needed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Create the Jumpstart Endpoint\n", + "\n", + "The `hyp create` command deploys your HyperPod jumpstart endpoint with configurations in the config.yaml. 
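Taken together, the preceding steps amount to a short shell sequence. The sketch below is illustrative only: it assumes the CLI and the jumpstart template package are already installed, and the endpoint name is a placeholder.

```bash
# Illustrative recap of the init experience (endpoint name is a placeholder)
hyp init hyp-jumpstart-endpoint                      # scaffolds config.yaml, k8s.jinja, README.md
hyp configure --endpoint-name my-jumpstart-endpoint  # customize key fields
hyp validate                                         # syntax-check the configuration
hyp create                                           # deploy the jumpstart endpoint
```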
A timestamped folder is created in the `runs` folder, where the config.yaml and the values-injected k8s.yaml kubernates payload is saved.\n", + "\n", + "**Note:** The sagemaker jumpstart endpoint typically takes 10-15 minutes to be created.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the jumpstart endpoint\n", + "!hyp create" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Monitor Jumpstart Endpoint Creation\n", + "\n", + "While the jumpstart endpoint is being created, you can monitor its progress using the describe and list commands. These provide real-time status updates on the deployment process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check jumpstart endpoint creation status\n", + "import time\n", + "\n", + "print(\"Monitoring jumpstart endpoint progress...\")\n", + "for i in range(5):\n", + " print(f\"\\n--- Status Check {i+1} ---\")\n", + " !hyp describe hyp-jumpstart-endpoint --name my-jumpstart-endpoint\n", + " time.sleep(30) # Wait 30 seconds between checks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Invoke Sagemaker Endpoint\n", + "\n", + "After the sagemaker endpoint is successfully created, you can use `hyp invoke hyp-jumpstart-endpoint` command to do basic invocation of sagemaker endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!hyp invoke hyp-jumpstart-endpoint --endpoint-name my-jumpstart-endpoint --body '{\"inputs\":\"What is the capital of USA?\"}'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: Describe Jumpstart Endpoint\n", + "\n", + "The `hyp describe hyp-jumpstart-endpoint` command provides detailed information about your jumpstart endpoint deployment status and sagemaker endpoint status." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get detailed information about the jumpstart endpoint\n", + "!hyp describe hyp-jumpstart-endpoint --name my-jumpstart-endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 9: List All Jumpstart Endpoints\n", + "\n", + "The `hyp list hyp-jumpstart-endpoint` command shows all HyperPod jumpstart endpoints in your account. This is useful for managing multiple jumpstart endpoint deployments and getting an overview of your deployments.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all jumpstart endpoints in your account\n", + "!hyp list hyp-jumpstart-endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "After successfully creating your HyperPod Jumpstart Endpoint, you can:\n", + "\n", + "1. **Monitor Resources**: Check pod status with `hyp list-pods hyp-jumpstart-endpoint`\n", + "2. 
**Access Logs**: View pod logs with `hyp get-logs hyp-jumpstart-endpoint`\n", + "\n", + "\n", + "## Troubleshooting\n", + "\n", + "If you encounter issues during Jumpstart Endpoint creation:\n", + "\n", + "- Use `hyp get-operator-logs hyp-jumpstart-endpoint` to check potential operator log errors\n", + "- Verify AWS credentials and permissions\n", + "- Ensure resource quotas are sufficient\n", + "- Review the configuration file for syntax errors\n", + "- Use `hyp validate` to identify configuration issues\n", + "\n", + "## Cleanup\n", + "\n", + "To avoid ongoing charges, remember to delete your jumpstart endpoint when no longer needed:\n", + "\n", + "```bash\n", + "hyp delete hyp-jumpstart-endpoint --name my-jumpstart-endpoint\n", + "```\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb b/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb index 64eee879..40b614c5 100644 --- a/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb +++ b/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp set-cluster-context --cluster-name hp-cluster-for-inf-Beta2try1" + "!hyp set-cluster-context --cluster-name " ] }, { @@ -47,38 +47,31 @@ "source": [ "!hyp create hyp-custom-endpoint \\\n", " --version 1.0 \\\n", - " --env \\\n", - " '{ \\\n", - " \"HF_MODEL_ID\": \"/opt/ml/model\", \\\n", - " \"SAGEMAKER_PROGRAM\": \"inference.py\", \\\n", - " \"SAGEMAKER_SUBMIT_DIRECTORY\": \"/opt/ml/model/code\", \\\n", - " \"MODEL_CACHE_ROOT\": \"/opt/ml/model\", \\\n", - " \"SAGEMAKER_ENV\": \"1\" \\\n", - " }' \\\n", + " --env '{ \"key1\": \"val1\", \"key2\": \"val2\"}' \\\n", " --metric-collection-period 30 \\\n", " --metric-name Invocations \\\n", " --metric-stat Sum \\\n", " --metric-type Average \\\n", " --min-value 0.0 \\\n", - " --cloud-watch-trigger-name SageMaker-Invocations-new \\\n", + " --cloud-watch-trigger-name SageMaker-Invocations \\\n", " --cloud-watch-trigger-namespace AWS/SageMaker \\\n", " --target-value 10 \\\n", " --use-cached-metrics true \\\n", " --model-source-type s3 \\\n", - " --model-location deepseek15b \\\n", - " --s3-bucket-name test-model-s3-zhaoqi \\\n", - " --s3-region us-east-2 \\\n", - " --image-uri 763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0 \\\n", + " --model-location \\\n", + " --s3-bucket-name \\\n", + " --s3-region \\\n", + " --image-uri \\\n", " --model-volume-mount-name model-weights \\\n", " --container-port 8080 \\\n", " --resources-requests '{\"cpu\": \"30000m\", \"nvidia.com/gpu\": 1, \"memory\": \"100Gi\"}' \\\n", " --resources-limits '{\"nvidia.com/gpu\": 1}' \\\n", - " --tls-certificate-output-s3-uri s3://tls-bucket-inf1-beta2 \\\n", - " --instance-type ml.g5.8xlarge \\\n", - " --dimensions '{\"EndpointName\": \"endpoint-s3-test-cli\", \"VariantName\": \"AllTraffic\"}' \\\n", + " --tls-certificate-output-s3-uri s3://sample-bucket \\\n", + " --instance-type \\\n", + " --dimensions '{\"EndpointName\": \"endpoint-s3\", \"VariantName\": \"AllTraffic\"}' \\\n", " 
--metrics-enabled true \\\n", - " --endpoint-name endpoint-s3-test-cli \\\n", - " --model-name deepseek15b-s3-test-cli" + " --endpoint-name endpoint-s3 \\\n", + " --model-name " ] }, { @@ -98,7 +91,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp describe hyp-custom-endpoint --name endpoint-s3-test-cli" + "!hyp describe hyp-custom-endpoint --name endpoint-s3" ] }, { @@ -108,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp invoke hyp-custom-endpoint --endpoint-name endpoint-s3-test-cli --body '{\"inputs\":\"What is the capital of USA?\"}'" + "!hyp invoke hyp-custom-endpoint --endpoint-name endpoint-s3 --body '{\"inputs\":\"What is the capital of USA?\"}'" ] }, { @@ -118,7 +111,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp delete hyp-custom-endpoint --name endpoint-s3-test-cli" + "!hyp delete hyp-custom-endpoint --name endpoint-s3" ] }, { diff --git a/examples/inference/CLI/inference-s3-model-init-experience.ipynb b/examples/inference/CLI/inference-s3-model-init-experience.ipynb new file mode 100644 index 00000000..35450e35 --- /dev/null +++ b/examples/inference/CLI/inference-s3-model-init-experience.ipynb @@ -0,0 +1,327 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker HyperPod Custom Endpoint - Init Experience\n", + "\n", + "This notebook demonstrates the complete end-to-end workflow for creating a SageMaker HyperPod Custom Endpoint using the HyperPod CLI. The init experience provides a guided approach to create Hyperpod Custom Endpoint with validation and configuration management.\n", + "\n", + "## Prerequisites\n", + "\n", + "- SageMaker HyperPod CLI installed (`pip install sagemaker-hyperpod`)\n", + "- Hyperpod custom inference template installed (`pip install hyperpod-custom-inference-template`)\n", + "- Hyperpod inference operator installed in your hyperpod cluster\n", + "- Python 3.8+ environment\n", + "\n", + "## Workflow Overview\n", + "\n", + "1. **Initialize** - Create initial custom endpoint configuration\n", + "2. **Configure** - Customize custom endpoint parameters\n", + "3. **Validate** - Verify configuration before deployment\n", + "4. **Create** - Deploy the custom endpoint creation\n", + "5. **Monitor** - Check custom endpoint status and manage lifecycle\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 0: Connect to your Hyperpod cluster\n", + "\n", + "Make sure you have installed hyperpod inference operator in your hyperpod cluster.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all available SageMaker HyperPod clusters in your account\n", + "!hyp list-cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure your local kubectl environment to interact with a specific SageMaker HyperPod cluster (and namespace)\n", + "!hyp set-cluster-context --cluster-name ml-cluster-integ-test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Initialize Custom Endpoint Configuration\n", + "\n", + "The `hyp init hyp-custom-endpoint` command creates a new configuration template with default settings. 
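As a minimal sketch, assuming you keep each component's configuration in its own working directory (the directory name below is illustrative), initialization looks like:

```bash
mkdir my-custom-endpoint && cd my-custom-endpoint   # optional: isolate this component's config
hyp init hyp-custom-endpoint                        # scaffold the configuration in the current directory
ls                                                  # expect config.yaml, k8s.jinja, README.md
```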
This generates a `config.yaml` file that serves as the foundation for your deployment.\n", + "\n", + "**What this does:**\n", + "- Creates a `config.yaml` with default custom endpoint settings.\n", + "- Creates a `k8s.jinja` which is a reference to the k8s payload that is going to be submitted with. Users can refer this to understand how the parameters are being used. \n", + "- Creates a `README.md` which is a detailed explanation of the init experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize a new custom endpoint configuration in the current directory\n", + "!hyp init hyp-custom-endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Configure Custom Endpoint Settings\n", + "\n", + "The `hyp configure` command allows you to customize your custom endpoint configuration.\n", + "\n", + "**Key configuration options:**\n", + "- **model_name**: Name of model to create on SageMaker\n", + "- **instance_type**: EC2 instance type for the inference server\n", + "- **endpoint_name**: Name of SageMaker endpoint\n", + "- **model_source_type**: Source type: fsx or s3\n", + "- **image_uri**: Inference server image name\n", + "- **container_port**: Port on which the model server listens\n", + "- **model_volume_mount_name**: Path inside container for model volume" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!hyp configure --endpoint-name my-custom-endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Current Configuration\n", + "\n", + "Let's examine the generated configuration to understand what will be deployed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the current configuration\n", + "!cat config.yaml | head -50" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Validate Configuration\n", + "\n", + "The `hyp validate` command performs syntax validation of your custom endpoint configuration before deployment. This helps catch configuration errors early and ensures all prerequisites are met.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Validate the custom endpoint configuration\n", + "# This checks for potential issues before deployment\n", + "!hyp validate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Reset Configuration (Optional)\n", + "\n", + "The `hyp reset` command allows you to reset your configuration to defaults or clean up any partial deployments. 
This is useful when you want to start fresh or if validation reveals issues that require a clean slate.\n", + "\n", + "**Use cases for reset:**\n", + "- Starting over with a clean configuration\n", + "- Cleaning up after failed deployments\n", + "- Switching between different custom endpoint configurations\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Reset configuration if needed (uncomment to use)\n", + "# !hyp reset\n", + "\n", + "print(\"Reset command available if configuration changes are needed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Create the Custom Endpoint\n", + "\n", + "The `hyp create` command deploys your HyperPod custom endpoint with configurations in the config.yaml. A timestamped folder is created in the `runs` folder, where the config.yaml and the values-injected k8s.yaml kubernates payload is saved.\n", + "\n", + "**Note:** The sagemaker custom endpoint typically takes 10-15 minutes to be created.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the custom endpoint\n", + "!hyp create" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Monitor Custom Endpoint Creation\n", + "\n", + "While the custom endpoint is being created, you can monitor its progress using the describe and list commands. These provide real-time status updates on the deployment process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check custom endpoint creation status\n", + "import time\n", + "\n", + "print(\"Monitoring custom endpoint progress...\")\n", + "for i in range(5):\n", + " print(f\"\\n--- Status Check {i+1} ---\")\n", + " !hyp describe hyp-custom-endpoint --name my-custom-endpoint\n", + " time.sleep(30) # Wait 30 seconds between checks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Invoke Sagemaker Endpoint\n", + "\n", + "After the sagemaker endpoint is successfully created, you can use `hyp invoke hyp-custom-endpoint` command to do basic invocation of sagemaker endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!hyp invoke hyp-custom-endpoint --endpoint-name my-custom-endpoint --body '{\"inputs\":\"What is the capital of USA?\"}'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: Describe Custom Endpoint\n", + "\n", + "The `hyp describe hyp-custom-endpoint` command provides detailed information about your custom endpoint deployment status and sagemaker endpoint status." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get detailed information about the custom endpoint\n", + "!hyp describe hyp-custom-endpoint --name my-custom-endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 9: List All Custom Endpoints\n", + "\n", + "The `hyp list hyp-custom-endpoint` command shows all HyperPod custom endpoints in your account. 
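A typical management loop, sketched below with illustrative endpoint names, combines list with the describe and delete commands shown earlier:

```bash
hyp list hyp-custom-endpoint                                   # overview of all custom endpoints
hyp describe hyp-custom-endpoint --name my-custom-endpoint     # drill into one deployment
hyp delete hyp-custom-endpoint --name stale-custom-endpoint    # remove one that is no longer needed
```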
This is useful for managing multiple custom endpoint deployments and getting an overview of your deployments.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all custom endpoints in your account\n", + "!hyp list hyp-custom-endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "After successfully creating your HyperPod Custom Endpoint, you can:\n", + "\n", + "1. **Monitor Resources**: Check pod status with `hyp list-pods hyp-custom-endpoint`\n", + "2. **Access Logs**: View pod logs with `hyp get-logs hyp-custom-endpoint`\n", + "\n", + "\n", + "## Troubleshooting\n", + "\n", + "If you encounter issues during Custom Endpoint creation:\n", + "\n", + "- Use `hyp get-operator-logs hyp-custom-endpoint` to check potential operator log errors\n", + "- Verify AWS credentials and permissions\n", + "- Ensure resource quotas are sufficient\n", + "- Review the configuration file for syntax errors\n", + "- Use `hyp validate` to identify configuration issues\n", + "\n", + "## Cleanup\n", + "\n", + "To avoid ongoing charges, remember to delete your custom endpoint when no longer needed:\n", + "\n", + "```bash\n", + "hyp delete hyp-custom-endpoint --name my-custom-endpoint\n", + "```\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/inference/SDK/inference-fsx-model-e2e.ipynb b/examples/inference/SDK/inference-fsx-model-e2e.ipynb index 10ae5b13..387cc6d5 100644 --- a/examples/inference/SDK/inference-fsx-model-e2e.ipynb +++ b/examples/inference/SDK/inference-fsx-model-e2e.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "f9758178", + "metadata": {}, + "source": [ + "## Inference Operator PySDK E2E Expereience (FSX custom model)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -7,10 +15,19 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager\n", - "\n", - "HyperPodManager.list_clusters(region='us-east-2')\n", - "HyperPodManager.set_context('', region='us-east-2')" + "from sagemaker.hyperpod import list_clusters, set_cluster_context\n", + "list_clusters(region='us-east-2')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "765ef3fd", + "metadata": {}, + "outputs": [], + "source": [ + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -20,8 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker\n", + "from sagemaker.hyperpod.inference.config.hp_endpoint_config import FsxStorage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker\n", "from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint\n", + "from 
sagemaker.hyperpod.common.config.metadata import Metadata\n", "import yaml\n", "import time" ] @@ -33,13 +51,17 @@ "metadata": {}, "outputs": [], "source": [ - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", + "# If you don't set metadata name, it will be default to endpoint name\n", + "# If you don't set namespace, it will be default to \"default\"\n", + "metadata=Metadata(name='', namespace='')\n", + "\n", + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "model_source_config = ModelSourceConfig(\n", " model_source_type='fsx',\n", - " model_location=\"\",\n", + " model_location=\"\",\n", " fsx_storage=FsxStorage(\n", - " file_system_id=''\n", + " file_system_id=''\n", " ),\n", ")\n", "\n", @@ -73,7 +95,8 @@ "outputs": [], "source": [ "fsx_endpoint = HPEndpoint(\n", - " endpoint_name='test-endpoint-name-fsx-pysdk',\n", + " metadata=metadata,\n", + " endpoint_name='',\n", " instance_type='ml.g5.8xlarge',\n", " model_name='deepseek15b-fsx-test-pysdk',\n", " tls_config=tls_config,\n", @@ -165,7 +188,7 @@ "metadata": {}, "outputs": [], "source": [ - "endpoint = HPEndpoint.get(name='')" + "endpoint = HPEndpoint.get(name='')" ] }, { diff --git a/examples/inference/SDK/inference-jumpstart-e2e.ipynb b/examples/inference/SDK/inference-jumpstart-e2e.ipynb index 1cb0b4b4..52f53c71 100644 --- a/examples/inference/SDK/inference-jumpstart-e2e.ipynb +++ b/examples/inference/SDK/inference-jumpstart-e2e.ipynb @@ -8,14 +8,6 @@ "## Inference Operator PySDK E2E Expereience (JumpStart model)" ] }, - { - "cell_type": "markdown", - "id": "1b3ce5c1-3c3d-4139-b7ae-042f360f3032", - "metadata": {}, - "source": [ - "Prerequisite: Data scientists should list clusters and set cluster context" - ] - }, { "cell_type": "code", "execution_count": null, @@ -23,7 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager" + "from sagemaker.hyperpod import list_clusters, set_cluster_context" ] }, { @@ -33,8 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "#Set region \n", - "region = \"us-west-2\"" + "list_clusters(region='us-east-2')" ] }, { @@ -44,8 +35,8 @@ "metadata": {}, "outputs": [], "source": [ - "# choose the HP cluster user works on\n", - "HyperPodManager.set_context('sagemaker-hyperpod-eks-cluster-demo-05-01', region=region)" + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -64,10 +55,10 @@ "outputs": [], "source": [ "# Import the helper module\n", - "from jumpstart_public_hub_visualization_utils import get_all_public_hub_model_data\n", + "from sagemaker.hyperpod.inference.jumpstart_public_hub_visualization_utils import get_all_public_hub_model_data\n", "\n", "# Load and display SageMaker public hub models\n", - "get_all_public_hub_model_data(region=\"us-west-2\")" + "get_all_public_hub_model_data(region=\"us-east-2\")" ] }, { @@ -95,6 +86,7 @@ "source": [ "from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Model, Server,SageMakerEndpoint, TlsConfig, EnvironmentVariables\n", "from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint\n", + "from sagemaker.hyperpod.common.config.metadata import Metadata\n", "import yaml\n", "import time" ] @@ -114,23 +106,25 @@ "metadata": {}, "outputs": [], "source": [ + "# If you don't set metadata name, it will be default to endpoint name\n", + "# If you don't set namespace, it will be default to \"default\"\n", + "metadata=Metadata(name='', namespace='')\n", + "\n", "# create 
configs\n", "model=Model(\n", - " model_id='deepseek-llm-r1-distill-qwen-1-5b',\n", - " model_version='2.0.4',\n", + " model_id='deepseek-llm-r1-distill-qwen-1-5b'\n", ")\n", "server=Server(\n", " instance_type='ml.g5.8xlarge',\n", ")\n", - "endpoint_name=SageMakerEndpoint(name='deepsek7bsme-testing-jumpstart-7-1')\n", - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2')\n", + "endpoint_name=SageMakerEndpoint(name='')\n", "\n", "# create spec\n", "js_endpoint=HPJumpStartEndpoint(\n", + " metadata=metadata,\n", " model=model,\n", " server=server,\n", - " sage_maker_endpoint=endpoint_name,\n", - " tls_config=tls_config,\n", + " sage_maker_endpoint=endpoint_name\n", ")" ] }, @@ -230,7 +224,7 @@ "outputs": [], "source": [ "# output is similar to kubectl describe jumpstartmodel\n", - "endpoint = HPJumpStartEndpoint.get(name='deepseek-llm-r1-distill-qwen-1-5b')\n", + "endpoint = HPJumpStartEndpoint.get(name='')\n", "print_yaml(endpoint)" ] }, @@ -265,10 +259,7 @@ "outputs": [], "source": [ "# get operator logs\n", - "print(js_endpoint.get_operator_logs(since_hours=1))\n", - "\n", - "# get specific pod log\n", - "# js_endpoint.get_logs(pod='pod-name')" + "print(js_endpoint.get_operator_logs(since_hours=0.1))" ] }, { diff --git a/examples/inference/SDK/inference-s3-model-e2e.ipynb b/examples/inference/SDK/inference-s3-model-e2e.ipynb index 2c41a11d..b57d0fc6 100644 --- a/examples/inference/SDK/inference-s3-model-e2e.ipynb +++ b/examples/inference/SDK/inference-s3-model-e2e.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "625ebf46", + "metadata": {}, + "source": [ + "## Inference Operator PySDK E2E Expereience (S3 custom model)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -7,10 +15,19 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager\n", - "\n", - "HyperPodManager.list_clusters(region='us-east-2')\n", - "HyperPodManager.set_context('', region='us-east-2')" + "from sagemaker.hyperpod import list_clusters, set_cluster_context\n", + "list_clusters(region='us-east-2')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14cd61ab", + "metadata": {}, + "outputs": [], + "source": [ + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -20,8 +37,9 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, S3Storage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker\n", + "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, AutoScalingSpec, Metrics, S3Storage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker\n", "from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint\n", + "from sagemaker.hyperpod.common.config.metadata import Metadata \n", "import yaml\n", "import time" ] @@ -33,13 +51,17 @@ "metadata": {}, "outputs": [], "source": [ - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", + "# If you don't set metadata name, it will be default to endpoint name\n", + "# If you don't set namespace, it will be default to \"default\"\n", + "metadata=Metadata(name='', namespace='')\n", + "\n", + 
"tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "model_source_config = ModelSourceConfig(\n", " model_source_type='s3',\n", - " model_location=\"\",\n", + " model_location=\"\",\n", " s3_storage=S3Storage(\n", - " bucket_name='',\n", + " bucket_name='',\n", " region='us-east-2',\n", " ),\n", ")\n", @@ -63,35 +85,7 @@ " limits={\"nvidia.com/gpu\": 1}\n", " ),\n", " environment_variables=environment_variables,\n", - ")\n", - "\n", - "# Create dimensions\n", - "dimensions = [\n", - " Dimensions(name=\"EndpointName\", value=\"\"),\n", - " Dimensions(name=\"VariantName\", value=\"AllTraffic\")\n", - "]\n", - "\n", - "# Create CloudWatch trigger\n", - "cloudwatch_trigger = CloudWatchTrigger(\n", - " dimensions=dimensions,\n", - " metric_collection_period=30,\n", - " metric_name=\"Invocations\",\n", - " metric_stat=\"Sum\",\n", - " metric_type=\"Average\",\n", - " min_value=0.0,\n", - " name=\"SageMaker-Invocations\",\n", - " namespace=\"AWS/SageMaker\",\n", - " target_value=10,\n", - " use_cached_metrics=False\n", - ")\n", - "\n", - "# Create autoscaling spec\n", - "auto_scaling_spec = AutoScalingSpec(\n", - " cloud_watch_trigger=cloudwatch_trigger\n", - ")\n", - "\n", - "# Create metrics\n", - "metrics = Metrics(enabled=True)" + ")" ] }, { @@ -102,14 +96,13 @@ "outputs": [], "source": [ "s3_endpoint = HPEndpoint(\n", - " endpoint_name='s3-test-endpoint-name',\n", + " metadata=metadata,\n", + " endpoint_name='',\n", " instance_type='ml.g5.8xlarge',\n", " model_name='deepseek15b-test-model-name', \n", " tls_config=tls_config,\n", " model_source_config=model_source_config,\n", " worker=worker,\n", - " auto_scaling_spec=auto_scaling_spec,\n", - " metrics=metrics,\n", ")" ] }, @@ -120,7 +113,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_endpoint.create(debug=True)" + "s3_endpoint.create()" ] }, { @@ -193,7 +186,17 @@ "outputs": [], "source": [ "endpoint_list = HPEndpoint.list()\n", - "print_yaml(endpoint_list[1])" + "print_yaml(endpoint_list[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "660e8d47", + "metadata": {}, + "outputs": [], + "source": [ + "s3_endpoint = HPEndpoint.get(name='')" ] }, { @@ -206,10 +209,7 @@ "outputs": [], "source": [ "# get operator logs\n", - "print(s3_endpoint.get_operator_logs(since_hours=0.5))\n", - "\n", - "# get specific pod log\n", - "# js_endpoint.get_logs(pod='pod-name')" + "print(s3_endpoint.get_operator_logs(since_hours=0.1))" ] }, { diff --git a/examples/training/CLI/training-e2e-cli.ipynb b/examples/training/CLI/training-e2e-cli.ipynb index 9a915769..9791c52e 100644 --- a/examples/training/CLI/training-e2e-cli.ipynb +++ b/examples/training/CLI/training-e2e-cli.ipynb @@ -4,7 +4,9 @@ "cell_type": "markdown", "id": "2d275612", "metadata": {}, - "source": "## Training Operator CLI E2E Experience " + "source": [ + "## Training Operator CLI E2E Experience " + ] }, { "cell_type": "markdown", @@ -19,25 +21,48 @@ { "cell_type": "code", "execution_count": null, - "id": "b30debba", + "id": "9df747dbfa211453", + "metadata": {}, + "outputs": [], + "source": [ + "!hyp list-cluster --output table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8db986d2b42a9e88", "metadata": {}, "outputs": [], - "source": "!hyperpod get-clusters" + "source": [ + "!hyp set-cluster-context --cluster-name " + ] }, { + "cell_type": "code", + "execution_count": null, + "id": "ba996d7dc8e128d5", + "metadata": {}, + "outputs": [], + "source": [ + "#verify the cluster context\n", + "!hyp get-cluster-context " + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "id": "a541575e45e68b3d", "metadata": { "jupyter": { "is_executing": true } }, - "cell_type": "code", + "outputs": [], "source": [ "# To verify the opinionated list of arguments\n", "!hyp create hyp-pytorch-job --help" - ], - "id": "a541575e45e68b3d", - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", @@ -46,6 +71,7 @@ "metadata": {}, "outputs": [], "source": [ + "#example command\n", "!hyp create hyp-pytorch-job \\\n", " --version 1.0 \\\n", " --job-name test-pytorch-job-cli \\\n", @@ -68,12 +94,24 @@ ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "19c32fa0", + "metadata": {}, "outputs": [], + "source": [ + "!hyp describe hyp-pytorch-job --job-name test-pytorch-job-cli" + ] + }, + { + "cell_type": "code", "execution_count": null, - "source": "!hyp describe hyp-pytorch-job --job-name test-pytorch-job-cli", - "id": "19c32fa0" + "id": "7d90c1ab", + "metadata": {}, + "outputs": [], + "source": [ + "!hyp get-operator-logs hyp-pytorch-job --since-hours 0.5" + ] }, { "cell_type": "code", @@ -81,7 +119,9 @@ "id": "dca0cb1f", "metadata": {}, "outputs": [], - "source": "!hyp list-pods hyp-pytorch-job --job-name test-pytorch-job-cli" + "source": [ + "!hyp list-pods hyp-pytorch-job --job-name test-pytorch-job-cli" + ] }, { "cell_type": "code", @@ -89,7 +129,9 @@ "id": "64ae67bf", "metadata": {}, "outputs": [], - "source": "!hyp get-logs hyp-pytorch-job --pod-name test-pytorch-job-cli-pod-0 --job-name test-pytorch-job-cli" + "source": [ + "!hyp get-logs hyp-pytorch-job --pod-name test-pytorch-job-cli-pod-0 --job-name test-pytorch-job-cli" + ] }, { "cell_type": "code", @@ -97,7 +139,9 @@ "id": "fcf2161f", "metadata": {}, "outputs": [], - "source": "!hyp delete hyp-pytorch-job --job-name test-pytorch-job-cli\n" + "source": [ + "!hyp delete hyp-pytorch-job --job-name test-pytorch-job-cli\n" + ] } ], "metadata": { diff --git a/examples/training/CLI/training-init-experience.ipynb b/examples/training/CLI/training-init-experience.ipynb new file mode 100644 index 00000000..4600f367 --- /dev/null +++ b/examples/training/CLI/training-init-experience.ipynb @@ -0,0 +1,302 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker HyperPod Pytorch Job - Init Experience\n", + "\n", + "This notebook demonstrates the complete end-to-end workflow for creating a SageMaker HyperPod Pytorch Job using the HyperPod CLI. The init experience provides a guided approach to create Hyperpod Pytorch Job with validation and configuration management.\n", + "\n", + "## Prerequisites\n", + "\n", + "- SageMaker HyperPod CLI installed (`pip install sagemaker-hyperpod`)\n", + "- Hyperpod pytorch job template installed (`pip install hyperpod-pytorch-job-template`)\n", + "- Hyperpod training operator installed in your hyperpod cluster\n", + "- Python 3.8+ environment\n", + "\n", + "## Workflow Overview\n", + "\n", + "1. **Initialize** - Create initial pytorch job configuration\n", + "2. **Configure** - Customize pytorch job parameters\n", + "3. **Validate** - Verify configuration before deployment\n", + "4. **Create** - Deploy the pytorch job creation\n", + "5. 
**Monitor** - Check pytorch job status and manage lifecycle\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 0: Connect to your Hyperpod cluster\n", + "\n", + "Make sure you have installed hyperpod training operator in your hyperpod cluster.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all available SageMaker HyperPod clusters in your account\n", + "!hyp list-cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure your local kubectl environment to interact with a specific SageMaker HyperPod cluster (and namespace)\n", + "!hyp set-cluster-context --cluster-name ml-cluster-integ-test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Initialize Pytorch Job Configuration\n", + "\n", + "The `hyp init hyp-pytorch-job` command creates a new configuration template with default settings. This generates a `config.yaml` file that serves as the foundation for your deployment.\n", + "\n", + "**What this does:**\n", + "- Creates a `config.yaml` with default pytorch job settings.\n", + "- Creates a `k8s.jinja` which is a reference to the k8s payload that is going to be submitted with. Users can refer this to understand how the parameters are being used. \n", + "- Creates a `README.md` which is a detailed explanation of the init experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize a new pytorch job configuration in the current directory\n", + "!hyp init hyp-pytorch-job" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Configure Pytorch Job Settings\n", + "\n", + "The `hyp configure` command allows you to customize your pytorch job configuration.\n", + "\n", + "**Key configuration options:**\n", + "- **job_name**: Job name\n", + "- **image**: Docker image for training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!hyp configure --job-name my-pytorch-job" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Current Configuration\n", + "\n", + "Let's examine the generated configuration to understand what will be deployed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the current configuration\n", + "!cat config.yaml | head -50" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Validate Configuration\n", + "\n", + "The `hyp validate` command performs syntax validation of your pytorch job configuration before deployment. This helps catch configuration errors early and ensures all prerequisites are met.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Validate the pytorch job configuration\n", + "# This checks for potential issues before deployment\n", + "!hyp validate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Reset Configuration (Optional)\n", + "\n", + "The `hyp reset` command allows you to reset your configuration to defaults or clean up any partial deployments. 
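For example, switching to a fresh configuration might look like the following sketch; the job name is a placeholder, and the exact reset behavior can be confirmed with `hyp reset --help`.

```bash
hyp reset                                    # clear the current pytorch job configuration
hyp init hyp-pytorch-job                     # re-scaffold the defaults if needed
hyp configure --job-name my-pytorch-job-v2   # illustrative new job name
hyp validate                                 # re-check before creating
```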
This is useful when you want to start fresh or if validation reveals issues that require a clean slate.\n", + "\n", + "**Use cases for reset:**\n", + "- Starting over with a clean configuration\n", + "- Cleaning up after failed deployments\n", + "- Switching between different pytorch job configurations\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Reset configuration if needed (uncomment to use)\n", + "# !hyp reset\n", + "\n", + "print(\"Reset command available if configuration changes are needed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Create the Pytorch Job\n", + "\n", + "The `hyp create` command deploys your HyperPod pytorch job with configurations in the config.yaml. A timestamped folder is created in the `runs` folder, where the config.yaml and the values-injected k8s.yaml kubernates payload is saved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the pytorch job\n", + "!hyp create" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Monitor Pytorch Job Creation\n", + "\n", + "While the pytorch job is being created, you can monitor its progress using the describe and list commands. These provide real-time status updates on the deployment process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check pytorch job creation status\n", + "import time\n", + "\n", + "print(\"Monitoring pytorch job progress...\")\n", + "for i in range(5):\n", + " print(f\"\\n--- Status Check {i+1} ---\")\n", + " !hyp describe hyp-pytorch-job --name my-pytorch-job\n", + " time.sleep(30) # Wait 30 seconds between checks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Describe Pytorch Job\n", + "\n", + "The `hyp describe hyp-pytorch-job` command provides detailed information about your pytorch job deployment status and sagemaker pytorch job status." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get detailed information about the pytorch job\n", + "!hyp describe hyp-pytorch-job --name my-pytorch-job" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: List All Pytorch Jobs\n", + "\n", + "The `hyp list hyp-pytorch-job` command shows all HyperPod pytorch jobs in your account. This is useful for managing multiple pytorch job deployments and getting an overview of your deployments.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List all pytorch jobs in your account\n", + "!hyp list hyp-pytorch-job" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "After successfully creating your HyperPod Pytorch Job, you can:\n", + "\n", + "1. **Monitor Resources**: Check pod status with `hyp list-pods hyp-pytorch-job`\n", + "2. 
**Access Logs**: View pod logs with `hyp get-logs hyp-pytorch-job`\n", + "\n", + "\n", + "## Troubleshooting\n", + "\n", + "If you encounter issues during Pytorch Job creation:\n", + "\n", + "- Use `hyp get-operator-logs hyp-pytorch-job` to check potential operator log errors\n", + "- Verify AWS credentials and permissions\n", + "- Ensure resource quotas are sufficient\n", + "- Review the configuration file for syntax errors\n", + "- Use `hyp validate` to identify configuration issues\n", + "\n", + "## Cleanup\n", + "\n", + "To avoid ongoing charges, remember to delete your pytorch job when no longer needed:\n", + "\n", + "```bash\n", + "hyp delete hyp-pytorch-job --name my-pytorch-job\n", + "```\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/training/SDK/training_sdk_example.ipynb b/examples/training/SDK/training_sdk_example.ipynb index 009dccf2..027b1b2f 100644 --- a/examples/training/SDK/training_sdk_example.ipynb +++ b/examples/training/SDK/training_sdk_example.ipynb @@ -129,6 +129,25 @@ "print(pytorch_job.get_logs_from_pod(\"demo-pod-0\"))" ] }, + { + "cell_type": "markdown", + "id": "49edfbb1", + "metadata": {}, + "source": [ + "### Get training operator logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f4fb64e", + "metadata": {}, + "outputs": [], + "source": [ + "# get operator logs\n", + "print(pytorch_job.get_operator_logs(since_hours=0.1))" + ] + }, { "cell_type": "markdown", "id": "3b0e4b5d", diff --git a/helm_chart/HyperPodHelmChart/Chart.yaml b/helm_chart/HyperPodHelmChart/Chart.yaml index ede7fff9..35d36a39 100644 --- a/helm_chart/HyperPodHelmChart/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/Chart.yaml @@ -24,9 +24,14 @@ version: 0.1.0 appVersion: "1.16.0" dependencies: + - name: cert-manager + version: "v1.18.2" + repository: oci://quay.io/jetstack/charts + condition: cert-manager.enabled - name: training-operators version: "0.1.0" repository: "file://charts/training-operators" + condition: trainingOperators.enabled - name: mlflow version: "0.1.0" repository: "file://charts/mlflow" @@ -36,7 +41,7 @@ dependencies: repository: https://nvidia.github.io/k8s-device-plugin condition: nvidia-device-plugin.devicePlugin.enabled - name: aws-efa-k8s-device-plugin - version: "0.5.3" + version: "0.5.10" repository: https://aws.github.io/eks-charts/ condition: aws-efa-k8s-device-plugin.devicePlugin.enabled - name: neuron-device-plugin diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent-0.1.0.tgz b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent-0.1.0.tgz new file mode 100644 index 00000000..b8792797 Binary files /dev/null and b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent-0.1.0.tgz differ diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/Chart.yaml index 0e38bdd5..e93502a5 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 name: health-monitoring-agent version: 0.1.0 -appVersion: 
1.0 +appVersion: "1.0" description: A Helm chart for setting up Hyperpod health-monitoring-agent related permissions diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl new file mode 100644 index 00000000..faec3ffb --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl @@ -0,0 +1,180 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "health-monitoring-agent.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "health-monitoring-agent.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "health-monitoring-agent.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "health-monitoring-agent.labels" -}} +helm.sh/chart: {{ include "health-monitoring-agent.chart" . }} +{{ include "health-monitoring-agent.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "health-monitoring-agent.selectorLabels" -}} +app.kubernetes.io/name: {{ include "health-monitoring-agent.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Generate the health monitoring agent image URI based on AWS region +*/}} +{{- define "health-monitoring-agent.imageUri" -}} +{{- $region := "" -}} +{{- $imageTag := .Values.imageTag | default "1.0.935.0_1.0.282.0" -}} + +{{/* Debug: Show image tag selection if debug is enabled */}} +{{- if .Values.debug -}} + {{/* DEBUG: Image tag selection - Values.imageTag: {{ .Values.imageTag | default "not set" }}, Final imageTag: {{ $imageTag }} */}} +{{- end -}} + +{{/* Try to get region from various sources in priority order */}} +{{- if .Values.region -}} + {{/* 1. Explicit region setting (highest priority) */}} + {{- $region = .Values.region -}} + {{- if .Values.debug -}} + {{/* DEBUG: Using explicit region setting: {{ $region }} */}} + {{- end -}} +{{- else if and .Values.global .Values.global.region -}} + {{/* 2. Global region setting */}} + {{- $region = .Values.global.region -}} + {{- if .Values.debug -}} + {{/* DEBUG: Using global region setting: {{ $region }} */}} + {{- end -}} +{{- else -}} + {{/* 3. Try to detect region from Kubernetes cluster context */}} + {{- $detectedRegion := "" -}} + {{- if .Values.debug -}} + {{/* DEBUG: Attempting automatic region detection... 
*/}} + {{- end -}} + + {{/* Note: cluster-info ConfigMap doesn't exist in EKS clusters, so we skip this method */}} + {{- if .Values.debug -}} + {{/* DEBUG: Skipping cluster-info ConfigMap lookup (not available in EKS clusters) */}} + {{- end -}} + + {{/* Try alternative method: look for AWS node info */}} + {{- if not $detectedRegion -}} + {{- if .Values.debug -}} + {{/* DEBUG: Trying to detect region from node labels... */}} + {{- end -}} + {{- $nodes := lookup "v1" "Node" "" "" -}} + {{- if $nodes -}} + {{- if .Values.debug -}} + {{/* DEBUG: Found {{ len $nodes.items }} nodes, checking labels... */}} + {{- end -}} + {{- range $nodes.items -}} + {{- if .metadata.labels -}} + {{/* Check for topology.kubernetes.io/region label */}} + {{- if index .metadata.labels "topology.kubernetes.io/region" -}} + {{- $detectedRegion = index .metadata.labels "topology.kubernetes.io/region" -}} + {{- if $.Values.debug -}} + {{/* DEBUG: Found region from topology.kubernetes.io/region label: {{ $detectedRegion }} */}} + {{- end -}} + {{- break -}} + {{- end -}} + {{/* Check for failure-domain.beta.kubernetes.io/region label (legacy) */}} + {{- if and (not $detectedRegion) (index .metadata.labels "failure-domain.beta.kubernetes.io/region") -}} + {{- $detectedRegion = index .metadata.labels "failure-domain.beta.kubernetes.io/region" -}} + {{- if $.Values.debug -}} + {{/* DEBUG: Found region from failure-domain.beta.kubernetes.io/region label: {{ $detectedRegion }} */}} + {{- end -}} + {{- break -}} + {{- end -}} + {{- end -}} + {{- end -}} + {{- else -}} + {{- if .Values.debug -}} + {{/* DEBUG: No nodes found for region detection */}} + {{- end -}} + {{- end -}} + {{- end -}} + + {{/* Use detected region or fall back to default */}} + {{- if $detectedRegion -}} + {{- $region = $detectedRegion -}} + {{- if .Values.debug -}} + {{/* DEBUG: Using detected region: {{ $region }} */}} + {{- end -}} + {{- else -}} + {{/* 4. 
Default fallback to us-east-1 */}} + {{- $region = "us-east-1" -}} + {{- if .Values.debug -}} + {{/* DEBUG: No region detected, using default fallback: {{ $region }} */}} + {{- end -}} + {{- end -}} +{{- end -}} + +{{/* Region to ECR account ID mapping */}} +{{- $regionAccountMap := dict + "us-east-1" "767398015722" + "us-west-2" "905418368575" + "us-east-2" "851725546812" + "us-west-1" "011528288828" + "eu-central-1" "211125453373" + "eu-north-1" "654654141839" + "eu-west-1" "533267293120" + "eu-west-2" "011528288831" + "ap-northeast-1" "533267052152" + "ap-south-1" "011528288864" + "ap-southeast-1" "905418428165" + "ap-southeast-2" "851725636348" + "sa-east-1" "025066253954" +-}} + +{{/* Get the account ID for the region, default to us-west-2 account if region not found */}} +{{- $accountId := index $regionAccountMap $region | default "767398015722" -}} + +{{/* Debug: Show final region and account mapping */}} +{{- if .Values.debug -}} + {{/* DEBUG: Final region: {{ $region }}, Account ID: {{ $accountId }} */}} +{{- end -}} + +{{/* Allow override of the full image URI if specified */}} +{{- if .Values.hmaimage -}} + {{- if .Values.debug -}} + {{/* DEBUG: Using override image URI: {{ .Values.hmaimage }} */}} + {{- end -}} + {{- .Values.hmaimage -}} +{{- else -}} + {{- $finalImageUri := printf "%s.dkr.ecr.%s.amazonaws.com/hyperpod-health-monitoring-agent:%s" $accountId $region $imageTag -}} + {{- if .Values.debug -}} + {{/* DEBUG: Generated image URI: {{ $finalImageUri }} */}} + {{- end -}} + {{- $finalImageUri -}} +{{- end -}} +{{- end }} diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml index 128a9533..c7bee94c 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml @@ -85,12 +85,6 @@ spec: - ml.g5.16xlarge - ml.g5.24xlarge - ml.g5.48xlarge - - ml.inf2.xlarge - - ml.inf2.8xlarge - - ml.inf2.24xlarge - - ml.inf2.48xlarge - - ml.trn1.32xlarge - - ml.trn1n.32xlarge - ml.g6.xlarge - ml.g6.2xlarge - ml.g6.4xlarge @@ -109,14 +103,14 @@ spec: - ml.g6e.12xlarge - ml.g6e.24xlarge - ml.g6e.48xlarge - - ml.trn2.48xlarge - ml.p6-b200.48xlarge + - ml.p6e-gb200.36xlarge containers: - name: health-monitoring-agent args: - --enable-k8s-exporter=false - --config.system-log-monitor=/config/system-message-monitor.json - image: {{ .Values.hmaimage }} + image: {{ include "health-monitoring-agent.imageUri" . 
}} resources: limits: cpu: 500m @@ -165,3 +159,93 @@ spec: operator: Exists - effect: NoExecute operator: Exists +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: health-monitoring-agent-non-nvidia + namespace: {{ .Values.namespace }} + labels: + app: health-monitoring-agent-non-nvidia +spec: + selector: + matchLabels: + app: health-monitoring-agent-non-nvidia + template: + metadata: + labels: + app: health-monitoring-agent-non-nvidia + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - ml.inf2.xlarge + - ml.inf2.8xlarge + - ml.inf2.24xlarge + - ml.inf2.48xlarge + - ml.trn1.32xlarge + - ml.trn1n.32xlarge + - ml.trn2.48xlarge + containers: + - name: health-monitoring-agent-non-nvidia + args: + - --enable-k8s-exporter=false + - --config.system-log-monitor=/config/system-message-monitor.json + image: {{ include "health-monitoring-agent.imageUri" . }} + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 1000 + runAsGroup: 2000 + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: NVIDIA_VISIBLE_DEVICES + value: "void" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "" + volumeMounts: + - name: log + mountPath: /var/log + - name: kmsg + mountPath: /dev/kmsg + readOnly: true + # Make sure node problem detector is in the same timezone + # with the host. + - name: localtime + mountPath: /etc/localtime + readOnly: true + serviceAccountName: health-monitoring-agent + volumes: + - name: log + # Config `log` to your system log directory + hostPath: + path: /var/log/ + - name: kmsg + hostPath: + path: /dev/kmsg + - name: localtime + hostPath: + path: /etc/localtime + tolerations: + - effect: NoSchedule + operator: Exists + - effect: NoExecute + operator: Exists \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml index 56287fd0..1f335b2d 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml @@ -1,2 +1,32 @@ namespace: "aws-hyperpod" -hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.552.0_1.0.161.0" + +# AWS region for the health monitoring agent ECR image +# The chart automatically detects the region from Kubernetes cluster context. +# Only specify this if you want to override the automatic detection. +# +# Automatic detection priority: +# 1. This explicit region setting (highest priority) +# 2. Global region setting (global.region) +# 3. Kubernetes cluster context detection: +# - EKS API server URL patterns +# - Node topology labels (topology.kubernetes.io/region) +# - AWS provider IDs in node specifications +# - Legacy region labels (failure-domain.beta.kubernetes.io/region) +# 4. 
Default fallback: us-east-1 +# +# Supported regions: us-east-1, us-west-2, us-east-2, us-west-1, eu-central-1, +# eu-north-1, eu-west-1, eu-west-2, ap-northeast-1, ap-south-1, ap-southeast-1, +# ap-southeast-2, sa-east-1 +region: "" + +# Image tag for health monitoring agent +# If not specified, uses global.imageTag or defaults to hardcoded version +imageTag: "" + +# Override the health monitoring agent image URI +# If specified, this will override the automatic region-based URI selection +# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0" +hmaimage: "" + +# Enable debug output for region selection process +debug: true diff --git a/helm_chart/HyperPodHelmChart/values.yaml b/helm_chart/HyperPodHelmChart/values.yaml index 9e4ba31a..7628c91c 100644 --- a/helm_chart/HyperPodHelmChart/values.yaml +++ b/helm_chart/HyperPodHelmChart/values.yaml @@ -2,6 +2,11 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. +# Global configuration +global: + # AWS region for all components (can be overridden per component) + region: "" + replicaCount: 1 image: @@ -110,6 +115,15 @@ namespace: create: true name: aws-hyperpod +cert-manager: + enabled: false + namespace: cert-manager + global: + leaderElection: + namespace: cert-manager + crds: + enabled: true + mlflow: enabled: false @@ -175,6 +189,8 @@ nvidia-device-plugin: - ml.p5.48xlarge - ml.p5e.48xlarge - ml.p5en.48xlarge + - ml.p6-b200.48xlarge + - ml.p6e-gb200.36xlarge tolerations: - key: nvidia.com/gpu operator: Exists @@ -192,6 +208,7 @@ aws-efa-k8s-device-plugin: devicePlugin: enabled: true supportedInstanceLabels: + # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types values: - ml.c5n.9xlarge - ml.c5n.18xlarge @@ -232,6 +249,8 @@ aws-efa-k8s-device-plugin: - ml.p5.48xlarge - ml.p5e.48xlarge - ml.p5en.48xlarge + - ml.p6-b200.48xlarge + - ml.p6e-gb200.36xlarge - ml.r7i.large - ml.r7i.xlarge - ml.r7i.2xlarge @@ -258,7 +277,9 @@ aws-efa-k8s-device-plugin: mpi-operator: enabled: true health-monitoring-agent: - enabled: true + enabled: true + # AWS region will be automatically detected or can be specified + # region: "us-east-1" deep-health-check: enabled: true job-auto-restart: diff --git a/helm_chart/get_helm.sh b/helm_chart/get_helm.sh index 2292b70e..20ac9975 100755 --- a/helm_chart/get_helm.sh +++ b/helm_chart/get_helm.sh @@ -274,7 +274,7 @@ help () { echo "Accepted cli arguments are:" echo -e "\t[--help|-h ] ->> prints this help" echo -e "\t[--version|-v ] . When not defined it fetches the latest release from GitHub" - echo -e "\te.g. --version v3.0.0 or -v canary" + echo -e "\te.g. --version v3.0.2 or -v canary" echo -e "\t[--no-sudo] ->> install without sudo" } @@ -310,7 +310,7 @@ while [[ $# -gt 0 ]]; do export DESIRED_VERSION="v${1}" fi else - echo -e "Please provide the desired version. e.g. --version v3.0.0 or -v canary" + echo -e "Please provide the desired version. e.g. 
--version v3.0.2 or -v canary" exit 0 fi ;; diff --git a/helm_chart/readme.md b/helm_chart/readme.md index b6a47b48..225d4858 100644 --- a/helm_chart/readme.md +++ b/helm_chart/readme.md @@ -33,6 +33,7 @@ More information about orchestration features for cluster admins [here](https:// | [Kubeflow Training Operator](https://www.kubeflow.org/docs/components/trainer/legacy-v1/overview/) | Installs operators for managing various machine learning training jobs, such as TensorFlow, PyTorch, and MXNet, providing native Kubernetes support for distributed training workloads. | | Yes | | HyperPod patching | Deploys the RBAC and controller resources needed for orchestrating rolling updates and patching workflows in SageMaker HyperPod clusters. Includes pod eviction and node monitoring. | HyperPod Resiliency | Yes | | hyperpod-inference-operator | Installs the HyperPod Inference Operator and its dependencies to the cluster, allowing cluster deployment and inferencing of JumpStart, s3-hosted, and FSx-hosted models | No | +| [cert-manager](https://github.com/cert-manager/cert-manager) | Automatically provisions and manages TLS certificates in Kubernetes clusters. Provides certificate lifecycle management including issuance, renewal, and revocation for secure communications. | [Hyperpod training operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator.html) | No | > **_Note_** The `mpijob` scheme is disabled in the Training Operator helm chart to avoid conflicting with the MPI Operator. @@ -48,6 +49,20 @@ storage: enabled: true ``` +To enable cert-manager for TLS certificate management, pass in `--set cert-manager.enabled=true` when installing or upgrading the main chart, or set the following in the values.yaml file: +``` +cert-manager: + enabled: true + namespace: cert-manager + global: + leaderElection: + namespace: cert-manager + crds: + enabled: true +``` +`namespace` specifies the namespace into which cert-manager will be installed. + + --- The following plugins are only required for HyperPod Resiliency if you are using the following supported devices, such as GPU/Neuron instances, unless you install these plugins on your own. @@ -169,21 +184,69 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system ## 6. Notes - Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases -- If you intend to use the Health Monitoring Agent container image from another region, please see below list to find relevant region's URI. 
- ``` - IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 +- The Health Monitoring Agent now automatically selects the correct container image URI based on your AWS region. The Helm chart intelligently detects the region from your Kubernetes cluster context. + +- **Intelligent Region Detection**: The chart automatically detects your AWS region using multiple methods: + 1. **Explicit region setting** (highest priority): `--set health-monitoring-agent.region=us-east-1` + 2. **Global region setting**: `--set global.region=us-east-1` + 3. **Kubernetes cluster context detection**: Automatically extracts region from: + - EKS API server URL patterns + - Node topology labels (`topology.kubernetes.io/region`) + - AWS provider IDs in node specifications + - Legacy region labels (`failure-domain.beta.kubernetes.io/region`) + 4. 
**Default fallback region**: us-east-1 + +- **Manual Region Override**: If needed, you can still specify a region manually: + ```bash + helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.region=us-west-2 + ``` + +- **Debug Mode**: Enabled by default, to troubleshoot region detection and image selection: + ```bash + # Disable debug mode during installation + helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.debug=false + + # Or upgrade existing installation with debug disabled + helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.debug=false + ``` + +- **Viewing Debug Information**: When debug mode is enabled, detailed information is stored in a ConfigMap: + ```bash + # View debug information (clean output) + kubectl get configmap health-monitoring-agent-debug -n aws-hyperpod -o jsonpath='{.data.debug-info\.txt}' + + # View full ConfigMap details + kubectl get configmap health-monitoring-agent-debug -n aws-hyperpod -o yaml + ``` + +- **Debug Information Includes**: + - Image tag selection process (component-specific settings) + - Region detection methods attempted (EKS API server URL, node labels) + - Number of nodes found and labels checked + - Final region determination and account ID mapping + - Generated image URI + - Timestamp of debug information generation + +- **Custom Image Override**: For advanced use cases, you can still override the image URI completely: + ```bash + helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.hmaimage="" + ``` + +- **Supported Regions and their ECR URIs**: + ``` + us-east-1 (US East (N. Virginia)): 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + us-west-2 (US West (Oregon)): 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + us-east-2 (US East (Ohio)): 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + us-west-1 (US West (N. California)): 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + eu-central-1 (Europe (Frankfurt)): 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + eu-north-1 (Europe (Stockholm)): 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + eu-west-1 (Europe (Ireland)): 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + eu-west-2 (Europe (London)): 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + ap-northeast-1 (Asia Pacific (Tokyo)): 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + ap-south-1 (Asia Pacific (Mumbai)): 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + ap-southeast-1 (Asia Pacific (Singapore)): 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + ap-southeast-2 (Asia Pacific (Sydney)): 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 + sa-east-1 (South America (São Paulo)): 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.935.0_1.0.282.0 ``` ## 7. 
Troubleshooting diff --git a/hyperpod-cluster-stack-template/README.md b/hyperpod-cluster-stack-template/README.md new file mode 100644 index 00000000..3e05e263 --- /dev/null +++ b/hyperpod-cluster-stack-template/README.md @@ -0,0 +1,10 @@ + +# hyperpod-cluster-stack-template + +## Installation +`pip install hyperpod-cluster-stack-template` + +## Overview +This package provides the model and template for the cloudformation required for cluster stack creation . + + diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/__init__.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml new file mode 100644 index 00000000..f896f56b --- /dev/null +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml @@ -0,0 +1,1124 @@ +Description: Main Stack for EKS based HyperPod Cluster +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: General Settings + Parameters: + - ResourceNamePrefix + - Stage + - NodeRecovery + - Tags + - Label: + default: Networking + Parameters: + - CreateVPCStack + - VpcId + - VpcCIDR + - AvailabilityZoneIds + - CreateSecurityGroupStack + - SecurityGroupId + - SecurityGroupIds + - CreatePrivateSubnetStack + - PrivateSubnetIds + - EksPrivateSubnetIds + - NatGatewayIds + - PrivateRouteTableIds + - CreateS3EndpointStack + - Label: + default: Orchestration + Parameters: + - CreateEKSClusterStack + - EKSClusterName + - KubernetesVersion + - CreateHelmChartStack + - HelmRepoUrl + - HelmRepoPath + - HelmRelease + - Namespace + - HelmOperators + - Label: + default: Lifecycle Configuration + Parameters: + - CreateLifeCycleScriptStack + - CreateS3BucketStack + - S3BucketName + - GithubRawUrl + - OnCreatePath + - Label: + default: Permissions + Parameters: + - CreateSageMakerIAMRoleStack + - SageMakerIAMRoleName + - Label: + default: Storage + Parameters: + - CreateFsxStack + - FsxFileSystemId + - FsxSubnetId + - FsxAvailabilityZone + - StorageCapacity + - PerUnitStorageThroughput + - DataCompressionType + - FileSystemTypeVersion + - Label: + default: HyperPod Cluster + Parameters: + - CreateHyperPodClusterStack + - HyperPodClusterName + - Label: + default: Instance Groups + Parameters: + - InstanceGroupSettings1 + - InstanceGroupSettings2 + - InstanceGroupSettings3 + - InstanceGroupSettings4 + - InstanceGroupSettings5 + - InstanceGroupSettings6 + - InstanceGroupSettings7 + - InstanceGroupSettings8 + - InstanceGroupSettings9 + - InstanceGroupSettings10 + - InstanceGroupSettings11 + - InstanceGroupSettings12 + - InstanceGroupSettings13 + - InstanceGroupSettings14 + - InstanceGroupSettings15 + - InstanceGroupSettings16 + - InstanceGroupSettings17 + - InstanceGroupSettings18 + - InstanceGroupSettings19 + - InstanceGroupSettings20 + - Label: + default: Restricted Instance Groups + Parameters: + - RigSettings1 + - RigSettings2 + - RigSettings3 + - RigSettings4 + - RigSettings5 + - RigSettings6 + - RigSettings7 + - RigSettings8 + - RigSettings9 + - RigSettings10 + - RigSettings11 + - RigSettings12 + - RigSettings13 + - RigSettings14 + - RigSettings15 + - RigSettings16 + - RigSettings17 + - RigSettings18 + - RigSettings19 + - RigSettings20 + ParameterLabels: + ResourceNamePrefix: + default: Resource Name Prefix + Stage: + default: Deployment Stage + 
NodeRecovery: + default: Instance Recovery + Tags: + default: Resource Tags + CreateVPCStack: + default: Create New VPC + VpcId: + default: Existing VPC ID + VpcCIDR: + default: VPC CIDR Range + AvailabilityZoneIds: + default: Availability Zone IDs + CreateSecurityGroupStack: + default: Create New Security Group + SecurityGroupId: + default: Existing Security Group ID + SecurityGroupIds: + default: Security Group IDs + CreatePrivateSubnetStack: + default: Create Private Subnets + PrivateSubnetIds: + default: Private Subnet IDs + EksPrivateSubnetIds: + default: EKS Private Subnet IDs + NatGatewayIds: + default: NAT Gateway IDs + PrivateRouteTableIds: + default: Private Route Table IDs + CreateS3EndpointStack: + default: Create S3 Endpoint + CreateEKSClusterStack: + default: Create New EKS Cluster + EKSClusterName: + default: EKS Cluster Name + KubernetesVersion: + default: Kubernetes Version + CreateHelmChartStack: + default: Install Helm Charts + HelmRepoUrl: + default: Helm Repository URL + HelmRepoPath: + default: Helm Chart Path + HelmRelease: + default: Helm Release Name + Namespace: + default: Kubernetes Namespace + HelmOperators: + default: Enabled Operators + CreateLifeCycleScriptStack: + default: Create Lifecycle Scripts + CreateS3BucketStack: + default: Create New S3 Bucket + S3BucketName: + default: S3 Bucket Name + GithubRawUrl: + default: GitHub Raw URL + OnCreatePath: + default: OnCreate Script Path + CreateSageMakerIAMRoleStack: + default: Create New IAM Role + SageMakerIAMRoleName: + default: IAM Role Name + CreateFsxStack: + default: Create New FSx for Lustre File System + FsxFileSystemId: + default: Existing FSx File System ID + FsxSubnetId: + default: FSx Subnet ID + FsxAvailabilityZone: + default: FSx Availability Zone + StorageCapacity: + default: Storage Capacity (GB) + PerUnitStorageThroughput: + default: Per-unit Storage Throughput (MB/s/TiB) + DataCompressionType: + default: Compression Type + FileSystemTypeVersion: + default: Lustre Version + CreateHyperPodClusterStack: + default: Create HyperPod Cluster + HyperPodClusterName: + default: HyperPod Cluster Name +Parameters: + Stage: + Type: String + Default: prod + AllowedValues: + - gamma + - prod + Description: Deployment stage (gamma, prod) + EnableHPInferenceFeature: + Type: String + Default: 'false' + Description: Feature flag for enabling HP inference + CustomBucketName: + Type: String + Default: '' + Description: Custom S3 bucket name for templates + ResourceNamePrefix: + Type: String + Default: hyperpod-cli-integ-test + Description: Prefix to be used for all resources created by this template. + VpcCIDR: + Type: String + Default: 10.192.0.0/16 + Description: The IP range (CIDR notation) for the VPC. + AvailabilityZoneIds: + Type: String + Default: use2-az1,use2-az2,use2-az3 + Description: List of AZs to deploy subnets in (up to 5, comma separated) + NodeProvisioningMode: + Type: String + Default: Continuous + Description: The node provisioning mode + VpcId: + Type: String + Default: '' + Description: The ID of the VPC you wish to use if you do not want to create a new VPC. + NatGatewayIds: + Type: String + Default: '' + Description: Comma-separated list of NAT Gateway IDs to route internet bound traffic to from the newly created private subnets. + SecurityGroupId: + Type: String + Default: '' + Description: The ID of the security group associated with an existing EKS cluster. + KubernetesVersion: + Type: String + Default: '1.31' + Description: The Kubernetes version to use for the EKS cluster. 
+ EKSClusterName: + Type: String + Default: eks + Description: The name of the newly created of preexisting EKS cluster you wish to use. + EksPrivateSubnetIds: + Type: String + Default: '' + Description: Comma-delimited list of private subnet IDs for the EKS cluster + SecurityGroupIds: + Type: String + Default: '' + Description: The Id of your cluster security group. + PrivateRouteTableIds: + Type: String + Default: '' + Description: Comma-separated list of private route table IDs. + S3BucketName: + Type: String + Default: s3-bucket + Description: The name of the S3 bucket used to store the cluster lifecycle scripts. + GithubRawUrl: + Type: String + Default: >- + https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh + Description: The raw GitHub URL for the lifecycle script. + HelmRepoUrl: + Type: String + Default: https://github.com/aws/sagemaker-hyperpod-cli.git + Description: The URL of the Helm repo containing the HyperPod Helm chart. + HelmRepoPath: + Type: String + Default: helm_chart/HyperPodHelmChart + Description: The path to the HyperPod Helm chart in the Helm repo. + HelmOperators: + Type: String + Default: 'mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true' + Description: The configuration of HyperPod Helm chart + Namespace: + Type: String + Default: kube-system + Description: The namespace to deploy the HyperPod Helm chart into. + HelmRelease: + Type: String + Default: dependencies + Description: The name of the Helm release. + HyperPodClusterName: + Type: String + Default: hyperpod-cluster-integ-test + Description: Name of SageMaker HyperPod Cluster. + NodeRecovery: + Type: String + Default: Automatic + AllowedValues: + - Automatic + - None + Description: Specifies whether to enable or disable the automatic node recovery feature (Automatic or None). + SageMakerIAMRoleName: + Type: String + Default: iam-role + Description: The name of the IAM role that SageMaker will use to access the AWS resources on your behalf. + PrivateSubnetIds: + Type: String + Default: '' + Description: Comma-separated list of private subnet IDs for EKS cluster. + OnCreatePath: + Type: String + Default: sagemaker-hyperpod-eks-bucket + Description: >- + The file name of lifecycle script for the general purpose instance group. This script runs during cluster + creation. + InstanceGroupSettings1: + Type: String + Default: >- + [{"InstanceCount":1,"InstanceGroupName":"default","InstanceType":"ml.t3.medium","TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,"InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}] + Description: JSON array string containing instance group configurations. + RigS3BucketName: + Type: String + Default: '' + Description: The name of the S3 bucket for RIG resources + RigSettings1: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings2: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. 
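+  # Illustrative sketch only: each InstanceGroupSettingsN parameter takes a JSON array string of
+  # instance group objects. The shape below mirrors the InstanceGroupSettings1 default above;
+  # "worker-group" and the instance count are placeholder values.
+  # [{"InstanceCount":2,"InstanceGroupName":"worker-group","InstanceType":"ml.t3.medium",
+  #   "TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,
+  #   "InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}]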
+ RigSettings2: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings3: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings3: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings4: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings4: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings5: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings5: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings6: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings6: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings7: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings7: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings8: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings8: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings9: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings9: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings10: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings10: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings11: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings11: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings12: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings12: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings13: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings13: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings14: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings14: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings15: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. 
+ RigSettings15: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings16: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings16: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings17: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings17: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings18: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings18: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings19: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings19: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings20: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings20: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + Tags: + Type: String + Default: '[]' + Description: Custom tags for managing the SageMaker HyperPod cluster as an AWS resource. + FsxSubnetId: + Type: String + Default: '' + Description: The subnet id that will be used to create FSx + FsxAvailabilityZone: + Type: String + Default: use2-az2 + Description: The availability zone to get subnet id that will be used to create FSx + PerUnitStorageThroughput: + Type: Number + Default: 250 + Description: Per unit storage throughput for the FSx file system + DataCompressionType: + Type: String + Default: NONE + AllowedValues: + - NONE + - LZ4 + Description: Data compression type for the FSx file system (NONE, LZ4) + FileSystemTypeVersion: + Type: Number + Default: 2.15 + Description: File system type version for the FSx file system + StorageCapacity: + Type: Number + Default: 1200 + Description: Storage capacity for the FSx file system in GiB + FsxFileSystemId: + Type: String + Default: '' + Description: Existing FSx for Lustre file system + CreateVPCStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create VPC Stack + CreatePrivateSubnetStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Private Subnet Stack + CreateSecurityGroupStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Security Group Stack + CreateEKSClusterStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create EKS Cluster Stack + CreateS3BucketStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create S3 Bucket Stack + CreateS3EndpointStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create S3 Endpoint Stack + CreateLifeCycleScriptStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Life Cycle Script Stack + CreateSageMakerIAMRoleStack: + Type: 
String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create SageMaker IAM Role Stack + CreateHelmChartStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Helm Chart Stack + CreateHyperPodClusterStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create HyperPod Cluster Stack + CreateFsxStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create FSx for Lustre File System Stack +Conditions: + CreateVPCStackCondition: + Fn::Equals: + - Ref: CreateVPCStack + - 'true' + CreatePrivateSubnetStackCondition: + Fn::Equals: + - Ref: CreatePrivateSubnetStack + - 'true' + CreateSecurityGroupStackCondition: + Fn::Equals: + - Ref: CreateSecurityGroupStack + - 'true' + CreateEKSClusterStackCondition: + Fn::Equals: + - Ref: CreateEKSClusterStack + - 'true' + CreateS3BucketStackCondition: + Fn::Equals: + - Ref: CreateS3BucketStack + - 'true' + CreateS3EndpointStackCondition: + Fn::Equals: + - Ref: CreateS3EndpointStack + - 'true' + CreateLifeCycleScriptStackCondition: + Fn::Equals: + - Ref: CreateLifeCycleScriptStack + - 'true' + CreateSageMakerIAMRoleStackCondition: + Fn::Equals: + - Ref: CreateSageMakerIAMRoleStack + - 'true' + CreateHelmChartStackCondition: + Fn::Equals: + - Ref: CreateHelmChartStack + - 'true' + CreateHyperPodClusterStackCondition: + Fn::And: + - Fn::Equals: + - Ref: CreateHyperPodClusterStack + - 'true' + - Fn::Not: + - Fn::And: + - Fn::Equals: + - Ref: CreateEKSClusterStack + - 'true' + - Fn::Equals: + - Ref: CreateHelmChartStack + - 'false' + CreateFsxStackCondition: + Fn::Equals: + - Ref: CreateFsxStack + - 'true' +Resources: + VPCStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/vpc-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcCIDR: + Ref: VpcCIDR + AvailabilityZoneIds: + Fn::Join: + - ',' + - - Ref: AvailabilityZoneIds + - ',,,' + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/VPCStack + Condition: CreateVPCStackCondition + PrivateSubnetStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/private-subnet-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + VpcCidrBlock: + Ref: VpcCIDR + AvailabilityZoneIds: + Fn::Join: + - ',' + - - Ref: AvailabilityZoneIds + - ',,,' + NatGatewayIds: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.NatGatewayIds + - Ref: NatGatewayIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/PrivateSubnetStack + Condition: CreatePrivateSubnetStackCondition + SecurityGroupStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/security-group-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + SecurityGroupId: + Ref: SecurityGroupId + Metadata: + aws:cdk:path: 
MainEksBasedCfnTemplate/SecurityGroupStack + Condition: CreateSecurityGroupStackCondition + EKSClusterStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/eks-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + KubernetesVersion: + Ref: KubernetesVersion + EKSClusterName: + Ref: EKSClusterName + EksPrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.EksPrivateSubnetIds + - Ref: PrivateSubnetIds + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/EKSClusterStack + Condition: CreateEKSClusterStackCondition + S3BucketStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/s3-bucket-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/S3BucketStack + Condition: CreateS3BucketStackCondition + S3EndpointStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/s3-endpoint-template.yaml + Parameters: + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + PrivateRouteTableIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateRouteTableIds + - Ref: PrivateRouteTableIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/S3EndpointStack + Condition: CreateS3EndpointStackCondition + LifeCycleScriptStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/lifecycle-script-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/LifeCycleScriptStack + Condition: CreateLifeCycleScriptStackCondition + SageMakerIAMRoleStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/sagemaker-iam-role-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/SageMakerIAMRoleStack + Condition: CreateSageMakerIAMRoleStackCondition + HelmChartStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/helm-chart-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmRepoUrl: + Ref: HelmRepoUrl + HelmRepoPath: 
+ Ref: HelmRepoPath + Namespace: + Ref: Namespace + HelmRelease: + Ref: HelmRelease + HelmOperators: + Ref: HelmOperators + CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/HelmChartStack + Condition: CreateHelmChartStackCondition + HyperPodClusterStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/hyperpod-cluster-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmChartStatus: + Fn::If: + - CreateHelmChartStackCondition + - Fn::GetAtt: + - HelmChartStack + - Outputs.HelmChartDeploymentComplete + - HelmChartNotRequired + HyperPodClusterName: + Ref: HyperPodClusterName + NodeRecovery: + Ref: NodeRecovery + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + PrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + - Ref: PrivateSubnetIds + CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + SageMakerIAMRoleName: + Fn::If: + - CreateSageMakerIAMRoleStackCondition + - Fn::GetAtt: + - SageMakerIAMRoleStack + - Outputs.SageMakerIAMRoleName + - Ref: SageMakerIAMRoleName + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + OnCreatePath: + Fn::If: + - CreateS3BucketStackCondition + - on_create.sh + - Ref: OnCreatePath + InstanceGroupSettings1: + Ref: InstanceGroupSettings1 + InstanceGroupSettings2: + Ref: InstanceGroupSettings2 + InstanceGroupSettings3: + Ref: InstanceGroupSettings3 + InstanceGroupSettings4: + Ref: InstanceGroupSettings4 + InstanceGroupSettings5: + Ref: InstanceGroupSettings5 + InstanceGroupSettings6: + Ref: InstanceGroupSettings6 + InstanceGroupSettings7: + Ref: InstanceGroupSettings7 + InstanceGroupSettings8: + Ref: InstanceGroupSettings8 + InstanceGroupSettings9: + Ref: InstanceGroupSettings9 + InstanceGroupSettings10: + Ref: InstanceGroupSettings10 + InstanceGroupSettings11: + Ref: InstanceGroupSettings11 + InstanceGroupSettings12: + Ref: InstanceGroupSettings12 + InstanceGroupSettings13: + Ref: InstanceGroupSettings13 + InstanceGroupSettings14: + Ref: InstanceGroupSettings14 + InstanceGroupSettings15: + Ref: InstanceGroupSettings15 + InstanceGroupSettings16: + Ref: InstanceGroupSettings16 + InstanceGroupSettings17: + Ref: InstanceGroupSettings17 + InstanceGroupSettings18: + Ref: InstanceGroupSettings18 + InstanceGroupSettings19: + Ref: InstanceGroupSettings19 + InstanceGroupSettings20: + Ref: InstanceGroupSettings20 + RigSettings1: + Ref: RigSettings1 + RigSettings2: + Ref: RigSettings2 + RigSettings3: + Ref: RigSettings3 + RigSettings4: + Ref: RigSettings4 + RigSettings5: + Ref: RigSettings5 + RigSettings6: + Ref: RigSettings6 + RigSettings7: + Ref: RigSettings7 + RigSettings8: + Ref: RigSettings8 + RigSettings9: + Ref: RigSettings9 + RigSettings10: + Ref: RigSettings10 + RigSettings11: + Ref: 
RigSettings11 + RigSettings12: + Ref: RigSettings12 + RigSettings13: + Ref: RigSettings13 + RigSettings14: + Ref: RigSettings14 + RigSettings15: + Ref: RigSettings15 + RigSettings16: + Ref: RigSettings16 + RigSettings17: + Ref: RigSettings17 + RigSettings18: + Ref: RigSettings18 + RigSettings19: + Ref: RigSettings19 + RigSettings20: + Ref: RigSettings20 + Tags: + Ref: Tags + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/HyperPodClusterStack + Condition: CreateHyperPodClusterStackCondition + FsxStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/fsx-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmChartStatus: + Fn::If: + - CreateHelmChartStackCondition + - Fn::GetAtt: + - HelmChartStack + - Outputs.HelmChartDeploymentComplete + - HelmChartNotRequired + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + PrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + - Ref: PrivateSubnetIds + FsxSubnetId: + Ref: FsxSubnetId + FsxAvailabilityZone: + Ref: FsxAvailabilityZone + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + PerUnitStorageThroughput: + Ref: PerUnitStorageThroughput + DataCompressionType: + Ref: DataCompressionType + FileSystemTypeVersion: + Ref: FileSystemTypeVersion + StorageCapacity: + Ref: StorageCapacity + FsxFileSystemId: + Ref: FsxFileSystemId + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/FsxStack + Condition: CreateFsxStackCondition +Outputs: + OutputVpcId: + Value: + Fn::GetAtt: + - VPCStack + - Outputs.VpcId + Condition: CreateVPCStackCondition + OutputPrivateSubnetIds: + Value: + Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + Condition: CreatePrivateSubnetStackCondition + OutputSecurityGroupId: + Value: + Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + Condition: CreateSecurityGroupStackCondition + OutputEKSClusterArn: + Value: + Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterArn + Condition: CreateEKSClusterStackCondition + OutputEKSClusterName: + Value: + Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + Condition: CreateEKSClusterStackCondition + OutputSageMakerIAMRoleArn: + Value: + Fn::GetAtt: + - SageMakerIAMRoleStack + - Outputs.SageMakerIAMRoleArn + Condition: CreateSageMakerIAMRoleStackCondition + OutputS3BucketName: + Value: + Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + Condition: CreateS3BucketStackCondition + OutputHyperPodClusterName: + Value: + Fn::GetAtt: + - HyperPodClusterStack + - Outputs.HyperPodClusterName + Condition: CreateHyperPodClusterStackCondition + OutputHyperPodClusterArn: + Value: + Fn::GetAtt: + - HyperPodClusterStack + - Outputs.HyperPodClusterArn + Condition: CreateHyperPodClusterStackCondition diff --git a/test/integration_tests/lifecycle_script/on_create_noop.sh b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/registry.py similarity index 67% rename from test/integration_tests/lifecycle_script/on_create_noop.sh rename to 
hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/registry.py index 85d7badc..ce75e692 100644 --- a/test/integration_tests/lifecycle_script/on_create_noop.sh +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/registry.py @@ -10,19 +10,13 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -#!/bin/bash +from hyperpod_cluster_stack_template.v1_0 import model as v1 +from hyperpod_cluster_stack_template.v1_0.template import TEMPLATE_CONTENT as v1_template -set -ex - -LOG_FILE="/var/log/provision/provisioning.log" -mkdir -p "/var/log/provision" -touch $LOG_FILE - -# Function to log messages -logger() { - echo "$@" | tee -a $LOG_FILE +SCHEMA_REGISTRY = { + "1.0": v1.ClusterStackBase } -logger "[start] on_create.sh" -logger "no more steps to run" -logger "[stop] on_create.sh" \ No newline at end of file +TEMPLATE_REGISTRY = { + "1.0": v1_template +} \ No newline at end of file diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/__init__.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py new file mode 100644 index 00000000..68ba347e --- /dev/null +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -0,0 +1,133 @@ +from pydantic import BaseModel, Field, field_validator +from typing import Optional, Literal, List, Any, Union +from sagemaker.hyperpod.common.utils import region_to_az_ids + +class ClusterStackBase(BaseModel): + resource_name_prefix: Optional[str] = Field("hyp-eks-stack", description="Prefix to be used for all resources. 
A 4-digit UUID will be added to prefix during submission") + create_hyperpod_cluster_stack: Optional[bool] = Field(True, description="Boolean to Create HyperPod Cluster Stack") + hyperpod_cluster_name: Optional[str] = Field("hyperpod-cluster", description="Name of SageMaker HyperPod Cluster") + create_eks_cluster_stack: Optional[bool] = Field(True, description="Boolean to Create EKS Cluster Stack") + kubernetes_version: Optional[str] = Field("1.31", description="The Kubernetes version") + eks_cluster_name: Optional[str] = Field("eks-cluster", description="The name of the EKS cluster") + create_helm_chart_stack: Optional[bool] = Field(True, description="Boolean to Create Helm Chart Stack") + namespace: Optional[str] = Field("kube-system", description="The namespace to deploy the HyperPod Helm chart") + helm_repo_url: str = Field("https://github.com/aws/sagemaker-hyperpod-cli.git", description="The URL of the Helm repo containing the HyperPod Helm chart (fixed default)") + helm_repo_path: str = Field("helm_chart/HyperPodHelmChart", description="The path to the HyperPod Helm chart in the Helm repo (fixed default)") + helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart") + helm_release: Optional[str] = Field("dependencies", description="The name used for Helm chart release") + node_provisioning_mode: Optional[str] = Field("Continuous", description="Enable or disable the continuous provisioning mode. Valid values: \"Continuous\" or leave empty") + node_recovery: Optional[str] = Field("Automatic", description="Specifies whether to enable or disable the automatic node recovery feature. Valid values: \"Automatic\", \"None\"") + instance_group_settings: Union[List[Any], None] = Field([{"InstanceCount":1,"InstanceGroupName":"default","InstanceType":"ml.t3.medium","TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,"InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}], description="List of string containing instance group configurations") + rig_settings: Union[List[Any], None] = Field(None, description="List of string containing restricted instance group configurations") + rig_s3_bucket_name: Optional[str] = Field(None, description="The name of the S3 bucket used to store the RIG resources") + tags: Union[List[Any], None] = Field(None, description="Custom tags for managing the SageMaker HyperPod cluster as an AWS resource") + create_vpc_stack: Optional[bool] = Field(True, description="Boolean to Create VPC Stack") + vpc_id: Optional[str] = Field(None, description="The ID of the VPC you wish to use if you do not want to create a new VPC") + vpc_cidr: Optional[str] = Field("10.192.0.0/16", description="The IP range (CIDR notation) for the VPC") + availability_zone_ids: Union[List[str], None] = Field(None, description="List of AZs in submission region to deploy subnets in. Must be provided in YAML format starting with \"-\" below. 
Example: - use2-az1 for us-east-2 region") + create_security_group_stack: Optional[bool] = Field(True, description="Boolean to Create Security Group Stack") + security_group_id: Optional[str] = Field(None, description="The ID of the security group you wish to use in SecurityGroup substack if you do not want to create a new one") + security_group_ids: Union[List[str], None] = Field(None, description="The security groups you wish to use for Hyperpod cluster if you do not want to create new ones") + private_subnet_ids: Union[List[str], None] = Field(None, description="List of private subnet IDs used for HyperPod cluster if you do not want to create VPC stack") + eks_private_subnet_ids: Union[List[str], None] = Field(None, description="List of private subnet IDs for the EKS cluster if you do not want to create VPC stack") + nat_gateway_ids: Union[List[str], None] = Field(None, description="List of NAT Gateway IDs to route internet bound traffic if you do not want to create VPC stack") + private_route_table_ids: Union[List[str], None] = Field(None, description="List of private route table IDs if you do not want to create VPC stack") + create_s3_endpoint_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Endpoint stack") + enable_hp_inference_feature: Optional[bool] = Field(False, description="Boolean to enable inference operator in Hyperpod cluster") + stage: Optional[str] = Field("prod", description="Deployment stage used in S3 bucket naming for inference operator. Valid values: \"gamma\", \"prod\"") + custom_bucket_name: str = Field("", description="Custom S3 bucket name for templates") + create_life_cycle_script_stack: Optional[bool] = Field(True, description="Boolean to Create Life Cycle Script Stack") + create_s3_bucket_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Bucket Stack") + s3_bucket_name: Optional[str] = Field("s3-bucket", description="The name of the S3 bucket used to store the cluster lifecycle scripts") + github_raw_url: str = Field("https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh", description="The raw GitHub URL for the lifecycle script (fixed default)") + on_create_path: Optional[str] = Field("sagemaker-hyperpod-eks-bucket", description="The file name of lifecycle script") + create_sagemaker_iam_role_stack: Optional[bool] = Field(True, description="Boolean to Create SageMaker IAM Role Stack") + sagemaker_iam_role_name: Optional[str] = Field("create-cluster-role", description="The name of the IAM role that SageMaker will use during cluster creation to access the AWS resources on your behalf") + create_fsx_stack: Optional[bool] = Field(True, description="Boolean to Create FSx Stack") + fsx_subnet_id: Optional[str] = Field("", description="The subnet id that will be used to create FSx") + fsx_availability_zone_id: Optional[str] = Field("", description="The availability zone to get subnet id that will be used to create FSx") + per_unit_storage_throughput: Optional[int] = Field(250, description="Per unit storage throughput") + data_compression_type: Optional[str] = Field("NONE", description="Data compression type for the FSx file system. 
Valid values: \"NONE\", \"LZ4\"") + file_system_type_version: Optional[float] = Field(2.15, description="File system type version for the FSx file system") + storage_capacity: Optional[int] = Field(1200, description="Storage capacity for the FSx file system in GiB") + fsx_file_system_id: Optional[str] = Field("", description="Existing FSx file system ID") + + @field_validator('kubernetes_version', mode='before') + @classmethod + def validate_kubernetes_version(cls, v): + if v is not None: + return str(v) + return v + + def to_config(self, region: str = None): + """Convert CLI model to SDK configuration for cluster stack creation. + + Transforms the CLI model instance into a configuration dictionary that can be used + to instantiate the HpClusterStack SDK class. Applies necessary transformations + including AZ configuration, UUID generation, and field restructuring. + + Args: + region (str, optional): AWS region for AZ configuration. If provided, + automatically sets availability_zone_ids and fsx_availability_zone_id + when not already specified. + + Returns: + dict: Configuration dictionary ready for HpClusterStack instantiation. + Contains all transformed parameters with defaults applied. + + Example: + >>> cli_model = ClusterStackBase(hyperpod_cluster_name="my-cluster") + >>> config = cli_model.to_config(region="us-west-2") + >>> sdk_instance = HpClusterStack(**config) + """ + import uuid + + # Convert model to dict and apply transformations + config = self.model_dump(exclude_none=True) + + # Prepare CFN arrays from numbered fields + instance_group_settings = [] + rig_settings = [] + for i in range(1, 21): + ig_key = f'instance_group_settings{i}' + rig_key = f'rig_settings{i}' + if ig_key in config: + instance_group_settings.append(config.pop(ig_key)) + if rig_key in config: + rig_settings.append(config.pop(rig_key)) + + # Add arrays to config + if instance_group_settings: + config['instance_group_settings'] = instance_group_settings + if rig_settings: + config['rig_settings'] = rig_settings + + # Add default AZ configuration if not provided + if region and (not config.get('availability_zone_ids') or not config.get('fsx_availability_zone_id')): + all_az_ids = region_to_az_ids(region) + default_az_config = { + 'availability_zone_ids': all_az_ids[:2], # First 2 AZs + 'fsx_availability_zone_id': all_az_ids[0] # First AZ + } + if not config.get('availability_zone_ids'): + config['availability_zone_ids'] = default_az_config['availability_zone_ids'] + if not config.get('fsx_availability_zone_id'): + config['fsx_availability_zone_id'] = default_az_config['fsx_availability_zone_id'] + + # Append 4-digit UUID to resource_name_prefix + if config.get('resource_name_prefix'): + config['resource_name_prefix'] = f"{config['resource_name_prefix']}-{str(uuid.uuid4())[:4]}" + + # Set fixed defaults + defaults = { + 'custom_bucket_name': '', + 'github_raw_url': 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh', + 'helm_repo_url': 'https://github.com/aws/sagemaker-hyperpod-cli.git', + 'helm_repo_path': 'helm_chart/HyperPodHelmChart' + } + + for key, default_value in defaults.items(): + if key not in config: + config[key] = default_value + + return config \ No newline at end of file diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json new file mode 
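Aside (not part of this diff): a minimal sketch of how the ClusterStackBase model and its to_config helper above are expected to be driven. The import path and the HpClusterStack/region_to_az_ids names follow the docstring and surrounding code and are assumptions, not verified APIs.

# Assumes the CLI model sits beside schema.json at v1_0/model.py.
from hyperpod_cluster_stack_template.v1_0.model import ClusterStackBase

cli_model = ClusterStackBase(hyperpod_cluster_name="my-cluster", kubernetes_version=1.31)
# validate_kubernetes_version runs in "before" mode, so the float 1.31 is coerced to "1.31".

config = cli_model.to_config(region="us-east-2")
# With no AZs supplied, availability_zone_ids defaults to the region's first two AZ IDs,
# fsx_availability_zone_id to the first one, and resource_name_prefix gains a 4-character
# UUID suffix, e.g. "hyp-eks-stack-1a2b". Numbered instance_group_settings1..20 keys,
# if present in the dump, are folded back into a single instance_group_settings list.

# stack = HpClusterStack(**config)  # per the docstring; the SDK class is not shown in this diff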
100644 index 00000000..6c9acc9e --- /dev/null +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json @@ -0,0 +1,638 @@ +{ + "properties": { + "resource_name_prefix": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "hyp-eks-stack", + "description": "Prefix to be used for all resources. A 4-digit UUID will be added to prefix during submission", + "title": "Resource Name Prefix" + }, + "create_hyperpod_cluster_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create HyperPod Cluster Stack", + "title": "Create Hyperpod Cluster Stack" + }, + "hyperpod_cluster_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "hyperpod-cluster", + "description": "Name of SageMaker HyperPod Cluster", + "title": "Hyperpod Cluster Name" + }, + "create_eks_cluster_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create EKS Cluster Stack", + "title": "Create Eks Cluster Stack" + }, + "kubernetes_version": { + "anyOf": [ + { + "type": "str" + }, + { + "type": "null" + } + ], + "default": "1.31", + "description": "The Kubernetes version", + "title": "Kubernetes Version" + }, + "eks_cluster_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "eks-cluster", + "description": "The name of the EKS cluster", + "title": "Eks Cluster Name" + }, + "create_helm_chart_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create Helm Chart Stack", + "title": "Create Helm Chart Stack" + }, + "namespace": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "kube-system", + "description": "The namespace to deploy the HyperPod Helm chart", + "title": "Namespace" + }, + "helm_repo_url": { + "default": "https://github.com/aws/sagemaker-hyperpod-cli.git", + "description": "The URL of the Helm repo containing the HyperPod Helm chart (fixed default)", + "title": "Helm Repo Url", + "type": "string" + }, + "helm_repo_path": { + "default": "helm_chart/HyperPodHelmChart", + "description": "The path to the HyperPod Helm chart in the Helm repo (fixed default)", + "title": "Helm Repo Path", + "type": "string" + }, + "helm_operators": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", + "description": "The configuration of HyperPod Helm chart", + "title": "Helm Operators" + }, + "helm_release": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "dependencies", + "description": "The name used for Helm chart release", + "title": "Helm Release" + }, + "node_provisioning_mode": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Continuous", + "description": "Enable or disable the continuous provisioning mode. 
Valid values: \"Continuous\" or leave empty", + "title": "Node Provisioning Mode" + }, + "node_recovery": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Automatic", + "description": "Specifies whether to enable or disable the automatic node recovery feature. Valid values: \"Automatic\", \"None\"", + "title": "Node Recovery" + }, + "instance_group_settings": { + "anyOf": [ + { + "items": {}, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [ + { + "InstanceCount": 1, + "InstanceGroupName": "default", + "InstanceType": "ml.t3.medium", + "TargetAvailabilityZoneId": "use2-az2", + "ThreadsPerCore": 1, + "InstanceStorageConfigs": [ + { + "EbsVolumeConfig": { + "VolumeSizeInGB": 500 + } + } + ] + } + ], + "description": "List of string containing instance group configurations", + "title": "Instance Group Settings" + }, + "rig_settings": { + "anyOf": [ + { + "items": {}, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of string containing restricted instance group configurations", + "title": "Rig Settings" + }, + "rig_s3_bucket_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The name of the S3 bucket used to store the RIG resources", + "title": "Rig S3 Bucket Name" + }, + "tags": { + "anyOf": [ + { + "items": {}, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Custom tags for managing the SageMaker HyperPod cluster as an AWS resource", + "title": "Tags" + }, + "create_vpc_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create VPC Stack", + "title": "Create Vpc Stack" + }, + "vpc_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The ID of the VPC you wish to use if you do not want to create a new VPC", + "title": "Vpc Id" + }, + "vpc_cidr": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "10.192.0.0/16", + "description": "The IP range (CIDR notation) for the VPC", + "title": "Vpc Cidr" + }, + "availability_zone_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of AZs in submission region to deploy subnets in. Must be provided in YAML format starting with \"-\" below. 
Example: - use2-az1 for us-east-2 region", + "title": "Availability Zone Ids" + }, + "create_security_group_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create Security Group Stack", + "title": "Create Security Group Stack" + }, + "security_group_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The ID of the security group you wish to use in SecurityGroup substack if you do not want to create a new one", + "title": "Security Group Id" + }, + "security_group_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The security groups you wish to use for Hyperpod cluster if you do not want to create new ones", + "title": "Security Group Ids" + }, + "private_subnet_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of private subnet IDs used for HyperPod cluster if you do not want to create VPC stack", + "title": "Private Subnet Ids" + }, + "eks_private_subnet_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of private subnet IDs for the EKS cluster if you do not want to create VPC stack", + "title": "Eks Private Subnet Ids" + }, + "nat_gateway_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of NAT Gateway IDs to route internet bound traffic if you do not want to create VPC stack", + "title": "Nat Gateway Ids" + }, + "private_route_table_ids": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of private route table IDs if you do not want to create VPC stack", + "title": "Private Route Table Ids" + }, + "create_s3_endpoint_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create S3 Endpoint stack", + "title": "Create S3 Endpoint Stack" + }, + "enable_hp_inference_feature": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Boolean to enable inference operator in Hyperpod cluster", + "title": "Enable Hp Inference Feature" + }, + "stage": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "prod", + "description": "Deployment stage used in S3 bucket naming for inference operator. 
Valid values: \"gamma\", \"prod\"", + "title": "Stage" + }, + "custom_bucket_name": { + "default": "", + "description": "Custom S3 bucket name for templates", + "title": "Custom Bucket Name", + "type": "string" + }, + "create_life_cycle_script_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create Life Cycle Script Stack", + "title": "Create Life Cycle Script Stack" + }, + "create_s3_bucket_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create S3 Bucket Stack", + "title": "Create S3 Bucket Stack" + }, + "s3_bucket_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "s3-bucket", + "description": "The name of the S3 bucket used to store the cluster lifecycle scripts", + "title": "S3 Bucket Name" + }, + "github_raw_url": { + "default": "https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh", + "description": "The raw GitHub URL for the lifecycle script (fixed default)", + "title": "Github Raw Url", + "type": "string" + }, + "on_create_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "sagemaker-hyperpod-eks-bucket", + "description": "The file name of lifecycle script", + "title": "On Create Path" + }, + "create_sagemaker_iam_role_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create SageMaker IAM Role Stack", + "title": "Create Sagemaker Iam Role Stack" + }, + "sagemaker_iam_role_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "create-cluster-role", + "description": "The name of the IAM role that SageMaker will use during cluster creation to access the AWS resources on your behalf", + "title": "Sagemaker Iam Role Name" + }, + "create_fsx_stack": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Boolean to Create FSx Stack", + "title": "Create Fsx Stack" + }, + "fsx_subnet_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "", + "description": "The subnet id that will be used to create FSx", + "title": "Fsx Subnet Id" + }, + "fsx_availability_zone_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "", + "description": "The availability zone to get subnet id that will be used to create FSx", + "title": "Fsx Availability Zone Id" + }, + "per_unit_storage_throughput": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 250, + "description": "Per unit storage throughput", + "title": "Per Unit Storage Throughput" + }, + "data_compression_type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "NONE", + "description": "Data compression type for the FSx file system. 
Valid values: \"NONE\", \"LZ4\"", + "title": "Data Compression Type" + }, + "file_system_type_version": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 2.15, + "description": "File system type version for the FSx file system", + "title": "File System Type Version" + }, + "storage_capacity": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1200, + "description": "Storage capacity for the FSx file system in GiB", + "title": "Storage Capacity" + }, + "fsx_file_system_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "", + "description": "Existing FSx file system ID", + "title": "Fsx File System Id" + } + }, + "title": "ClusterStackBase", + "type": "object" +} \ No newline at end of file diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/template.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/template.py new file mode 100644 index 00000000..4e4bc4fd --- /dev/null +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/template.py @@ -0,0 +1,948 @@ +TEMPLATE_CONTENT = """### Please keep template file unchanged ### +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: General Settings + Parameters: + - ResourceNamePrefix + - Stage + - NodeRecovery + - Tags + - Label: + default: Networking + Parameters: + - CreateVPCStack + - VpcId + - VpcCIDR + - AvailabilityZoneIds + - CreateSecurityGroupStack + - SecurityGroupId + - SecurityGroupIds + - CreatePrivateSubnetStack + - PrivateSubnetIds + - EksPrivateSubnetIds + - NatGatewayIds + - PrivateRouteTableIds + - CreateS3EndpointStack + - Label: + default: Orchestration + Parameters: + - CreateEKSClusterStack + - EKSClusterName + - KubernetesVersion + - CreateHelmChartStack + - HelmRepoUrl + - HelmRepoPath + - HelmRelease + - Namespace + - HelmOperators + - Label: + default: Lifecycle Configuration + Parameters: + - CreateLifeCycleScriptStack + - CreateS3BucketStack + - S3BucketName + - GithubRawUrl + - OnCreatePath + - Label: + default: Permissions + Parameters: + - CreateSageMakerIAMRoleStack + - SageMakerIAMRoleName + - Label: + default: Storage + Parameters: + - CreateFsxStack + - FsxFileSystemId + - FsxSubnetId + - FsxAvailabilityZone + - StorageCapacity + - PerUnitStorageThroughput + - DataCompressionType + - FileSystemTypeVersion + - Label: + default: HyperPod Cluster + Parameters: + - CreateHyperPodClusterStack + - HyperPodClusterName + - Label: + default: Instance Groups + Parameters: + - InstanceGroupSettings1 + - InstanceGroupSettings2 + - InstanceGroupSettings3 + - InstanceGroupSettings4 + - InstanceGroupSettings5 + - InstanceGroupSettings6 + - InstanceGroupSettings7 + - InstanceGroupSettings8 + - InstanceGroupSettings9 + - InstanceGroupSettings10 + - InstanceGroupSettings11 + - InstanceGroupSettings12 + - InstanceGroupSettings13 + - InstanceGroupSettings14 + - InstanceGroupSettings15 + - InstanceGroupSettings16 + - InstanceGroupSettings17 + - InstanceGroupSettings18 + - InstanceGroupSettings19 + - InstanceGroupSettings20 + - Label: + default: Restricted Instance Groups + Parameters: + - RigSettings1 + - RigSettings2 + - RigSettings3 + - RigSettings4 + - RigSettings5 + - RigSettings6 + - RigSettings7 + - RigSettings8 + - RigSettings9 + - RigSettings10 + - RigSettings11 + - RigSettings12 + - RigSettings13 + - RigSettings14 + - RigSettings15 + - RigSettings16 + - RigSettings17 + - RigSettings18 + - RigSettings19 + - 
RigSettings20 + ParameterLabels: + ResourceNamePrefix: + default: Resource Name Prefix + Stage: + default: Deployment Stage + NodeRecovery: + default: Instance Recovery + Tags: + default: Resource Tags + CreateVPCStack: + default: Create New VPC + VpcId: + default: Existing VPC ID + VpcCIDR: + default: VPC CIDR Range + AvailabilityZoneIds: + default: Availability Zone IDs + CreateSecurityGroupStack: + default: Create New Security Group + SecurityGroupId: + default: Existing Security Group ID + SecurityGroupIds: + default: Security Group IDs + CreatePrivateSubnetStack: + default: Create Private Subnets + PrivateSubnetIds: + default: Private Subnet IDs + EksPrivateSubnetIds: + default: EKS Private Subnet IDs + NatGatewayIds: + default: NAT Gateway IDs + PrivateRouteTableIds: + default: Private Route Table IDs + CreateS3EndpointStack: + default: Create S3 Endpoint + CreateEKSClusterStack: + default: Create New EKS Cluster + EKSClusterName: + default: EKS Cluster Name + KubernetesVersion: + default: Kubernetes Version + CreateHelmChartStack: + default: Install Helm Charts + HelmRepoUrl: + default: Helm Repository URL + HelmRepoPath: + default: Helm Chart Path + HelmRelease: + default: Helm Release Name + Namespace: + default: Kubernetes Namespace + HelmOperators: + default: Enabled Operators + CreateLifeCycleScriptStack: + default: Create Lifecycle Scripts + CreateS3BucketStack: + default: Create New S3 Bucket + S3BucketName: + default: S3 Bucket Name + GithubRawUrl: + default: GitHub Raw URL + OnCreatePath: + default: OnCreate Script Path + CreateSageMakerIAMRoleStack: + default: Create New IAM Role + SageMakerIAMRoleName: + default: IAM Role Name + CreateFsxStack: + default: Create New FSx for Lustre File System + FsxFileSystemId: + default: Existing FSx File System ID + FsxSubnetId: + default: FSx Subnet ID + FsxAvailabilityZone: + default: FSx Availability Zone + StorageCapacity: + default: Storage Capacity (GB) + PerUnitStorageThroughput: + default: Per-unit Storage Throughput (MB/s/TiB) + DataCompressionType: + default: Compression Type + FileSystemTypeVersion: + default: Lustre Version + CreateHyperPodClusterStack: + default: Create HyperPod Cluster + HyperPodClusterName: + default: HyperPod Cluster Name +Parameters: + Stage: + Type: String + Default: {{ stage | default('gamma') }} + AllowedValues: + - gamma + - prod + Description: Deployment stage (gamma, prod) + ResourceNamePrefix: + Type: String + Default: {{ resource_name_prefix | default('sagemaker-hyperpod-eks') }} + Description: Prefix to be used for all resources created by this template. + VpcCIDR: + Type: String + Default: {{ vpc_cidr | default('10.192.0.0/16') }} + Description: The IP range (CIDR notation) for the VPC. + AvailabilityZoneIds: + Type: String + Default: {{ availability_zone_ids | default('') }} + Description: List of AZs to deploy subnets in (up to 5, comma separated) + VpcId: + Type: String + Default: {{ vpc_id | default('vpc-1234567890abcdef0') }} + Description: The ID of the VPC you wish to use if you do not want to create a new VPC. + NatGatewayIds: + Type: String + Default: {{ nat_gateway_ids | default('nat-1234567890abcdef0') }} + Description: Comma-separated list of NAT Gateway IDs to route internet bound traffic to from the newly created private subnets. + SecurityGroupId: + Type: String + Default: {{ security_group_id | default('') }} + Description: The ID of the security group associated with an existing EKS cluster. 
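# --- Aside, not part of TEMPLATE_CONTENT: the {{ ... | default(...) }} markers used
# throughout these Parameters are Jinja2 placeholders, so producing the final
# CloudFormation YAML is expected to look roughly like the sketch below (the jinja2
# dependency and the exact call site are assumptions, not shown in this diff).
#
#   from jinja2 import Template
#   from hyperpod_cluster_stack_template.v1_0.template import TEMPLATE_CONTENT
#
#   cfn_yaml = Template(TEMPLATE_CONTENT).render(
#       stage="prod",
#       resource_name_prefix="hyp-eks-stack-1a2b",
#       kubernetes_version="1.31",
#       instance_group_settings=['[{"InstanceCount": 1, "InstanceGroupName": "default"}]'],
#       rig_settings=[],
#   )
#
# Omitted keys fall back to the default() filters, e.g. vpc_cidr -> "10.192.0.0/16"
# and namespace -> "kube-system". ---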
+ KubernetesVersion: + Type: String + Default: {{ kubernetes_version | default('1.31') }} + Description: The Kubernetes version to use for the EKS cluster. + EKSClusterName: + Type: String + Default: {{ eks_cluster_name | default('eks') }} + Description: The name of the newly created of preexisting EKS cluster you wish to use. + EksPrivateSubnetIds: + Type: String + Default: {{ eks_private_subnet_ids | default('subnet-1234567890abcdef0,subnet-1234567890abcdef0') }} + Description: Comma-delimited list of private subnet IDs for the EKS cluster + SecurityGroupIds: + Type: String + Default: {{ security_group_ids | default('sg-1234567890abcdef0') }} + Description: The Id of your cluster security group. + PrivateRouteTableIds: + Type: String + Default: {{ private_route_table_ids | default('rtb-1234567890abcdef0') }} + Description: Comma-separated list of private route table IDs. + S3BucketName: + Type: String + Default: {{ s3_bucket_name | default('s3-bucket') }} + Description: The name of the S3 bucket used to store the cluster lifecycle scripts. + GithubRawUrl: + Type: String + Default: https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh + Description: The raw GitHub URL for the lifecycle script. + HelmRepoUrl: + Type: String + Default: https://github.com/aws/sagemaker-hyperpod-cli.git + Description: The URL of the Helm repo containing the HyperPod Helm chart. + HelmRepoPath: + Type: String + Default: helm_chart/HyperPodHelmChart + Description: The path to the HyperPod Helm chart in the Helm repo. + HelmOperators: + Type: String + Default: {{ helm_operators | default('') }} + Description: The configuration of HyperPod Helm chart + Namespace: + Type: String + Default: {{ namespace | default('kube-system') }} + Description: The namespace to deploy the HyperPod Helm chart into. + HelmRelease: + Type: String + Default: {{ helm_release | default('hyperpod-dependencies') }} + Description: The name of the Helm release. + HyperPodClusterName: + Type: String + Default: {{ hyperpod_cluster_name | default('hp-cluster') }} + Description: Name of SageMaker HyperPod Cluster. + NodeRecovery: + Type: String + Default: {{ node_recovery | default('Automatic') }} + AllowedValues: + - Automatic + - None + Description: Specifies whether to enable or disable the automatic node recovery feature (Automatic or None). + SageMakerIAMRoleName: + Type: String + Default: {{ sagemaker_iam_role_name | default('iam-role') }} + Description: The name of the IAM role that SageMaker will use to access the AWS resources on your behalf. + PrivateSubnetIds: + Type: String + Default: {{ private_subnet_ids | default('subnet-1234567890abcdef0,subnet-1234567890abcdef0') }} + Description: Comma-separated list of private subnet IDs for EKS cluster. + OnCreatePath: + Type: String + Default: {{ on_create_path | default('sagemaker-hyperpod-eks-bucket') }} + Description: The file name of lifecycle script for the general purpose instance group. This script runs during cluster creation. +{% for i in range(1, 21) %} + InstanceGroupSettings{{ i }}: + Type: String + Default: {{ instance_group_settings[i-1] | default('[]') }} + Description: JSON array string containing instance group configurations. + RigSettings{{ i }}: + Type: String + Default: {{ rig_settings[i-1] | default('[]') }} + Description: JSON array string containing restricted instance group configurations. 
+{% endfor %} + Tags: + Type: String + Default: {{ tags | default('[]') }} + Description: Custom tags for managing the SageMaker HyperPod cluster as an AWS resource. + FsxSubnetId: + Type: String + Default: {{ fsx_subnet_id | default('') }} + Description: The subnet id that will be used to create FSx + FsxAvailabilityZone: + Type: String + Default: {{ fsx_availability_zone | default('use2-az1') }} + Description: The availability zone to get subnet id that will be used to create FSx + PerUnitStorageThroughput: + Type: Number + Default: {{ per_unit_storage_throughput | default(250) }} + Description: Per unit storage throughput for the FSx file system + DataCompressionType: + Type: String + Default: {{ data_compression_type | default('NONE') }} + AllowedValues: + - NONE + - LZ4 + Description: Data compression type for the FSx file system (NONE, LZ4) + FileSystemTypeVersion: + Type: Number + Default: {{ file_system_type_version | default(2.15) }} + Description: File system type version for the FSx file system + StorageCapacity: + Type: Number + Default: {{ storage_capacity | default(1200) }} + Description: Storage capacity for the FSx file system in GiB + FsxFileSystemId: + Type: String + Default: {{ fsx_file_system_id | default('') }} + Description: Existing FSx for Lustre file system + CreateVPCStack: + Type: String + Default: {{ create_vpc_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create VPC Stack + CreatePrivateSubnetStack: + Type: String + Default: {{ create_private_subnet_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Private Subnet Stack + CreateSecurityGroupStack: + Type: String + Default: {{ create_security_group_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Security Group Stack + CreateEKSClusterStack: + Type: String + Default: {{ create_eks_cluster_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create EKS Cluster Stack + CreateS3BucketStack: + Type: String + Default: {{ create_s3_bucket_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create S3 Bucket Stack + CreateS3EndpointStack: + Type: String + Default: {{ create_s3_endpoint_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create S3 Endpoint Stack + CreateLifeCycleScriptStack: + Type: String + Default: {{ create_life_cycle_script_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Life Cycle Script Stack + CreateSageMakerIAMRoleStack: + Type: String + Default: {{ create_sagemaker_iam_role_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create SageMaker IAM Role Stack + CreateHelmChartStack: + Type: String + Default: {{ create_helm_chart_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Helm Chart Stack + CreateHyperPodClusterStack: + Type: String + Default: {{ create_hyperpod_cluster_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create HyperPod Cluster Stack + CreateFsxStack: + Type: String + Default: {{ create_fsx_stack | default('true') }} + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create FSx for Lustre File System Stack +Conditions: + CreateVPCStackCondition: + Fn::Equals: + - Ref: CreateVPCStack + - 'true' + 
CreatePrivateSubnetStackCondition: + Fn::Equals: + - Ref: CreatePrivateSubnetStack + - 'true' + CreateSecurityGroupStackCondition: + Fn::Equals: + - Ref: CreateSecurityGroupStack + - 'true' + CreateEKSClusterStackCondition: + Fn::Equals: + - Ref: CreateEKSClusterStack + - 'true' + CreateS3BucketStackCondition: + Fn::Equals: + - Ref: CreateS3BucketStack + - 'true' + CreateS3EndpointStackCondition: + Fn::Equals: + - Ref: CreateS3EndpointStack + - 'true' + CreateLifeCycleScriptStackCondition: + Fn::Equals: + - Ref: CreateLifeCycleScriptStack + - 'true' + CreateSageMakerIAMRoleStackCondition: + Fn::Equals: + - Ref: CreateSageMakerIAMRoleStack + - 'true' + CreateHelmChartStackCondition: + Fn::Equals: + - Ref: CreateHelmChartStack + - 'true' + CreateHyperPodClusterStackCondition: + Fn::And: + - Fn::Equals: + - Ref: CreateHyperPodClusterStack + - 'true' + - Fn::Not: + - Fn::And: + - Fn::Equals: + - Ref: CreateEKSClusterStack + - 'true' + - Fn::Equals: + - Ref: CreateHelmChartStack + - 'false' + CreateFsxStackCondition: + Fn::Equals: + - Ref: CreateFsxStack + - 'true' +Resources: + VPCStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/vpc-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcCIDR: + Ref: VpcCIDR + AvailabilityZoneIds: + Fn::Join: + - ',' + - - Ref: AvailabilityZoneIds + - '' + - '' + - '' + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/VPCStack + Condition: CreateVPCStackCondition + PrivateSubnetStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/private-subnet-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + VpcCidrBlock: + Ref: VpcCIDR + AvailabilityZoneIds: + Fn::Join: + - ',' + - - Ref: AvailabilityZoneIds + - '' + - '' + - '' + NatGatewayIds: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.NatGatewayIds + - Ref: NatGatewayIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/PrivateSubnetStack + Condition: CreatePrivateSubnetStackCondition + SecurityGroupStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/security-group-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + SecurityGroupId: + Ref: SecurityGroupId + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/SecurityGroupStack + Condition: CreateSecurityGroupStackCondition + EKSClusterStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/eks-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + KubernetesVersion: + Ref: KubernetesVersion + EKSClusterName: + Ref: EKSClusterName + EksPrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - 
Outputs.EksPrivateSubnetIds + - Ref: PrivateSubnetIds + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/EKSClusterStack + Condition: CreateEKSClusterStackCondition + S3BucketStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/s3-bucket-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/S3BucketStack + Condition: CreateS3BucketStackCondition + S3EndpointStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/s3-endpoint-template.yaml + Parameters: + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + PrivateRouteTableIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateRouteTableIds + - Ref: PrivateRouteTableIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/S3EndpointStack + Condition: CreateS3EndpointStackCondition + LifeCycleScriptStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/lifecycle-script-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/LifeCycleScriptStack + Condition: CreateLifeCycleScriptStackCondition + SageMakerIAMRoleStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/sagemaker-iam-role-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/SageMakerIAMRoleStack + Condition: CreateSageMakerIAMRoleStackCondition + HelmChartStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/helm-chart-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmRepoUrl: + Ref: HelmRepoUrl + HelmRepoPath: + Ref: HelmRepoPath + Namespace: + Ref: Namespace + HelmRelease: + Ref: HelmRelease + HelmOperators: + Ref: HelmOperators + CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/HelmChartStack + Condition: CreateHelmChartStackCondition + HyperPodClusterStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: 
https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/hyperpod-cluster-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmChartStatus: + Fn::If: + - CreateHelmChartStackCondition + - Fn::GetAtt: + - HelmChartStack + - Outputs.HelmChartDeploymentComplete + - HelmChartNotRequired + HyperPodClusterName: + Ref: HyperPodClusterName + NodeRecovery: + Ref: NodeRecovery + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + PrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + - Ref: PrivateSubnetIds + CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + SageMakerIAMRoleName: + Fn::If: + - CreateSageMakerIAMRoleStackCondition + - Fn::GetAtt: + - SageMakerIAMRoleStack + - Outputs.SageMakerIAMRoleName + - Ref: SageMakerIAMRoleName + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + OnCreatePath: + Fn::If: + - CreateS3BucketStackCondition + - on_create.sh + - Ref: OnCreatePath + InstanceGroupSettings1: + Ref: InstanceGroupSettings1 + InstanceGroupSettings2: + Ref: InstanceGroupSettings2 + InstanceGroupSettings3: + Ref: InstanceGroupSettings3 + InstanceGroupSettings4: + Ref: InstanceGroupSettings4 + InstanceGroupSettings5: + Ref: InstanceGroupSettings5 + InstanceGroupSettings6: + Ref: InstanceGroupSettings6 + InstanceGroupSettings7: + Ref: InstanceGroupSettings7 + InstanceGroupSettings8: + Ref: InstanceGroupSettings8 + InstanceGroupSettings9: + Ref: InstanceGroupSettings9 + InstanceGroupSettings10: + Ref: InstanceGroupSettings10 + InstanceGroupSettings11: + Ref: InstanceGroupSettings11 + InstanceGroupSettings12: + Ref: InstanceGroupSettings12 + InstanceGroupSettings13: + Ref: InstanceGroupSettings13 + InstanceGroupSettings14: + Ref: InstanceGroupSettings14 + InstanceGroupSettings15: + Ref: InstanceGroupSettings15 + InstanceGroupSettings16: + Ref: InstanceGroupSettings16 + InstanceGroupSettings17: + Ref: InstanceGroupSettings17 + InstanceGroupSettings18: + Ref: InstanceGroupSettings18 + InstanceGroupSettings19: + Ref: InstanceGroupSettings19 + InstanceGroupSettings20: + Ref: InstanceGroupSettings20 + RigSettings1: + Ref: RigSettings1 + RigSettings2: + Ref: RigSettings2 + RigSettings3: + Ref: RigSettings3 + RigSettings4: + Ref: RigSettings4 + RigSettings5: + Ref: RigSettings5 + RigSettings6: + Ref: RigSettings6 + RigSettings7: + Ref: RigSettings7 + RigSettings8: + Ref: RigSettings8 + RigSettings9: + Ref: RigSettings9 + RigSettings10: + Ref: RigSettings10 + RigSettings11: + Ref: RigSettings11 + RigSettings12: + Ref: RigSettings12 + RigSettings13: + Ref: RigSettings13 + RigSettings14: + Ref: RigSettings14 + RigSettings15: + Ref: RigSettings15 + RigSettings16: + Ref: RigSettings16 + RigSettings17: + Ref: RigSettings17 + RigSettings18: + Ref: RigSettings18 + RigSettings19: + Ref: RigSettings19 + RigSettings20: + Ref: RigSettings20 + Tags: + Ref: Tags + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/HyperPodClusterStack + Condition: CreateHyperPodClusterStackCondition + FsxStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: 
https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/fsx-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmChartStatus: + Fn::If: + - CreateHelmChartStackCondition + - Fn::GetAtt: + - HelmChartStack + - Outputs.HelmChartDeploymentComplete + - HelmChartNotRequired + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + PrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + - Ref: PrivateSubnetIds + FsxSubnetId: + Ref: FsxSubnetId + FsxAvailabilityZone: + Ref: FsxAvailabilityZone + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + PerUnitStorageThroughput: + Ref: PerUnitStorageThroughput + DataCompressionType: + Ref: DataCompressionType + FileSystemTypeVersion: + Ref: FileSystemTypeVersion + StorageCapacity: + Ref: StorageCapacity + FsxFileSystemId: + Ref: FsxFileSystemId + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/FsxStack + Condition: CreateFsxStackCondition +Outputs: + OutputVpcId: + Value: + Fn::GetAtt: + - VPCStack + - Outputs.VpcId + Condition: CreateVPCStackCondition + OutputPrivateSubnetIds: + Value: + Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + Condition: CreatePrivateSubnetStackCondition + OutputSecurityGroupId: + Value: + Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + Condition: CreateSecurityGroupStackCondition + OutputEKSClusterArn: + Value: + Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterArn + Condition: CreateEKSClusterStackCondition + OutputEKSClusterName: + Value: + Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + Condition: CreateEKSClusterStackCondition + OutputSageMakerIAMRoleArn: + Value: + Fn::GetAtt: + - SageMakerIAMRoleStack + - Outputs.SageMakerIAMRoleArn + Condition: CreateSageMakerIAMRoleStackCondition + OutputS3BucketName: + Value: + Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + Condition: CreateS3BucketStackCondition + OutputHyperPodClusterName: + Value: + Fn::GetAtt: + - HyperPodClusterStack + - Outputs.HyperPodClusterName + Condition: CreateHyperPodClusterStackCondition + OutputHyperPodClusterArn: + Value: + Fn::GetAtt: + - HyperPodClusterStack + - Outputs.HyperPodClusterArn + Condition: CreateHyperPodClusterStackCondition +""" \ No newline at end of file diff --git a/hyperpod-cluster-stack-template/pyproject.toml b/hyperpod-cluster-stack-template/pyproject.toml new file mode 100644 index 00000000..09cf76a6 --- /dev/null +++ b/hyperpod-cluster-stack-template/pyproject.toml @@ -0,0 +1,27 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "hyperpod-cluster-stack-template" +version = "1.0.1" +readme = "README.md" +authors = [{name = "Amazon Web Services"}] +license = {text = "Apache-2.0"} +description = "Versioned JSON-schema + Pydantic models for HyperpodPytorchJobOperator" +requires-python = ">=3.8" +dependencies = [ + "pydantic", +] + +[tool.setuptools.packages.find] +# find all subpackages under hyperpod_pytorch_job_template +where = ["."] +include = ["hyperpod_cluster_stack_template*"] + +[tool.setuptools] +# tells setuptools to include 
package_data entries below +include-package-data = true + +[tool.setuptools.package-data] +"*" = ["*.yaml", "*.json"] \ No newline at end of file diff --git a/hyperpod-custom-inference-template/CHANGELOG.md b/hyperpod-custom-inference-template/CHANGELOG.md index a7a88bfa..f6aee119 100644 --- a/hyperpod-custom-inference-template/CHANGELOG.md +++ b/hyperpod-custom-inference-template/CHANGELOG.md @@ -1,3 +1,9 @@ +## v1.0.1] ([2025]-[08]-[27]) + +### Features + +* Add metadata_name argument to js and custom endpoint to match with SDK + ## v1.0.0] ([2025]-[07]-[10]) ### Features diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py index f681f844..1da3df96 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py @@ -11,7 +11,12 @@ # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. from hyperpod_custom_inference_template.v1_0 import model as v1 +from hyperpod_custom_inference_template.v1_0.template import TEMPLATE_CONTENT as v1_template SCHEMA_REGISTRY = { "1.0": v1.FlatHPEndpoint, } + +TEMPLATE_REGISTRY = { + "1.0": v1_template +} diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py index 2e346a91..2e0e544e 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py @@ -10,8 +10,9 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. 
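Aside (not part of this diff): the TEMPLATE_REGISTRY added to registry.py above pairs each schema version with its Jinja template, alongside the existing SCHEMA_REGISTRY of Pydantic models. A minimal lookup sketch, assuming jinja2 is available:

from jinja2 import Template
from hyperpod_custom_inference_template.registry import SCHEMA_REGISTRY, TEMPLATE_REGISTRY

FlatHPEndpoint = SCHEMA_REGISTRY["1.0"]        # Pydantic model for the flat user input
schema = FlatHPEndpoint.model_json_schema()    # should correspond to v1_0/schema.json below
template = Template(TEMPLATE_REGISTRY["1.0"])  # renders the InferenceEndpointConfig manifest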
-from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator, ConfigDict from typing import Optional, List, Dict, Union, Literal +import yaml from sagemaker.hyperpod.inference.config.hp_endpoint_config import ( Metrics, @@ -29,11 +30,29 @@ CloudWatchTrigger ) from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint +from sagemaker.hyperpod.common.config.metadata import Metadata + class FlatHPEndpoint(BaseModel): + model_config = ConfigDict(extra="forbid") + + namespace: Optional[str] = Field( + default=None, + description="Kubernetes namespace", + min_length=1 + ) + + metadata_name: Optional[str] = Field( + None, + alias="metadata_name", + description="Name of the custom endpoint object", + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + # endpoint_name endpoint_name: Optional[str] = Field( - "", + None, alias="endpoint_name", description="Name of SageMaker endpoint; empty string means no creation", max_length=63, @@ -130,7 +149,7 @@ class FlatHPEndpoint(BaseModel): description="FSX File System DNS Name", ) fsx_file_system_id: Optional[str] = Field( - ..., + None, alias="fsx_file_system_id", description="FSX File System ID", ) @@ -142,12 +161,12 @@ class FlatHPEndpoint(BaseModel): # S3Storage s3_bucket_name: Optional[str] = Field( - ..., + None, alias="s3_bucket_name", description="S3 bucket location", ) s3_region: Optional[str] = Field( - ..., + None, alias="s3_region", description="S3 bucket region", ) @@ -229,13 +248,34 @@ class FlatHPEndpoint(BaseModel): invocation_endpoint: Optional[str] = Field( default="invocations", description=( - "The invocation endpoint of the model server. " - "http://:/ would be pre-populated based on the other fields. " + "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. 
" "Please fill in the path after http://:/ specific to your model server.", ) ) + @model_validator(mode='after') + def validate_model_source_config(self): + """Validate that required fields are provided based on model_source_type""" + if self.model_source_type == "s3": + if not self.s3_bucket_name or not self.s3_region: + raise ValueError("s3_bucket_name and s3_region are required when model_source_type is 's3'") + elif self.model_source_type == "fsx": + if not self.fsx_file_system_id: + raise ValueError("fsx_file_system_id is required when model_source_type is 'fsx'") + return self + + @model_validator(mode='after') + def validate_name(self): + if not self.metadata_name and not self.endpoint_name: + raise ValueError("Either metadata_name or endpoint_name must be provided") + return self + def to_domain(self) -> HPEndpoint: + if self.endpoint_name and not self.metadata_name: + self.metadata_name = self.endpoint_name + + metadata = Metadata(name=self.metadata_name, namespace=self.namespace) + env_vars = None if self.env: env_vars = [ @@ -317,6 +357,7 @@ def to_domain(self) -> HPEndpoint: resources=resources, ) return HPEndpoint( + metadata=metadata, endpoint_name=self.endpoint_name, instance_type=self.instance_type, metrics=metrics, @@ -327,4 +368,4 @@ def to_domain(self) -> HPEndpoint: worker=worker, invocation_endpoint=self.invocation_endpoint, auto_scaling_spec=auto_scaling_spec - ) + ) \ No newline at end of file diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/schema.json b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/schema.json index 389df921..8d5c6910 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/schema.json +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/schema.json @@ -1,184 +1,471 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "FlatHPEndpoint", - "type": "object", "additionalProperties": false, - "required": [ - "instance_type", - "model_name", - "model_source_type", - "image_uri", - "container_port", - "model_volume_mount_name" - ], "properties": { + "namespace": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace", + "title": "Namespace" + }, + "metadata_name": { + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of the custom endpoint object", + "title": "Metadata Name" + }, "endpoint_name": { - "type": ["string", "null"], - "description": "Name used for SageMaker endpoint; empty string means no creation", - "default": "", - "maxLength": 63, - "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$" + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of SageMaker endpoint; empty string means no creation", + "title": "Endpoint Name" }, "env": { - "type": ["object", "null"], + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, "description": "Map of environment variable names to their values", - "additionalProperties": { "type": "string" } + "title": "Env" }, "instance_type": { - "type": "string", "description": "EC2 instance type for the inference server", - "pattern": 
"^ml\\..*" + "pattern": "^ml\\..*", + "title": "Instance Type", + "type": "string" }, "metrics_enabled": { - "type": "boolean", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, "description": "Enable metrics collection", - "default": false + "title": "Metrics Enabled" }, "model_name": { - "type": "string", "description": "Name of model to create on SageMaker", - "minLength": 1, "maxLength": 63, - "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$" + "minLength": 1, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "title": "Model Name", + "type": "string" }, "model_version": { - "type": ["string", "null"], + "anyOf": [ + { + "maxLength": 14, + "minLength": 5, + "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "Version of the model for the endpoint", - "minLength": 5, - "maxLength": 14, - "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$" + "title": "Model Version" }, "model_source_type": { - "type": "string", "description": "Source type: fsx or s3", - "enum": ["fsx", "s3"] + "enum": [ + "fsx", + "s3" + ], + "title": "Model Source Type", + "type": "string" }, "model_location": { - "type": ["string", "null"], - "description": "Specific model data location" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Specific model data location", + "title": "Model Location" }, "prefetch_enabled": { - "type": "boolean", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, "description": "Whether to pre-fetch model data", - "default": false + "title": "Prefetch Enabled" }, "tls_certificate_output_s3_uri": { - "type": ["string", "null"], + "anyOf": [ + { + "pattern": "^s3://([^/]+)/?(.*)$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "S3 URI for TLS certificate output", - "pattern": "^s3://([^/]+)/?(.*)$" - }, - "fsx_dns_name": { - "type": ["string", "null"], - "description": "FSX File System DNS Name" - }, - "fsx_file_system_id": { - "type": ["string", "null"], - "description": "FSX File System ID" - }, - "fsx_mount_name": { - "type": ["string", "null"], - "description": "FSX File System Mount Name" - }, - "s3_bucket_name": { - "type": ["string", "null"], - "description": "S3 bucket location" - }, - "s3_region": { - "type": ["string", "null"], - "description": "S3 bucket region" + "title": "Tls Certificate Output S3 Uri" }, "image_uri": { - "type": "string", - "description": "Inference server image name" + "description": "Inference server image name", + "title": "Image Uri", + "type": "string" }, "container_port": { - "type": "integer", - "format": "int32", "description": "Port on which the model server listens", + "maximum": 65535, "minimum": 1, - "maximum": 65535 + "title": "Container Port", + "type": "integer" }, "model_volume_mount_path": { - "type": "string", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "/opt/ml/model", "description": "Path inside container for model volume", - "default": "/opt/ml/model" + "title": "Model Volume Mount Path" }, "model_volume_mount_name": { - "type": "string", - "description": "Name of the model volume mount" + "description": "Name of the model volume mount", + "title": "Model Volume Mount Name", + "type": "string" + }, + "fsx_dns_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System DNS 
Name", + "title": "Fsx Dns Name" + }, + "fsx_file_system_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System ID", + "title": "Fsx File System Id" + }, + "fsx_mount_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System Mount Name", + "title": "Fsx Mount Name" + }, + "s3_bucket_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 bucket location", + "title": "S3 Bucket Name" + }, + "s3_region": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 bucket region", + "title": "S3 Region" }, "resources_limits": { - "type": ["object", "null"], + "anyOf": [ + { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, "description": "Resource limits for the worker", - "additionalProperties": { - "type": ["integer", "string"] - } + "title": "Resources Limits" }, "resources_requests": { - "type": ["object", "null"], + "anyOf": [ + { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, "description": "Resource requests for the worker", - "additionalProperties": { - "type": ["integer", "string"] - } + "title": "Resources Requests" }, "dimensions": { - "type": ["object", "null"], - "description": "CloudWatch Metric dimensions as key–value pairs", - "additionalProperties": { - "type": "string" - } + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CloudWatch Metric dimensions as key\u2013value pairs", + "title": "Dimensions" }, "metric_collection_period": { - "type": "integer", + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 300, "description": "Defines the Period for CloudWatch query", - "default": 300 + "title": "Metric Collection Period" }, "metric_collection_start_time": { - "type": "integer", + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 300, "description": "Defines the StartTime for CloudWatch query", - "default": 300 + "title": "Metric Collection Start Time" }, "metric_name": { - "type": ["string", "null"], - "description": "Metric name to query for CloudWatch trigger" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Metric name to query for CloudWatch trigger", + "title": "Metric Name" }, "metric_stat": { - "type": "string", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Average", "description": "Statistics metric to be used by Trigger. Defines the Stat for the CloudWatch query. Default is Average.", - "default": "Average" + "title": "Metric Stat" }, "metric_type": { - "type": "string", - "description": "The type of metric to be used by HPA. `Average` – Uses average value per pod; `Value` – Uses absolute metric value.", - "enum": ["Value", "Average"], - "default": "Average" + "anyOf": [ + { + "enum": [ + "Value", + "Average" + ], + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Average", + "description": "The type of metric to be used by HPA. 
`Average` \u2013 Uses average value per pod; `Value` \u2013 Uses absolute metric value.", + "title": "Metric Type" }, "min_value": { - "type": "number", + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 0, "description": "Minimum metric value used in case of empty response from CloudWatch. Default is 0.", - "default": 0 + "title": "Min Value" }, "cloud_watch_trigger_name": { - "type": ["string", "null"], - "description": "Name for the CloudWatch trigger" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name for the CloudWatch trigger", + "title": "Cloud Watch Trigger Name" }, "cloud_watch_trigger_namespace": { - "type": ["string", "null"], - "description": "AWS CloudWatch namespace for the metric" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "AWS CloudWatch namespace for the metric", + "title": "Cloud Watch Trigger Namespace" }, "target_value": { - "type": ["number", "null"], - "description": "Target value for the CloudWatch metric" + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Target value for the CloudWatch metric", + "title": "Target Value" }, "use_cached_metrics": { - "type": "boolean", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, "description": "Enable caching of metric values during polling interval. Default is true.", - "default": true + "title": "Use Cached Metrics" }, "invocation_endpoint": { - "type": "string", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "invocations", "description": "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. 
Please fill in the path after http://:/ specific to your model server.", - "default": "invocations" + "title": "Invocation Endpoint" } - } -} + }, + "required": [ + "instance_type", + "model_name", + "model_source_type", + "image_uri", + "container_port", + "model_volume_mount_name" + ], + "title": "FlatHPEndpoint", + "type": "object" +} \ No newline at end of file diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py new file mode 100644 index 00000000..63b06fb0 --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py @@ -0,0 +1,88 @@ +TEMPLATE_CONTENT = """ +apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +kind: InferenceEndpointConfig +metadata: + name: {{ metadata_name or endpoint_name }} + namespace: {{ namespace }} +spec: + endpointName: {{ endpoint_name }} + instanceType: {{ instance_type }} + modelName: {{ model_name }} + modelVersion: {{ model_version or "" }} + + metrics: + enabled: {{ metrics_enabled or False }} + + modelSourceConfig: + modelSourceType: {{ model_source_type }} + modelLocation: {{ model_location or "" }} + prefetchEnabled: {{ prefetch_enabled or False }} +{%- if model_source_type == "s3" %} + s3Storage: + bucketName: {{ s3_bucket_name }} + region: {{ s3_region }} +{%- elif model_source_type == "fsx" %} + fsxStorage: + dnsName: {{ fsx_dns_name }} + fileSystemId: {{ fsx_file_system_id }} + mountName: {{ fsx_mount_name or "" }} +{%- endif %} + + tlsConfig: + tlsCertificateOutputS3Uri: {{ tls_certificate_output_s3_uri or "" }} + + worker: + environmentVariables: + {%- if env %} + {%- for key, val in env.items() %} + - name: {{ key }} + value: "{{ val }}" + {%- endfor %} + {%- else %} + [] + {%- endif %} + image: {{ image_uri }} + modelInvocationPort: + containerPort: {{ container_port }} + modelVolumeMount: + name: {{ model_volume_mount_name }} + mountPath: {{ model_volume_mount_path }} + resources: +{%- if resources_limits %} + limits: +{%- for key, val in resources_limits.items() %} + {{ key }}: {{ val }} +{%- endfor %} +{%- else %} + {} +{%- endif %} +{%- if resources_requests %} + requests: +{%- for key, val in resources_requests.items() %} + {{ key }}: {{ val }} +{%- endfor %} +{%- endif %} + + autoScalingSpec: + cloudWatchTrigger: +{%- if dimensions %} + dimensions: +{%- for dim_key, dim_val in dimensions.items() %} + - name: {{ dim_key }} + value: {{ dim_val }} +{%- endfor %} +{%- endif %} + metricCollectionPeriod: {{ metric_collection_period }} + metricCollectionStartTime: {{ metric_collection_start_time }} + metricName: {{ metric_name or "" }} + metricStat: {{ metric_stat }} + metricType: {{ metric_type }} + minValue: {{ min_value }} + name: {{ cloud_watch_trigger_name or "" }} + namespace: {{ cloud_watch_trigger_namespace or "" }} + targetValue: {{ target_value or "" }} + useCachedMetrics: {{ use_cached_metrics or False }} + + invocationEndpoint: {{ invocation_endpoint }} + +""" \ No newline at end of file diff --git a/hyperpod-custom-inference-template/pyproject.toml b/hyperpod-custom-inference-template/pyproject.toml index 2c519b32..2896c0de 100644 --- a/hyperpod-custom-inference-template/pyproject.toml +++ b/hyperpod-custom-inference-template/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "hyperpod-custom-inference-template" -version = "1.0" +version = "1.0.2" readme = "README.md" authors = [{name = "Amazon Web Services"}] 
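For reference, the TEMPLATE_CONTENT added above is a Jinja2 string that expands a flat endpoint configuration into an InferenceEndpointConfig manifest. A minimal rendering sketch, assuming the jinja2 package is available and using placeholder values for the schema's required fields (instance_type, model_name, model_source_type, image_uri, container_port, model_volume_mount_name); unset optional fields simply fall back to the template's "or" defaults:

from jinja2 import Template
from hyperpod_custom_inference_template.v1_0.template import TEMPLATE_CONTENT

# Placeholder flat config; the bucket, image URI, and names below are illustrative only.
flat_config = {
    "metadata_name": "demo-endpoint",
    "endpoint_name": "demo-endpoint",
    "namespace": "default",
    "instance_type": "ml.g5.xlarge",
    "model_name": "demo-model",
    "model_source_type": "s3",
    "s3_bucket_name": "demo-model-bucket",
    "s3_region": "us-east-2",
    "image_uri": "123456789012.dkr.ecr.us-east-2.amazonaws.com/demo-server:latest",
    "container_port": 8080,
    "model_volume_mount_name": "model-volume",
    "model_volume_mount_path": "/opt/ml/model",
    "invocation_endpoint": "invocations",
}

# Render the manifest; optional fields that are not set render empty via the template defaults.
manifest_yaml = Template(TEMPLATE_CONTENT).render(**flat_config)
print(manifest_yaml)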
license = {text = "Apache-2.0"} @@ -20,4 +20,5 @@ include-package-data = true [tool.setuptools.package-data] # for each versioned subpackage, include schema.json -"hyperpod_custom_inference_template.v1_0" = ["schema.json"] +"*" = ["schema.json"] + diff --git a/hyperpod-jumpstart-inference-template/CHANGELOG.md b/hyperpod-jumpstart-inference-template/CHANGELOG.md index c2f733de..97ba5bf5 100644 --- a/hyperpod-jumpstart-inference-template/CHANGELOG.md +++ b/hyperpod-jumpstart-inference-template/CHANGELOG.md @@ -1,3 +1,9 @@ +## v1.0.1] ([2025]-[08]-[27]) + +### Features + +* Add metadata_name argument to js and custom endpoint to match with SDK + ## v1.0.0] ([2025]-[07]-[10]) ### Features diff --git a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/registry.py b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/registry.py index 401b6d4b..d1abfdea 100644 --- a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/registry.py +++ b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/registry.py @@ -11,7 +11,12 @@ # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. from hyperpod_jumpstart_inference_template.v1_0 import model as v1 +from hyperpod_jumpstart_inference_template.v1_0.template import TEMPLATE_CONTENT as v1_template SCHEMA_REGISTRY = { "1.0": v1.FlatHPJumpStartEndpoint, } + +TEMPLATE_REGISTRY = { + "1.0": v1_template +} diff --git a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py index 44ad2d63..15953643 100644 --- a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py +++ b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py @@ -10,23 +10,41 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. 
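The jumpstart registry above now pairs each schema version with its Jinja2 template. A small usage sketch, assuming jinja2 is installed; the model id and endpoint name are placeholders, not values taken from this change:

from jinja2 import Template
from hyperpod_jumpstart_inference_template.registry import SCHEMA_REGISTRY, TEMPLATE_REGISTRY

version = "1.0"
flat_config = {
    "model_id": "huggingface-llm-demo",   # placeholder JumpStart model id
    "instance_type": "ml.g5.8xlarge",
    "metadata_name": "js-demo-endpoint",
    "accept_eula": True,
}

# Validate against the versioned Pydantic model (extra fields are rejected),
# then expand the matching template into a JumpStartModel manifest.
config = SCHEMA_REGISTRY[version](**flat_config)
manifest_yaml = Template(TEMPLATE_REGISTRY[version]).render(**config.model_dump())
print(manifest_yaml)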
-from pydantic import BaseModel, Field, constr +from pydantic import BaseModel, Field, model_validator, ConfigDict from typing import Optional +import yaml # reuse the nested types from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import ( Model, SageMakerEndpoint, Server, - TlsConfig, + TlsConfig ) from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.common.config.metadata import Metadata class FlatHPJumpStartEndpoint(BaseModel): + model_config = ConfigDict(extra="forbid") + + namespace: Optional[str] = Field( + default=None, + description="Kubernetes namespace", + min_length=1 + ) + accept_eula: bool = Field( False, alias="accept_eula", description="Whether model terms of use have been accepted" ) + metadata_name: Optional[str] = Field( + None, + alias="metadata_name", + description="Name of the jumpstart endpoint object", + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + model_id: str = Field( ..., alias="model_id", @@ -53,22 +71,32 @@ class FlatHPJumpStartEndpoint(BaseModel): ) endpoint_name: Optional[str] = Field( - "", + None, alias="endpoint_name", description="Name of SageMaker endpoint; empty string means no creation", max_length=63, pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", ) - tls_certificate_output_s3_uri: Optional[str] = Field( None, alias="tls_certificate_output_s3_uri", - description="S3 URI to write the TLS certificate (optional)", + description="S3 URI to write the TLS certificate", pattern=r"^s3://([^/]+)/?(.*)$", ) + @model_validator(mode='after') + def validate_name(self): + if not self.metadata_name and not self.endpoint_name: + raise ValueError("Either metadata_name or endpoint_name must be provided") + return self + + def to_domain(self) -> HPJumpStartEndpoint: - # Build nested domain (pydantic) objects + if self.endpoint_name and not self.metadata_name: + self.metadata_name = self.endpoint_name + + metadata = Metadata(name=self.metadata_name, namespace=self.namespace) + model = Model( accept_eula=self.accept_eula, model_id=self.model_id, @@ -82,8 +110,9 @@ def to_domain(self) -> HPJumpStartEndpoint: TlsConfig(tls_certificate_output_s3_uri=self.tls_certificate_output_s3_uri) ) return HPJumpStartEndpoint( + metadata=metadata, model=model, server=server, sage_maker_endpoint=sage_ep, - tls_config=tls, - ) + tls_config=tls + ) \ No newline at end of file diff --git a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/schema.json b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/schema.json index efe6f340..175a18b6 100644 --- a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/schema.json +++ b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/schema.json @@ -1,49 +1,105 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "FlatHPJumpStartEndpointV1", - "type": "object", "additionalProperties": false, - "required": [ - "model_id", - "instance_type" - ], "properties": { + "namespace": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace", + "title": "Namespace" + }, "accept_eula": { - "type": "boolean", + "default": false, "description": "Whether model terms of use have been accepted", - "default": false + "title": "Accept Eula", + "type": "boolean" + }, + "metadata_name": { + "anyOf": [ + { + "maxLength": 63, + "pattern": 
"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of the jumpstart endpoint object", + "title": "Metadata Name" }, "model_id": { - "type": "string", "description": "Unique identifier of the model within the hub", - "minLength": 1, "maxLength": 63, - "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$" + "minLength": 1, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "title": "Model Id", + "type": "string" }, "model_version": { - "type": ["string", "null"], + "anyOf": [ + { + "maxLength": 14, + "minLength": 5, + "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "Semantic version of the model to deploy (e.g. 1.0.0)", - "minLength": 5, - "maxLength": 14, - "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$", - "default": null + "title": "Model Version" }, "instance_type": { - "type": "string", "description": "EC2 instance type for the inference server", - "pattern": "^ml\\..*" + "pattern": "^ml\\..*", + "title": "Instance Type", + "type": "string" }, "endpoint_name": { - "type": "string", + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "Name of SageMaker endpoint; empty string means no creation", - "default": "", - "maxLength": 63, - "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$" + "title": "Endpoint Name" }, "tls_certificate_output_s3_uri": { - "type": ["string", "null"], - "description": "S3 URI to write the TLS certificate (optional)", - "pattern": "^s3://([^/]+)/?(.*)$" + "anyOf": [ + { + "pattern": "^s3://([^/]+)/?(.*)$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 URI to write the TLS certificate", + "title": "Tls Certificate Output S3 Uri" } - } -} + }, + "required": [ + "model_id", + "instance_type" + ], + "title": "FlatHPJumpStartEndpoint", + "type": "object" +} \ No newline at end of file diff --git a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/template.py b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/template.py new file mode 100644 index 00000000..f89f2095 --- /dev/null +++ b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/template.py @@ -0,0 +1,19 @@ +TEMPLATE_CONTENT = """ +apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +kind: JumpStartModel +metadata: + name: {{ metadata_name or endpoint_name }} + namespace: {{ namespace or "default" }} +spec: + model: + acceptEula: {{ accept_eula or false }} + modelHubName: "SageMakerPublicHub" + modelId: {{ model_id }} + modelVersion: {{ model_version or "" }} + sageMakerEndpoint: + name: {{ endpoint_name or "" }} + server: + instanceType: {{ instance_type }} + tlsConfig: + tlsCertificateOutputS3Uri: {{ tls_certificate_output_s3_uri or "" }} +""" \ No newline at end of file diff --git a/hyperpod-jumpstart-inference-template/pyproject.toml b/hyperpod-jumpstart-inference-template/pyproject.toml index 1dad8c91..2822ba0b 100644 --- a/hyperpod-jumpstart-inference-template/pyproject.toml +++ b/hyperpod-jumpstart-inference-template/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "hyperpod-jumpstart-inference-template" -version = "1.0" +version = "1.0.2" readme = "README.md" authors = [{name = "Amazon Web Services"}] license = {text = "Apache-2.0"} @@ -20,4 +20,5 
@@ include-package-data = true [tool.setuptools.package-data] # for each versioned subpackage, include schema.json -"hyperpod_jumpstart_inference_template.v1_0" = ["schema.json"] +"*" = ["schema.json"] + diff --git a/hyperpod-pytorch-job-template/CHANGELOG.md b/hyperpod-pytorch-job-template/CHANGELOG.md index d904a709..c98fba98 100644 --- a/hyperpod-pytorch-job-template/CHANGELOG.md +++ b/hyperpod-pytorch-job-template/CHANGELOG.md @@ -1,3 +1,28 @@ +## v1.1.2 (2025-09-10) + +### Features + + * Revert node-count val + +## v1.1.1 (2025-08-27) + +### Features + + * Change default container name in pytorch template + * Implementing Task governance feature for SDK flow + +## v1.1.0 (2025-08-14) + +### Features + + * Added parameters for task governance feature + +## v1.0.2 (2025-07-31) + +### Features + + * Add support for --volume, remove --volumes and --persistent-volume-claims + ## v1.0.1 (2025-07-16) ### Features diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/create_dataclass.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/create_dataclass.py deleted file mode 100644 index 0c5c4181..00000000 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/create_dataclass.py +++ /dev/null @@ -1,295 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert Kubernetes CRD OpenAPI v3 Schema to Python Dataclasses -""" - -import json -import yaml -from typing import Dict, Any, List, Optional, Union, Set -from dataclasses import dataclass -import re - - -class CRDToPydanticConverter: - def __init__(self): - self.generated_classes: Set[str] = set() - self.imports = { - 'from pydantic import BaseModel, ConfigDict, Field', - 'from typing import Optional, List, Dict, Union' - } - - def sanitize_class_name(self, name: str) -> str: - """Convert a schema property name to a valid Python class name in PascalCase.""" - # Handle camelCase by inserting underscores before uppercase letters - name = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', name) - - # Replace hyphens and other non-alphanumeric characters with underscores - name = re.sub(r'[^a-zA-Z0-9_]', '_', name) - - # Split by underscores and capitalize each word - words = [word for word in name.split('_') if word] - name = ''.join(word.capitalize() for word in words) - - # Ensure it starts with a letter - if name and name[0].isdigit(): - name = f"Class{name}" - - return name or "UnknownClass" - - def sanitize_field_name(self, name: str) -> str: - """Convert a schema property name to a valid Python field name in snake_case.""" - # Convert camelCase to snake_case - name = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', name) - - # Replace hyphens and other chars with underscores - name = re.sub(r'[^a-zA-Z0-9_]', '_', name) - - # Convert to lowercase - name = name.lower() - - # Remove multiple consecutive underscores - name = re.sub(r'_+', '_', name) - - # Remove leading/trailing underscores - name = name.strip('_') - - # Handle Python keywords - if name in ['class', 'def', 'for', 'if', 'else', 'while', 'try', 'except', 'import', 'from', 'as', 'pass', - 'break', 'continue', 'return']: - name = f"{name}_" - - return name - - def get_python_type(self, schema: Dict[str, Any], property_name: str = "") -> str: - """Convert OpenAPI type to Python type annotation.""" - if 'type' not in schema: - # Handle anyOf, oneOf, allOf - if 'anyOf' in schema: - types = [self.get_python_type(s, property_name) for s in schema['anyOf']] - return f"Union[{', '.join(set(types))}]" - elif 'oneOf' in schema: - types = [self.get_python_type(s, property_name) for s 
in schema['oneOf']] - return f"Union[{', '.join(set(types))}]" - elif 'allOf' in schema: - # For allOf, we'll treat it as the first type (simplified) - return self.get_python_type(schema['allOf'][0], property_name) if schema['allOf'] else 'Any' - else: - return 'Any' - - schema_type = schema['type'] - - if schema_type == 'string': - return 'str' - elif schema_type == 'integer': - return 'int' - elif schema_type == 'number': - return 'float' - elif schema_type == 'boolean': - return 'bool' - elif schema_type == 'array': - if 'items' in schema: - item_type = self.get_python_type(schema['items'], property_name) - return f'List[{item_type}]' - return 'List[Any]' - elif schema_type == 'object': - if 'properties' in schema: - # Generate a new dataclass for this object - class_name = self.sanitize_class_name(property_name or 'NestedObject') - return class_name - elif 'additionalProperties' in schema: - if isinstance(schema['additionalProperties'], dict): - value_type = self.get_python_type(schema['additionalProperties']) - return f'Dict[str, {value_type}]' - else: - return 'Dict[str, Any]' - return 'Dict[str, Any]' - else: - return 'Any' - - def generate_dataclass(self, name: str, schema: Dict[str, Any], required: List[str] = None) -> str: - """Generate a Pydantic BaseModel from an OpenAPI schema.""" - class_name = self.sanitize_class_name(name) - - if class_name in self.generated_classes: - return "" # Already generated - - self.generated_classes.add(class_name) - required = required or [] - - if 'properties' not in schema: - return "" - - properties = schema['properties'] - fields = [] - nested_classes = [] - - for prop_name, prop_schema in properties.items(): - field_name = self.sanitize_field_name(prop_name) - python_type = self.get_python_type(prop_schema, prop_name) - is_required = prop_name in required - if class_name == "VolumeClaimTemplate" and prop_name == "spec": - prop_name = "VolumeClaimTemplateSpec" - - # Generate nested classes if needed - if prop_schema.get('type') == 'object' and 'properties' in prop_schema: - nested_class = self.generate_dataclass( - prop_name, - prop_schema, - prop_schema.get('required', []) - ) - if nested_class: - nested_classes.append(nested_class) - elif prop_schema.get('type') == 'array' and 'items' in prop_schema: - items_schema = prop_schema['items'] - if items_schema.get('type') == 'object' and 'properties' in items_schema: - nested_class = self.generate_dataclass( - prop_name, - items_schema, - items_schema.get('required', []) - ) - if nested_class: - nested_classes.append(nested_class) - - # Create field definition with Field() for alias mapping - field_config_parts = [] - - # Add alias if field name differs from original property name - if field_name != prop_name: - field_config_parts.append(f'alias="{field_name}"') - - # Add description if available - if 'description' in prop_schema: - description = prop_schema['description'].replace('"', '\\"').replace('\n', ' ').strip() - if description.startswith("DEPRECATED"): - continue - field_config_parts.append(f'description="{description}"') - - # Handle default values and required fields - if is_required: - if 'default' in prop_schema: - default_val = repr(prop_schema['default']) - if field_config_parts: - field_config = ', '.join(field_config_parts) - fields.append(f" {prop_name}: {python_type} = Field(default={default_val}, {field_config})") - else: - fields.append(f" {prop_name}: {python_type} = {default_val}") - else: - if field_config_parts: - field_config = ', '.join(field_config_parts) - 
fields.append(f" {prop_name}: {python_type} = Field({field_config})") - else: - fields.append(f" {prop_name}: {python_type}") - else: - default_val = 'None' - if 'default' in prop_schema: - default_val = repr(prop_schema['default']) - - if field_config_parts: - field_config = ', '.join(field_config_parts) - fields.append( - f" {prop_name}: Optional[{python_type}] = Field(default={default_val}, {field_config})") - else: - fields.append(f" {prop_name}: Optional[{python_type}] = {default_val}") - - # Generate the Pydantic model - model_code = f"""class {class_name}(BaseModel): -""" - - if schema.get('description'): - description = schema['description'].replace('\n', ' ').strip() - model_code += f' """{description}"""\n' - - # forbid extra inputs - model_code += f" model_config = ConfigDict(extra='forbid')\n\n" - - if fields: - model_code += '\n'.join(fields) - else: - model_code += " pass" - - # Combine nested classes with main class - result = '\n\n'.join(nested_classes) - if result and nested_classes: - result += '\n\n' - result += model_code - - return result - - def convert_crd_schema(self, crd_data: Dict[str, Any]) -> str: - """Convert only the spec portion of a CRD schema to Python dataclasses.""" - results = [] - - # Reset state - self.generated_classes.clear() - - # Extract spec schema from CRD - try: - if 'spec' in crd_data and 'versions' in crd_data['spec']: - # Handle multiple versions - for version in crd_data['spec']['versions']: - if 'schema' in version and 'openAPIV3Schema' in version['schema']: - schema = version['schema']['openAPIV3Schema'] - - if 'properties' in schema and 'spec' in schema['properties']: - # Only generate classes for the spec portion - spec_schema = schema['properties']['spec'] - spec_class = self.generate_dataclass( - f"{crd_data['spec']['names']['kind']}Spec", - spec_schema, - spec_schema.get('required', []) - ) - if spec_class: - results.append(spec_class) - - break # Use first version for now - else: - # Handle direct schema input - assume it's already the spec portion - if 'openAPIV3Schema' in crd_data: - schema = crd_data['openAPIV3Schema'] - main_class = self.generate_dataclass( - "CustomResourceSpec", - schema, - schema.get('required', []) - ) - if main_class: - results.append(main_class) - elif 'properties' in crd_data: - # Direct schema properties - assume it's the spec - main_class = self.generate_dataclass( - "CustomResourceSpec", - crd_data, - crd_data.get('required', []) - ) - if main_class: - results.append(main_class) - - except KeyError as e: - raise ValueError(f"Invalid CRD structure: missing {e}") - - if not results: - raise ValueError("No spec schema found in CRD data") - - # Combine imports and classes - imports_code = '\n'.join(sorted(self.imports)) - classes_code = '\n\n'.join(results) - - return f"{imports_code}\n\n\n{classes_code}" - - -def create_dataclass(crd_file_name: str, python_file_name: str): - converter = CRDToPydanticConverter() - - with open(crd_file_name, 'r') as f: - crd_data = yaml.safe_load(f) - - # Convert to dataclasses - dataclasses_code = converter.convert_crd_schema(crd_data) - - # Save to file - with open(python_file_name, 'w') as f: - f.write(dataclasses_code) - - print("Writing Complete") - -if __name__ == '__main__': - create_dataclass("v1_0/schema_1.json", "v1_0/model.py") \ No newline at end of file diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/registry.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/registry.py index f3a55f6b..999323f8 100644 --- 
a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/registry.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/registry.py @@ -10,11 +10,20 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -from .v1_0.model import PyTorchJobConfig # Import your model +from .v1_0 import model as v1_0_model # Import your model +from .v1_1 import model as v1_1_model +from .v1_0.template import TEMPLATE_CONTENT as v1_0_template +from .v1_1.template import TEMPLATE_CONTENT as v1_1_template from typing import Dict, Type from pydantic import BaseModel # Direct version-to-model mapping SCHEMA_REGISTRY: Dict[str, Type[BaseModel]] = { - "1.0": PyTorchJobConfig, + "1.0": v1_0_model.PyTorchJobConfig, + "1.1": v1_1_model.PyTorchJobConfig, +} + +TEMPLATE_REGISTRY = { + "1.0": v1_0_template, + "1.1": v1_1_template } \ No newline at end of file diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py index 9415968b..076bd66e 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py @@ -1,5 +1,6 @@ -from pydantic import BaseModel, ConfigDict, Field -from typing import Optional, List, Dict, Union +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from typing import Optional, List, Dict, Union, Literal +import click from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import ( Containers, ReplicaSpec, @@ -8,15 +9,79 @@ Spec, Template, Metadata, + Volumes, + HostPath, + PersistentVolumeClaim ) +from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob +import yaml + +class VolumeConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + name: str = Field( + ..., + description="Volume name", + min_length=1 + ) + type: Literal['hostPath', 'pvc'] = Field(..., description="Volume type") + mount_path: str = Field( + ..., + description="Mount path in container", + min_length=1 + ) + path: Optional[str] = Field( + None, + description="Host path (required for hostPath volumes)", + min_length=1 + ) + claim_name: Optional[str] = Field( + None, + description="PVC claim name (required for pvc volumes)", + min_length=1 + ) + read_only: Optional[bool] = Field(None, description="Read-only flag for pvc volumes") + + @field_validator('mount_path', 'path') + @classmethod + def paths_must_be_absolute(cls, v): + """Validate that paths are absolute (start with /).""" + if v and not v.startswith('/'): + raise ValueError('Path must be absolute (start with /)') + return v + + @model_validator(mode='after') + def validate_type_specific_fields(self): + """Validate that required fields are present based on volume type.""" + + if self.type == 'hostPath': + if not self.path: + raise ValueError('hostPath volumes require path field') + elif self.type == 'pvc': + if not self.claim_name: + raise ValueError('PVC volumes require claim_name field') + + return self class PyTorchJobConfig(BaseModel): model_config = ConfigDict(extra="forbid") - job_name: str = Field(alias="job_name", description="Job name") - image: str = Field(description="Docker image for training") - namespace: Optional[str] = Field(default=None, description="Kubernetes namespace") + job_name: str = Field( + 
alias="job_name", + description="Job name", + min_length=1, + max_length=63, + pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$' + ) + image: str = Field( + description="Docker image for training", + min_length=1 + ) + namespace: Optional[str] = Field( + default="default", + description="Kubernetes namespace", + min_length=1 + ) command: Optional[List[str]] = Field( default=None, description="Command to run in the container" ) @@ -27,16 +92,28 @@ class PyTorchJobConfig(BaseModel): default=None, description="Environment variables as key_value pairs" ) pull_policy: Optional[str] = Field( - default=None, alias="pull_policy", description="Image pull policy" + default=None, + alias="pull_policy", + description="Image pull policy", + min_length=1 ) instance_type: Optional[str] = Field( - default=None, alias="instance_type", description="Instance type for training" + default=None, + alias="instance_type", + description="Instance type for training", + min_length=1 ) node_count: Optional[int] = Field( - default=None, alias="node_count", description="Number of nodes" + default=1, + alias="node_count", + description="Number of nodes", + ge=1 ) tasks_per_node: Optional[int] = Field( - default=None, alias="tasks_per_node", description="Number of tasks per node" + default=None, + alias="tasks_per_node", + description="Number of tasks per node", + ge=1 ) label_selector: Optional[Dict[str, str]] = Field( default=None, @@ -49,131 +126,271 @@ class PyTorchJobConfig(BaseModel): description="Schedule pods only on nodes that passed deep health check", ) scheduler_type: Optional[str] = Field( - default=None, alias="scheduler_type", description="Scheduler type" + default=None, + alias="scheduler_type", + description="If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", + min_length=1 ) queue_name: Optional[str] = Field( - default=None, alias="queue_name", description="Queue name for job scheduling" + default=None, + alias="queue_name", + description="Queue name for job scheduling", + min_length=1, + max_length=63, + pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$' ) priority: Optional[str] = Field( - default=None, description="Priority class for job scheduling" + default=None, + description="Priority class for job scheduling", + min_length=1 ) max_retry: Optional[int] = Field( - default=None, alias="max_retry", description="Maximum number of job retries" + default=None, + alias="max_retry", + description="Maximum number of job retries", + ge=0 ) - volumes: Optional[List[str]] = Field( - default=None, description="List of volumes to mount" - ) - persistent_volume_claims: Optional[List[str]] = Field( - default=None, - alias="persistent_volume_claims", - description="List of persistent volume claims", + volume: Optional[List[VolumeConfig]] = Field( + default=None, description="List of volume configurations. 
\ + Command structure: --volume name=,type=,mount_path=, \ + For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ + For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false \ + If multiple --volume flag if multiple volumes are needed \ + " ) service_account_name: Optional[str] = Field( - default=None, alias="service_account_name", description="Service account name" + default=None, + alias="service_account_name", + description="Service account name", + min_length=1 ) + @field_validator('volume') + def validate_no_duplicates(cls, v): + """Validate no duplicate volume names or mount paths.""" + if not v: + return v + + # Check for duplicate volume names + names = [vol.name for vol in v] + if len(names) != len(set(names)): + raise ValueError("Duplicate volume names found") + + # Check for duplicate mount paths + mount_paths = [vol.mount_path for vol in v] + if len(mount_paths) != len(set(mount_paths)): + raise ValueError("Duplicate mount paths found") + + return v + + @field_validator('command', 'args') + def validate_string_lists(cls, v): + """Validate that command and args contain non-empty strings.""" + if not v: + return v + + for i, item in enumerate(v): + if not isinstance(item, str) or not item.strip(): + field_name = cls.model_fields.get('command', {}).get('alias', 'command') if 'command' in str(v) else 'args' + raise ValueError(f"{field_name}[{i}] must be a non-empty string") + + return v + + @field_validator('environment') + def validate_environment_variable_names(cls, v): + """Validate environment variable names follow C_IDENTIFIER pattern.""" + if not v: + return v + + import re + c_identifier_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$') + + for key in v.keys(): + if not c_identifier_pattern.match(key): + raise ValueError(f"Environment variable name '{key}' must be a valid C_IDENTIFIER") + + return v + + @field_validator('label_selector') + def validate_label_selector_keys(cls, v): + """Validate label selector keys follow Kubernetes label naming conventions.""" + if not v: + return v + + import re + # Kubernetes label key pattern - allows namespaced labels like kubernetes.io/arch + # Pattern: [prefix/]name where prefix and name follow DNS subdomain rules + # Also reject double dots + label_key_pattern = re.compile(r'^([a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?/)?[a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?$') + + for key in v.keys(): + if not key or not label_key_pattern.match(key) or '..' 
in key: + raise ValueError(f"Label selector key '{key}' must follow Kubernetes label naming conventions") + + return v + def to_domain(self) -> Dict: - """ - Convert flat config to domain model (HyperPodPytorchJobSpec) - """ - # Create container with required fields - container_kwargs = { - "name": "container-name", - "image": self.image, - "resources": Resources( - requests={"nvidia.com/gpu": "0"}, - limits={"nvidia.com/gpu": "0"}, - ), - } - - # Add optional container fields - if self.command is not None: - container_kwargs["command"] = self.command - if self.args is not None: - container_kwargs["args"] = self.args - if self.pull_policy is not None: - container_kwargs["image_pull_policy"] = self.pull_policy - if self.environment is not None: - container_kwargs["env"] = [ - {"name": k, "value": v} for k, v in self.environment.items() - ] - if self.volumes is not None: - container_kwargs["volume_mounts"] = [ - {"name": v, "mount_path": f"/mnt/{v}"} for v in self.volumes - ] - - # Create container object + """Convert flat config to domain model (HyperPodPytorchJobSpec)""" + + # Helper function to build dict with non-None values + def build_dict(**kwargs): + return {k: v for k, v in kwargs.items() if v is not None} + + # Build container + container_kwargs = build_dict( + name="pytorch-job-container", + image=self.image, + resources=Resources(requests={"nvidia.com/gpu": "0"}, limits={"nvidia.com/gpu": "0"}), + command=self.command, + args=self.args, + image_pull_policy=self.pull_policy, + env=[{"name": k, "value": v} for k, v in self.environment.items()] if self.environment else None, + volume_mounts=[{"name": vol.name, "mount_path": vol.mount_path} for vol in self.volume] if self.volume else None + ) + container = Containers(**container_kwargs) - # Create pod spec kwargs - spec_kwargs = {"containers": list([container])} - - # Add node selector if any selector fields are present - node_selector = {} - if self.instance_type is not None: - map = {"node.kubernetes.io/instance-type": self.instance_type} - node_selector.update(map) - if self.label_selector is not None: - node_selector.update(self.label_selector) - if self.deep_health_check_passed_nodes_only: - map = {"deep-health-check-passed": "true"} - node_selector.update(map) - if node_selector: - spec_kwargs.update({"node_selector": node_selector}) - - # Add other optional pod spec fields - if self.service_account_name is not None: - map = {"service_account_name": self.service_account_name} - spec_kwargs.update(map) - - if self.scheduler_type is not None: - map = {"scheduler_name": self.scheduler_type} - spec_kwargs.update(map) - - # Build metadata labels only if relevant fields are present - metadata_kwargs = {"name": self.job_name} - if self.namespace is not None: - metadata_kwargs["namespace"] = self.namespace - - metadata_labels = {} - if self.queue_name is not None: - metadata_labels["kueue.x-k8s.io/queue-name"] = self.queue_name - if self.priority is not None: - metadata_labels["kueue.x-k8s.io/priority-class"] = self.priority - - if metadata_labels: - metadata_kwargs["labels"] = metadata_labels - - # Create replica spec with only non-None values - replica_kwargs = { - "name": "pod", - "template": Template( - metadata=Metadata(**metadata_kwargs), spec=Spec(**spec_kwargs) - ), - } - - if self.node_count is not None: - replica_kwargs["replicas"] = self.node_count - - replica_spec = ReplicaSpec(**replica_kwargs) - - replica_specs = list([replica_spec]) - - job_kwargs = {"replica_specs": replica_specs} - # Add optional fields only if they 
exist - if self.tasks_per_node is not None: - job_kwargs["nproc_per_node"] = str(self.tasks_per_node) - - if self.max_retry is not None: - job_kwargs["run_policy"] = RunPolicy( - clean_pod_policy="None", job_max_retry_count=self.max_retry - ) - - # Create base return dictionary - result = { - "name": self.job_name, - "namespace": self.namespace, - "spec": job_kwargs, - } + # Build volumes + volumes = None + if self.volume: + volumes = [] + for vol in self.volume: + if vol.type == "hostPath": + volume_obj = Volumes(name=vol.name, host_path=HostPath(path=vol.path)) + elif vol.type == "pvc": + volume_obj = Volumes(name=vol.name, persistent_volume_claim=PersistentVolumeClaim( + claim_name=vol.claim_name, + read_only=vol.read_only if vol.read_only is not None else False + )) + volumes.append(volume_obj) + + # Build node selector + node_selector = build_dict( + **{"node.kubernetes.io/instance-type": self.instance_type} if self.instance_type else {}, + **self.label_selector if self.label_selector else {}, + **{"deep-health-check-passed": "true"} if self.deep_health_check_passed_nodes_only else {} + ) + + # Build spec + spec_kwargs = build_dict( + containers=[container], + volumes=volumes, + node_selector=node_selector if node_selector else None, + service_account_name=self.service_account_name, + scheduler_name=self.scheduler_type + ) + + # Build metadata + metadata_labels = build_dict( + **{"kueue.x-k8s.io/queue-name": self.queue_name} if self.queue_name else {}, + **{"kueue.x-k8s.io/priority-class": self.priority} if self.priority else {} + ) + metadata_kwargs = build_dict( + name=self.job_name, + namespace=self.namespace, + labels=metadata_labels if metadata_labels else None + ) + + # Build replica spec + replica_kwargs = build_dict( + name="pod", + template=Template(metadata=Metadata(**metadata_kwargs), spec=Spec(**spec_kwargs)), + replicas=self.node_count + ) + + # Build job + job_kwargs = build_dict( + metadata=metadata_kwargs, + replica_specs=[ReplicaSpec(**replica_kwargs)], + nproc_per_node=str(self.tasks_per_node) if self.tasks_per_node else None, + run_policy=RunPolicy(clean_pod_policy="None", job_max_retry_count=self.max_retry) if self.max_retry else None + ) + + result = HyperPodPytorchJob(**job_kwargs) return result + + +# Volume-specific type handlers - only override what's needed +def volume_parse_strings(ctx_or_strings, param=None, value=None): + """Parse volume strings into VolumeConfig objects. 
Can be used as Click callback.""" + # Handle dual usage pattern (inlined) + if param is not None and value is not None: + volume_strings, is_click_callback = value, True + else: + volume_strings, is_click_callback = ctx_or_strings, False + + if not volume_strings: + return None + if not isinstance(volume_strings, (list, tuple)): + volume_strings = [volume_strings] + + # Core parsing logic + volumes = [] + for vol_str in volume_strings: + vol_dict = {} + for pair in vol_str.split(','): + if '=' in pair: + key, val = pair.split('=', 1) + key = key.strip() + val = val.strip() + vol_dict[key] = val.lower() == 'true' if key == 'read_only' else val + + try: + volumes.append(VolumeConfig(**vol_dict)) + except Exception as e: + error_msg = f"Invalid volume configuration '{vol_str}': {e}" + if is_click_callback: + raise click.BadParameter(error_msg) + else: + raise ValueError(error_msg) + + return volumes + + +def volume_from_dicts(volume_dicts): + """Convert list of volume dictionaries to VolumeConfig objects.""" + if volume_dicts is None: + return None + return [VolumeConfig(**vol_dict) for vol_dict in volume_dicts if isinstance(vol_dict, dict)] + + +def volume_write_to_yaml(key, volumes, file_handle): + """Write VolumeConfig objects to YAML format.""" + if volumes: + file_handle.write(f"{key}:\n") + for vol in volumes: + file_handle.write(f" - name: {vol.name}\n") + file_handle.write(f" type: {vol.type}\n") + file_handle.write(f" mount_path: {vol.mount_path}\n") + if vol.path: + file_handle.write(f" path: {vol.path}\n") + if vol.claim_name: + file_handle.write(f" claim_name: {vol.claim_name}\n") + if vol.read_only is not None: + file_handle.write(f" read_only: {vol.read_only}\n") + file_handle.write("\n") + else: + file_handle.write(f"{key}: []\n\n") + + +def volume_merge_dicts(existing_volumes, new_volumes): + """Merge volume configurations, updating existing volumes by name or adding new ones.""" + merged = {vol.get('name'): vol for vol in existing_volumes} + merged.update({vol.get('name'): vol for vol in new_volumes}) + return list(merged.values()) + + +# Handler definition - merge with defaults, only override specific functions +def _get_volume_type_handler(): + from sagemaker.hyperpod.cli.type_handler_utils import DEFAULT_TYPE_HANDLER + return { + **DEFAULT_TYPE_HANDLER, # Start with all defaults + 'parse_strings': volume_parse_strings, # Override only these + 'from_dicts': volume_from_dicts, + 'write_to_yaml': volume_write_to_yaml, + 'merge_dicts': volume_merge_dicts, + 'needs_multiple_option': True + } + +VOLUME_TYPE_HANDLER = _get_volume_type_handler() diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json index 809a95c6..cca61230 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json @@ -1,83 +1,335 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "HyperPod PyTorch Job Parameters", - "type": "object", - "properties": { - "job-name": {"type": "string", "description": "Job name", "minLength": 1}, - "namespace": {"type": "string", "description": "Kubernetes namespace"}, - "image": {"type": "string", "description": "Docker image for training"}, - "command": { - "type": "array", - "items": {"type": "string"}, - "description": "Command to run in the container" - }, - "args": { - "type": "array", - "items": {"type": "string"}, - 
"description": "Arguments for the entry script" - }, - "environment": { - "type": "object", - "additionalProperties": {"type": "string"}, - "description": "Environment variables as key-value pairs" - }, - "pull-policy": { - "type": "string", - "enum": ["Always", "Never", "IfNotPresent"], - "description": "Image pull policy" - }, - "instance-type": { - "type": "string", - "description": "Instance type for training" - }, - "node-count": { - "type": "integer", - "minimum": 1, - "description": "Number of nodes" - }, - "tasks-per-node": { - "type": "integer", - "minimum": 1, - "description": "Number of tasks per node" - }, - "label-selector": { - "type": "object", - "additionalProperties": {"type": "string"}, - "description": "Node label selector as key-value pairs" - }, - "deep-health-check-passed-nodes-only": { - "type": "boolean", - "description": "Schedule pods only on nodes that passed deep health check" - }, - "scheduler-type": {"type": "string", "description": "Scheduler type"}, - "queue-name": { - "type": "string", - "description": "Queue name for job scheduling" - }, - "priority": { - "type": "string", - "description": "Priority class for job scheduling" - }, - "max-retry": { - "type": "integer", - "minimum": 0, - "description": "Maximum number of job retries" - }, - "volumes": { - "type": "array", - "items": {"type": "string"}, - "description": "List of volumes to mount" - }, - "persistent-volume-claims": { - "type": "array", - "items": {"type": "string"}, - "description": "List of persistent volume claims" - }, - "service-account-name": { - "type": "string", - "description": "Service account name" - } - }, - "required": ["job-name", "image"], - "additionalProperties": false -} + "$defs": { + "VolumeConfig": { + "properties": { + "name": { + "description": "Volume name", + "minLength": 1, + "title": "Name", + "type": "string" + }, + "type": { + "description": "Volume type", + "enum": [ + "hostPath", + "pvc" + ], + "title": "Type", + "type": "string" + }, + "mount_path": { + "description": "Mount path in container", + "minLength": 1, + "title": "Mount Path", + "type": "string" + }, + "path": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Host path (required for hostPath volumes)", + "title": "Path" + }, + "claim_name": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PVC claim name (required for pvc volumes)", + "title": "Claim Name" + }, + "read_only": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Read-only flag for pvc volumes", + "title": "Read Only" + } + }, + "required": [ + "name", + "type", + "mount_path" + ], + "title": "VolumeConfig", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "job_name": { + "description": "Job name", + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", + "title": "Job Name", + "type": "string" + }, + "image": { + "description": "Docker image for training", + "minLength": 1, + "title": "Image", + "type": "string" + }, + "namespace": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": "default", + "description": "Kubernetes namespace", + "title": "Namespace" + }, + "command": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": 
"Command to run in the container", + "title": "Command" + }, + "args": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Arguments for the entry script", + "title": "Args" + }, + "environment": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Environment variables as key_value pairs", + "title": "Environment" + }, + "pull_policy": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Image pull policy", + "title": "Pull Policy" + }, + "instance_type": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Instance type for training", + "title": "Instance Type" + }, + "node_count": { + "anyOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 1, + "description": "Number of nodes", + "title": "Node Count" + }, + "tasks_per_node": { + "anyOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of tasks per node", + "title": "Tasks Per Node" + }, + "label_selector": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Node label selector as key_value pairs", + "title": "Label Selector" + }, + "deep_health_check_passed_nodes_only": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Schedule pods only on nodes that passed deep health check", + "title": "Deep Health Check Passed Nodes Only" + }, + "scheduler_type": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", + "title": "Scheduler Type" + }, + "queue_name": { + "anyOf": [ + { + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Queue name for job scheduling", + "title": "Queue Name" + }, + "priority": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Priority class for job scheduling", + "title": "Priority" + }, + "max_retry": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Maximum number of job retries", + "title": "Max Retry" + }, + "volume": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/VolumeConfig" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of volume configurations. 
Command structure: --volume name=,type=,mount_path=, For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false If multiple --volume flag if multiple volumes are needed ", + "title": "Volume" + }, + "service_account_name": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Service account name", + "title": "Service Account Name" + } + }, + "required": [ + "job_name", + "image" + ], + "title": "PyTorchJobConfig", + "type": "object" +} \ No newline at end of file diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/template.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/template.py new file mode 100644 index 00000000..f044d162 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/template.py @@ -0,0 +1,96 @@ +TEMPLATE_CONTENT = """ +apiVersion: sagemaker.amazonaws.com/v1 +kind: HyperPodPyTorchJob +metadata: + name: {{ job_name }} + namespace: {{ namespace }} +{%- if queue_name or priority %} + labels: + kueue.x-k8s.io/queue-name: {{ queue_name or "" }} + kueue.x-k8s.io/priority-class: {{ priority or "" }} +{%- endif %} +spec: +{%- if tasks_per_node %} + nprocPerNode: "{{ tasks_per_node }}" +{%- endif %} + replicaSpecs: + - name: pod + replicas: {{ node_count or 1 }} + template: + metadata: + name: {{ job_name }} + namespace: {{ namespace }} +{%- if queue_name or priority %} + labels: + kueue.x-k8s.io/queue-name: {{ queue_name or "" }} + kueue.x-k8s.io/priority-class: {{ priority or "" }} +{%- endif %} + spec: + containers: + - name: container-name + image: {{ image }} +{%- if pull_policy %} + imagePullPolicy: {{ pull_policy }} +{%- endif %} +{%- if command %} + command: {{ command | tojson }} +{%- endif %} +{%- if args %} + args: {{ args | tojson }} +{%- endif %} +{%- if environment %} + env: +{%- for key, value in environment.items() %} + - name: {{ key }} + value: "{{ value }}" +{%- endfor %} +{%- endif %} +{%- if volume %} + volumeMounts: +{%- for vol in volume %} + - name: {{ vol.name }} + mountPath: {{ vol.mount_path }} + readOnly: {{ vol.read_only | lower if vol.read_only else false }} +{%- endfor %} +{%- endif %} + resources: + requests: + nvidia.com/gpu: "0" + limits: + nvidia.com/gpu: "0" +{%- if instance_type or label_selector or deep_health_check_passed_nodes_only %} + nodeSelector: + node.kubernetes.io/instance-type: {{ instance_type or "" }} +{%- if label_selector %} +{%- for key, value in label_selector.items() %} + {{ key }}: {{ value }} +{%- endfor %} +{%- endif %} +{%- if deep_health_check_passed_nodes_only %} + deep-health-check-passed: "true" +{%- endif %} +{%- endif %} +{%- if service_account_name %} + serviceAccountName: {{ service_account_name }} +{%- endif %} +{%- if scheduler_type %} + schedulerName: {{ scheduler_type }} +{%- endif %} +{%- if volume %} + volumes: +{%- for vol in volume %} + - name: {{ vol.name }} +{%- if vol.type == "hostPath" %} + hostPath: + path: {{ vol.path }} +{%- elif vol.type == "pvc" %} + persistentVolumeClaim: + claimName: {{ vol.claim_name }} +{%- endif %} +{%- endfor %} +{%- endif %} +{%- if max_retry %} + runPolicy: + cleanPodPolicy: "None" + jobMaxRetryCount: {{ max_retry }} +{%- endif %}""" diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/__init__.py 
b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/__init__.py new file mode 100644 index 00000000..78e351d6 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/__init__.py @@ -0,0 +1,7 @@ +from .model import PyTorchJobConfig + +def validate(data: dict): + return PyTorchJobConfig(**data) + + +__all__ = ["validate", "PyTorchJobConfig"] \ No newline at end of file diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py new file mode 100644 index 00000000..abfe0f53 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -0,0 +1,523 @@ +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from typing import Optional, List, Dict, Union, Literal +import click +from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import ( + Containers, + ReplicaSpec, + Resources, + RunPolicy, + Spec, + Template, + Metadata, + Volumes, + HostPath, + PersistentVolumeClaim +) +from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob +import yaml + +# Constants +ALLOWED_TOPOLOGY_LABELS = { + 'topology.k8s.aws/ultraserver-id', + 'topology.k8s.aws/network-node-layer-1', + 'topology.k8s.aws/network-node-layer-2', + 'topology.k8s.aws/network-node-layer-3' +} + +class VolumeConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: str = Field( + ..., + description="Volume name", + min_length=1 + ) + type: Literal['hostPath', 'pvc'] = Field(..., description="Volume type") + mount_path: str = Field( + ..., + description="Mount path in container", + min_length=1 + ) + path: Optional[str] = Field( + None, + description="Host path (required for hostPath volumes)", + min_length=1 + ) + claim_name: Optional[str] = Field( + None, + description="PVC claim name (required for pvc volumes)", + min_length=1 + ) + read_only: Optional[bool] = Field(None, description="Read-only flag for pvc volumes") + + def to_dict(self) -> dict: + """Convert VolumeConfig to dictionary format.""" + vol_dict = { + 'name': self.name, + 'type': self.type, + 'mount_path': self.mount_path + } + if self.path: + vol_dict['path'] = self.path + if self.claim_name: + vol_dict['claim_name'] = self.claim_name + if self.read_only is not None: + vol_dict['read_only'] = self.read_only + return vol_dict + + @field_validator('mount_path', 'path') + @classmethod + def paths_must_be_absolute(cls, v): + """Validate that paths are absolute (start with /).""" + if v and not v.startswith('/'): + raise ValueError('Path must be absolute (start with /)') + return v + + @model_validator(mode='after') + def validate_type_specific_fields(self): + """Validate that required fields are present based on volume type.""" + + if self.type == 'hostPath': + if not self.path: + raise ValueError('hostPath volumes require path field') + elif self.type == 'pvc': + if not self.claim_name: + raise ValueError('PVC volumes require claim_name field') + + return self + + +class PyTorchJobConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + job_name: str = Field( + alias="job_name", + description="Job name", + min_length=1, + max_length=63, + pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$' + ) + image: str = Field( + description="Docker image for training", + min_length=1 + ) + namespace: Optional[str] = Field( + default=None, + description="Kubernetes namespace", + min_length=1 + ) + command: 
Optional[List[str]] = Field( + default=None, description="Command to run in the container" + ) + args: Optional[List[str]] = Field( + default=None, alias="args", description="Arguments for the entry script" + ) + environment: Optional[Dict[str, str]] = Field( + default=None, description="Environment variables as key_value pairs" + ) + pull_policy: Optional[str] = Field( + default=None, + alias="pull_policy", + description="Image pull policy", + min_length=1 + ) + instance_type: Optional[str] = Field( + default=None, + alias="instance_type", + description="Instance type for training", + min_length=1 + ) + node_count: Optional[int] = Field( + default=None, + alias="node_count", + description="Number of nodes", + ge=1 + ) + tasks_per_node: Optional[str] = Field( + default="auto", + alias="tasks_per_node", + description="Number of workers per node; supported values: [auto,cpu, gpu, int]", + ) + label_selector: Optional[Dict[str, str]] = Field( + default=None, + alias="label_selector", + description="Node label selector as key_value pairs", + ) + deep_health_check_passed_nodes_only: Optional[bool] = Field( + default=False, + alias="deep_health_check_passed_nodes_only", + description="Schedule pods only on nodes that passed deep health check", + ) + scheduler_type: Optional[str] = Field( + default=None, + alias="scheduler_type", + description="If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", + min_length=1 + ) + queue_name: Optional[str] = Field( + default=None, + alias="queue_name", + description="Queue name for job scheduling", + min_length=1, + max_length=63, + pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$' + ) + priority: Optional[str] = Field( + default=None, + description="Priority class for job scheduling", + min_length=1 + ) + accelerators: Optional[int] = Field( + default=None, + description="Number of accelerators a.k.a GPUs or Trainium Chips", + ) + vcpu: Optional[float] = Field( + default=None, + description="Number of vCPUs", + ) + memory: Optional[float] = Field( + default=None, + description="Amount of memory in GiB", + ) + accelerators_limit: Optional[int] = Field( + default=None, + description="Limit for the number of accelerators a.k.a GPUs or Trainium Chips", + ) + vcpu_limit: Optional[float] = Field( + default=None, + description="Limit for the number of vCPUs", + ) + memory_limit: Optional[float] = Field( + default=None, + description="Limit for the amount of memory in GiB", + ) + + max_retry: Optional[int] = Field( + default=None, + alias="max_retry", + description="Maximum number of job retries", + ge=0 + ) + volume: Optional[List[VolumeConfig]] = Field( + default=None, description="List of volume configurations. 
\ + Command structure: --volume name=,type=,mount_path=, \ + For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ + For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false \ + If multiple --volume flag if multiple volumes are needed \ + " + ) + service_account_name: Optional[str] = Field( + default=None, + alias="service_account_name", + description="Service account name", + min_length=1 + ) + preferred_topology: Optional[str] = Field( + default=None, + alias="preferred_topology", + description="Preferred topology annotation for scheduling", + ) + required_topology: Optional[str] = Field( + default=None, + alias="required_topology", + description="Required topology annotation for scheduling", + ) + + @field_validator('tasks_per_node', mode='before') + @classmethod + def validate_tasks_per_node(cls, v): + if v is None: + return v + + # Convert to string for validation + v_str = str(v).lower() + + # Check if it's one of the allowed string values + if v_str in ['auto', 'cpu', 'gpu']: + return v_str + + # Check if it's a valid integer (reject floats) + try: + # First check if it contains a decimal point + if '.' in str(v): + raise ValueError("tasks_per_node must be an integer, not a float") + + int_val = int(v) + if int_val >= 0: + return str(int_val) + else: + raise ValueError("tasks_per_node must be non-negative") + except (ValueError, TypeError): + raise ValueError("tasks_per_node must be 'auto', 'cpu', 'gpu', or a non-negative integer") + + @field_validator('volume') + def validate_no_duplicates(cls, v): + """Validate no duplicate volume names or mount paths.""" + if not v: + return v + + # Check for duplicate volume names + names = [vol.name for vol in v] + if len(names) != len(set(names)): + raise ValueError("Duplicate volume names found") + + # Check for duplicate mount paths + mount_paths = [vol.mount_path for vol in v] + if len(mount_paths) != len(set(mount_paths)): + raise ValueError("Duplicate mount paths found") + + return v + + @field_validator('command', 'args') + def validate_string_lists(cls, v): + """Validate that command and args contain non-empty strings.""" + if not v: + return v + + for i, item in enumerate(v): + if not isinstance(item, str) or not item.strip(): + field_name = cls.model_fields.get('command', {}).get('alias', 'command') if 'command' in str(v) else 'args' + raise ValueError(f"{field_name}[{i}] must be a non-empty string") + + return v + + @field_validator('environment') + def validate_environment_variable_names(cls, v): + """Validate environment variable names follow C_IDENTIFIER pattern.""" + if not v: + return v + + import re + c_identifier_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$') + + for key in v.keys(): + if not c_identifier_pattern.match(key): + raise ValueError(f"Environment variable name '{key}' must be a valid C_IDENTIFIER") + + return v + + @field_validator('label_selector') + def validate_label_selector_keys(cls, v): + """Validate label selector keys follow Kubernetes label naming conventions.""" + if not v: + return v + + import re + # Kubernetes label key pattern - allows namespaced labels like kubernetes.io/arch + # Pattern: [prefix/]name where prefix and name follow DNS subdomain rules + # Also reject double dots + label_key_pattern = re.compile(r'^([a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?/)?[a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?$') + + for key in v.keys(): + if not key or not label_key_pattern.match(key) or '..' 
in key: + raise ValueError(f"Label selector key '{key}' must follow Kubernetes label naming conventions") + + return v + + @field_validator('preferred_topology', 'required_topology') + def validate_topology_labels(cls, v): + """Validate topology labels are from allowed set.""" + if v is None: + return v + + if v not in ALLOWED_TOPOLOGY_LABELS: + raise ValueError(f"Topology label '{v}' must be one of: {', '.join(sorted(ALLOWED_TOPOLOGY_LABELS))}") + + return v + + def to_domain(self) -> Dict: + """Convert flat config to domain model (HyperPodPytorchJobSpec)""" + + # Helper function to build dict with non-None values + def build_dict(**kwargs): + return {k: v for k, v in kwargs.items() if v is not None} + + # Build resources + if self.instance_type is None: + requests_value = limits_value = {"nvidia.com/gpu": "0"} + else: + requests_value = build_dict( + accelerators=str(self.accelerators) if self.accelerators else None, + vcpu=str(self.vcpu) if self.vcpu else None, + memory=str(self.memory) if self.memory else None + ) + limits_value = build_dict( + accelerators=str(self.accelerators_limit) if self.accelerators_limit else None, + vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, + memory=str(self.memory_limit) if self.memory_limit else None + ) + + # Build container + container_kwargs = build_dict( + name="pytorch-job-container", + image=self.image, + resources=Resources(requests=requests_value, limits=limits_value), + command=self.command, + args=self.args, + image_pull_policy=self.pull_policy, + env=[{"name": k, "value": v} for k, v in self.environment.items()] if self.environment else None, + volume_mounts=[{"name": vol.name, "mount_path": vol.mount_path} for vol in self.volume] if self.volume else None + ) + + container = Containers(**container_kwargs) + + # Build volumes + volumes = None + if self.volume: + volumes = [] + for vol in self.volume: + if vol.type == "hostPath": + volume_obj = Volumes(name=vol.name, host_path=HostPath(path=vol.path)) + elif vol.type == "pvc": + volume_obj = Volumes(name=vol.name, persistent_volume_claim=PersistentVolumeClaim( + claim_name=vol.claim_name, + read_only=vol.read_only == "true" if vol.read_only else False + )) + volumes.append(volume_obj) + + # Build node selector + node_selector = build_dict( + **{"node.kubernetes.io/instance-type": self.instance_type} if self.instance_type else {}, + **self.label_selector if self.label_selector else {}, + **{"deep-health-check-passed": "true"} if self.deep_health_check_passed_nodes_only else {} + ) + + # Build spec + spec_kwargs = build_dict( + containers=[container], + volumes=volumes, + node_selector=node_selector if node_selector else None, + service_account_name=self.service_account_name, + scheduler_name=self.scheduler_type + ) + + # Build metadata + metadata_labels = build_dict( + **{"kueue.x-k8s.io/queue-name": self.queue_name} if self.queue_name else {}, + **{"kueue.x-k8s.io/priority-class": self.priority} if self.priority else {} + ) + + annotations = build_dict( + **{"kueue.x-k8s.io/podset-preferred-topology": self.preferred_topology} if self.preferred_topology else {}, + **{"kueue.x-k8s.io/podset-required-topology": self.required_topology} if self.required_topology else {} + ) + + metadata_kwargs = build_dict( + name=self.job_name, + namespace=self.namespace, + labels=metadata_labels if metadata_labels else None, + annotations=annotations if annotations else None + ) + + # Build replica spec + replica_kwargs = build_dict( + name="pod", + template=Template(metadata=Metadata(**metadata_kwargs), 
spec=Spec(**spec_kwargs)), + replicas=self.node_count + ) + + # Build job + job_kwargs = build_dict( + metadata=metadata_kwargs, + replica_specs=[ReplicaSpec(**replica_kwargs)], + nproc_per_node=str(self.tasks_per_node) if self.tasks_per_node else None, + run_policy=RunPolicy(clean_pod_policy="None", job_max_retry_count=self.max_retry) if self.max_retry else None + ) + + result = HyperPodPytorchJob(**job_kwargs) + return result + + def create_from_k8s_yaml(self, yaml_file_path: str) -> None: + """Create HyperPodPytorchJob from k8s YAML file.""" + with open(yaml_file_path, 'r') as f: + yaml_data = yaml.safe_load(f) + + # Combine metadata and spec for full validation + full_data = {**yaml_data['spec'], 'metadata': yaml_data['metadata']} + job = HyperPodPytorchJob.model_validate(full_data, by_name=True) + job.create() + + +# Volume-specific type handlers - only override what's needed +def volume_parse_strings(ctx_or_strings, param=None, value=None): + """Parse volume strings into VolumeConfig objects. Can be used as Click callback.""" + # Handle dual usage pattern (inlined) + if param is not None and value is not None: + volume_strings, is_click_callback = value, True + else: + volume_strings, is_click_callback = ctx_or_strings, False + + if not volume_strings: + return None + if not isinstance(volume_strings, (list, tuple)): + volume_strings = [volume_strings] + + # Core parsing logic + volumes = [] + for vol_str in volume_strings: + vol_dict = {} + for pair in vol_str.split(','): + if '=' in pair: + key, val = pair.split('=', 1) + key = key.strip() + val = val.strip() + vol_dict[key] = val.lower() == 'true' if key == 'read_only' else val + + try: + volumes.append(VolumeConfig(**vol_dict)) + except Exception as e: + error_msg = f"Invalid volume configuration '{vol_str}': {e}" + if is_click_callback: + raise click.BadParameter(error_msg) + else: + raise ValueError(error_msg) + + return volumes + + +def volume_from_dicts(volume_dicts): + """Convert list of volume dictionaries to VolumeConfig objects.""" + if volume_dicts is None: + return None + return [VolumeConfig(**vol_dict) for vol_dict in volume_dicts if isinstance(vol_dict, dict)] + + +def volume_write_to_yaml(key, volumes, file_handle): + """Write VolumeConfig objects to YAML format.""" + if volumes: + file_handle.write(f"{key}:\n") + for vol in volumes: + file_handle.write(f" - name: {vol.name}\n") + file_handle.write(f" type: {vol.type}\n") + file_handle.write(f" mount_path: {vol.mount_path}\n") + if vol.path: + file_handle.write(f" path: {vol.path}\n") + if vol.claim_name: + file_handle.write(f" claim_name: {vol.claim_name}\n") + if vol.read_only is not None: + file_handle.write(f" read_only: {vol.read_only}\n") + file_handle.write("\n") + else: + file_handle.write(f"{key}: []\n\n") + + +def volume_merge_dicts(existing_volumes, new_volumes): + """Merge volume configurations, updating existing volumes by name or adding new ones.""" + merged = {vol.get('name'): vol for vol in existing_volumes} + merged.update({vol.get('name'): vol for vol in new_volumes}) + return list(merged.values()) + + +# Handler definition - merge with defaults, only override specific functions +def _get_volume_type_handler(): + from sagemaker.hyperpod.cli.type_handler_utils import DEFAULT_TYPE_HANDLER + return { + **DEFAULT_TYPE_HANDLER, # Start with all defaults + 'parse_strings': volume_parse_strings, # Override only these + 'from_dicts': volume_from_dicts, + 'write_to_yaml': volume_write_to_yaml, + 'merge_dicts': volume_merge_dicts, + 'needs_multiple_option': 
True + } + +VOLUME_TYPE_HANDLER = _get_volume_type_handler() diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json new file mode 100644 index 00000000..41abed18 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json @@ -0,0 +1,383 @@ +{ + "$defs": { + "topologyLabels": { + "enum": [ + "topology.k8s.aws/ultraserver-id", + "topology.k8s.aws/network-node-layer-1", + "topology.k8s.aws/network-node-layer-2", + "topology.k8s.aws/network-node-layer-3" + ] + }, + "VolumeConfig": { + "properties": { + "name": { + "description": "Volume name", + "minLength": 1, + "title": "Name", + "type": "string" + }, + "type": { + "description": "Volume type", + "enum": [ + "hostPath", + "pvc" + ], + "title": "Type", + "type": "string" + }, + "mount_path": { + "description": "Mount path in container", + "minLength": 1, + "title": "Mount Path", + "type": "string" + }, + "path": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Host path (required for hostPath volumes)", + "title": "Path" + }, + "claim_name": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PVC claim name (required for pvc volumes)", + "title": "Claim Name" + }, + "read_only": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Read-only flag for pvc volumes", + "title": "Read Only" + } + }, + "required": [ + "name", + "type", + "mount_path" + ], + "title": "VolumeConfig", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "job_name": { + "description": "Job name", + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", + "title": "Job Name", + "type": "string" + }, + "image": { + "description": "Docker image for training", + "minLength": 1, + "title": "Image", + "type": "string" + }, + "namespace": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace", + "title": "Namespace" + }, + "command": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Command to run in the container", + "title": "Command" + }, + "args": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Arguments for the entry script", + "title": "Args" + }, + "environment": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Environment variables as key_value pairs", + "title": "Environment" + }, + "pull_policy": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Image pull policy", + "title": "Pull Policy" + }, + "instance_type": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Instance type for training", + "title": "Instance Type" + }, + "node_count": { + "anyOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of nodes", + "title": "Node Count" + 
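# A minimal usage sketch of the v1_1 PyTorchJobConfig and volume helpers defined
# above in hyperpod_pytorch_job_template/v1_1/model.py, assuming the
# hyperpod-pytorch-job-template package is installed; the job name and image
# below are placeholders, not values from this PR.
from hyperpod_pytorch_job_template.v1_1.model import (
    PyTorchJobConfig,
    volume_parse_strings,
)

# Parse a CLI-style --volume value into VolumeConfig objects
vols = volume_parse_strings(
    "name=model-data,type=hostPath,mount_path=/data,path=/data"
)
print(vols[0].to_dict())
# {'name': 'model-data', 'type': 'hostPath', 'mount_path': '/data', 'path': '/data'}

# job_name and image are the only required fields; everything else is optional
config = PyTorchJobConfig(
    job_name="demo-job",                             # placeholder
    image="my-registry/pytorch-training:latest",     # placeholder
    node_count=2,
    volume=vols,
)
print(config.model_dump(exclude_none=True)["job_name"])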
}, + "tasks_per_node": { + "anyOf": [ + { + "minimum": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": "auto", + "description": "Number of workers per node; supported values: [auto,cpu, gpu, int]", + "title": "Tasks Per Node" + }, + "label_selector": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Node label selector as key_value pairs", + "title": "Label Selector" + }, + "deep_health_check_passed_nodes_only": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Schedule pods only on nodes that passed deep health check", + "title": "Deep Health Check Passed Nodes Only" + }, + "scheduler_type": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", + "title": "Scheduler Type" + }, + "queue_name": { + "anyOf": [ + { + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Queue name for job scheduling", + "title": "Queue Name" + }, + "accelerators": { + "type": "integer", + "minimum": 0, + "description": "Number of accelerators (GPUs/TPUs)" + }, + "vcpu": { + "type": "float", + "minimum": 0, + "description": "Number of vCPUs" + }, + "memory": { + "type": "float", + "minimum": 0, + "description": "Amount of memory in GiB" + }, + "accelerators_limit": { + "type": "integer", + "minimum": 0, + "description": "Limit for the number of accelerators (GPUs/TPUs)" + }, + "vcpu_limit": { + "type": "float", + "minimum": 0, + "description": "Limit for the number of vCPUs" + }, + "memory_limit": { + "type": "float", + "minimum": 0, + "description": "Limit for the amount of memory in GiB" + }, + "priority": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Priority class for job scheduling", + "title": "Priority" + }, + "max_retry": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Maximum number of job retries", + "title": "Max Retry" + }, + "volume": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/VolumeConfig" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of volume configurations. 
Command structure: --volume name=,type=,mount_path=, For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false If multiple --volume flag if multiple volumes are needed ", + "title": "Volume" + }, + "service_account_name": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Service account name", + "title": "Service Account Name" + }, + "preferred_topology": { + "type": "string", + "description": "Preferred topology annotation for scheduling", + "$ref": "#/$defs/topologyLabels" + }, + "required_topology": { + "type": "string", + "description": "Required topology annotation for scheduling", + "$ref": "#/$defs/topologyLabels" + } + }, + "required": [ + "job_name", + "image" + ], + "title": "PyTorchJobConfig", + "type": "object" +} \ No newline at end of file diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py new file mode 100644 index 00000000..4348d6cc --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py @@ -0,0 +1,157 @@ +TEMPLATE_CONTENT = """ +apiVersion: sagemaker.amazonaws.com/v1 +kind: HyperPodPyTorchJob +metadata: + name: {{ job_name }} + namespace: {{ namespace }} +{%- if queue_name or priority %} + labels: +{%- if queue_name %} + kueue.x-k8s.io/queue-name: {{ queue_name }} +{%- endif %} +{%- if priority %} + kueue.x-k8s.io/priority-class: {{ priority }} +{%- endif %} +{%- endif %} +{%- if preferred_topology or required_topology %} + annotations: +{%- if preferred_topology %} + kueue.x-k8s.io/podset-preferred-topology: {{ preferred_topology }} +{%- endif %} +{%- if required_topology %} + kueue.x-k8s.io/podset-required-topology: {{ required_topology }} +{%- endif %} +{%- endif %} +spec: +{%- if tasks_per_node %} + nprocPerNode: "{{ tasks_per_node }}" +{%- endif %} + replicaSpecs: + - name: pod + {%- if node_count %} + replicas: {{ node_count }} + {%- endif %} + template: + metadata: + name: {{ job_name }} + namespace: {{ namespace }} +{%- if queue_name or priority %} + labels: +{%- if queue_name %} + kueue.x-k8s.io/queue-name: {{ queue_name }} +{%- endif %} +{%- if priority %} + kueue.x-k8s.io/priority-class: {{ priority }} +{%- endif %} +{%- endif %} +{%- if preferred_topology or required_topology %} + annotations: +{%- if preferred_topology %} + kueue.x-k8s.io/podset-preferred-topology: {{ preferred_topology }} +{%- endif %} +{%- if required_topology %} + kueue.x-k8s.io/podset-required-topology: {{ required_topology }} +{%- endif %} +{%- endif %} + spec: + containers: + - name: pytorch-job-container + image: {{ image }} +{%- if pull_policy %} + imagePullPolicy: {{ pull_policy }} +{%- endif %} +{%- if command %} + command: {{ command | tojson }} +{%- endif %} +{%- if args %} + args: {{ args | tojson }} +{%- endif %} +{%- if environment %} + env: +{%- for key, value in environment.items() %} + - name: {{ key }} + value: "{{ value }}" +{%- endfor %} +{%- endif %} +{%- if volume %} + volumeMounts: +{%- for vol in volume %} + - name: {{ vol.name }} + mountPath: {{ vol.mount_path }} +{%- if vol.read_only is defined %} + readOnly: {{ vol.read_only }} +{%- endif %} +{%- endfor %} +{%- endif %} + resources: +{%- if accelerators or vcpu or memory %} + requests: +{%- if accelerators %} + nvidia.com/gpu: 
{{ accelerators }} +{%- endif %} +{%- if vcpu %} + cpu: {{ vcpu }} +{%- endif %} +{%- if memory %} + memory: {{ memory }}Gi +{%- endif %} +{%- else %} + requests: + nvidia.com/gpu: "0" +{%- endif %} +{%- if accelerators_limit or vcpu_limit or memory_limit %} + limits: +{%- if accelerators_limit %} + nvidia.com/gpu: {{ accelerators_limit }} +{%- endif %} +{%- if vcpu_limit %} + cpu: {{ vcpu_limit }} +{%- endif %} +{%- if memory_limit %} + memory: {{ memory_limit }}Gi +{%- endif %} +{%- else %} + limits: + nvidia.com/gpu: "0" +{%- endif %} +{%- if instance_type or label_selector or deep_health_check_passed_nodes_only %} + nodeSelector: +{%- if instance_type %} + node.kubernetes.io/instance-type: {{ instance_type }} +{%- endif %} +{%- if label_selector %} +{%- for key, value in label_selector.items() %} + {{ key }}: {{ value }} +{%- endfor %} +{%- endif %} +{%- if deep_health_check_passed_nodes_only %} + deep-health-check-passed: "true" +{%- endif %} +{%- endif %} +{%- if service_account_name %} + serviceAccountName: {{ service_account_name }} +{%- endif %} +{%- if scheduler_type %} + schedulerName: {{ scheduler_type }} +{%- endif %} +{%- if volume %} + volumes: +{%- for vol in volume %} + - name: {{ vol.name }} +{%- if vol.type == "hostPath" %} + hostPath: + path: {{ vol.path }} +{%- elif vol.type == "pvc" %} + persistentVolumeClaim: + claimName: {{ vol.claim_name }} +{%- if vol.read_only is defined %} + readOnly: {{ vol.read_only }} +{%- endif %} +{%- endif %} +{%- endfor %} +{%- endif %} +{%- if max_retry %} + runPolicy: + cleanPodPolicy: "None" + jobMaxRetryCount: {{ max_retry }} +{%- endif %}""" diff --git a/hyperpod-pytorch-job-template/pyproject.toml b/hyperpod-pytorch-job-template/pyproject.toml index 229116ad..2565dd5e 100644 --- a/hyperpod-pytorch-job-template/pyproject.toml +++ b/hyperpod-pytorch-job-template/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "hyperpod-pytorch-job-template" -version = "1.0.1" +version = "1.1.3" readme = "README.md" authors = [{name = "Amazon Web Services"}] license = {text = "Apache-2.0"} @@ -25,7 +25,4 @@ include-package-data = true [tool.setuptools.package-data] # for each versioned subpackage, include schema.json -"hyperpod_pytorch_job_template.v1_0" = ["schema.json"] - -[project.entry-points."mycli.config_versions"] -"1.0" = "hyperpod_pytorch_job_template.v1_0:PyTorchJobConfig" \ No newline at end of file +"*" = ["schema.json"] diff --git a/pyproject.toml b/pyproject.toml index cb048c24..67920606 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] dynamic = ["dependencies"] name = "sagemaker-hyperpod" -version = "3.0.0" +version = "3.3.0" description = "Amazon SageMaker HyperPod SDK and CLI" readme = "README.md" requires-python = ">=3.8" @@ -112,4 +112,4 @@ docstring-code-format = false # # This only has an effect when the `docstring-code-format` setting is # enabled. -docstring-code-line-length = "dynamic" \ No newline at end of file +docstring-code-line-length = "dynamic" diff --git a/setup.cfg b/setup.cfg index d048030d..e883c540 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ xfail_strict = true addopts = --verbose --ignore=build/private - --cov hyperpod_cli + --cov sagemaker.hyperpod --cov-config setup.cfg --cov-report term-missing --cov-report html:build/hyperpod-documentation/coverage @@ -59,8 +59,8 @@ addopts = --durations=5 # Default to colorful output --color=yes - # Uncomment to enforce a minimum code coverage threshold. 
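# A minimal sketch, assuming Jinja2 and PyYAML are available, of how the v1_1
# TEMPLATE_CONTENT defined earlier in this diff could be rendered into a
# HyperPodPyTorchJob manifest; all values below are placeholders.
import yaml
from jinja2 import Template
from hyperpod_pytorch_job_template.v1_1.template import TEMPLATE_CONTENT

rendered = Template(TEMPLATE_CONTENT).render(
    job_name="demo-job",
    namespace="default",
    image="my-registry/pytorch-training:latest",  # placeholder image
    node_count=2,
    tasks_per_node="auto",
)
manifest = yaml.safe_load(rendered)
# Unset optional fields (queue_name, volume, max_retry, ...) simply drop out of
# the rendered YAML because of the {%- if ... %} guards in the template.
print(manifest["kind"], manifest["spec"]["replicaSpecs"][0]["replicas"])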
- # --cov-fail-under 50 + # Enforce a minimum code coverage threshold + --cov-fail-under 50 testpaths = test looponfailroots = src test diff --git a/setup.py b/setup.py index 6efc713f..70104b3e 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ setup( data_files=sagemaker_hyperpod_recipes, name="sagemaker-hyperpod", - version="3.0.0", + version="3.3.0", description="Amazon SageMaker HyperPod SDK and CLI", long_description=open("README.md").read(), long_description_content_type="text/markdown", @@ -89,7 +89,8 @@ "pydantic>=2.10.6,<3.0.0", "hyperpod-pytorch-job-template>=1.0.0, <2.0.0", "hyperpod-custom-inference-template>=1.0.0, <2.0.0", - "hyperpod-jumpstart-inference-template>=1.0.0, <2.0.0" + "hyperpod-jumpstart-inference-template>=1.0.0, <2.0.0", + "hyperpod-cluster-stack-template>=1.0.0, <2.0.0" ], entry_points={ "console_scripts": [ diff --git a/src/sagemaker/hyperpod/cli/__init__.py b/src/sagemaker/hyperpod/cli/__init__.py index e69de29b..36f7d15e 100644 --- a/src/sagemaker/hyperpod/cli/__init__.py +++ b/src/sagemaker/hyperpod/cli/__init__.py @@ -0,0 +1,9 @@ +import warnings +# Reset warnings and show all except Pydantic serialization warnings +warnings.resetwarnings() +warnings.simplefilter("always") +# Suppress specific Pydantic serialization warnings globally (this is ignored due to customized parsing logic) +warnings.filterwarnings("ignore", message=".*PydanticSerializationUnexpectedValue.*", category=UserWarning) +warnings.filterwarnings("ignore", message=".*serializer.*", category=UserWarning, module="pydantic") +# Suppress kubernetes urllib3 deprecation warning (this is internal dependencies) +warnings.filterwarnings("ignore", message=".*HTTPResponse.getheaders.*", category=DeprecationWarning, module="kubernetes") \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/clients/kubernetes_client.py b/src/sagemaker/hyperpod/cli/clients/kubernetes_client.py index 54cfaefd..3e6d0202 100644 --- a/src/sagemaker/hyperpod/cli/clients/kubernetes_client.py +++ b/src/sagemaker/hyperpod/cli/clients/kubernetes_client.py @@ -51,14 +51,10 @@ class KubernetesClient: _instance = None _kube_client = None - def __new__(cls, is_get_capacity: bool = False) -> "KubernetesClient": + def __new__(cls, config_file: Optional[str] = None) -> "KubernetesClient": if cls._instance is None: cls._instance = super(KubernetesClient, cls).__new__(cls) - config.load_kube_config( - config_file=KUBE_CONFIG_PATH - if not is_get_capacity - else TEMP_KUBE_CONFIG_FILE - ) # or config.load_incluster_config() for in-cluster config + config.load_kube_config(config_file=config_file or KUBE_CONFIG_PATH) cls._instance._kube_client = client.ApiClient() return cls._instance diff --git a/src/sagemaker/hyperpod/cli/cluster_stack_utils.py b/src/sagemaker/hyperpod/cli/cluster_stack_utils.py new file mode 100644 index 00000000..5d3c7ad5 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/cluster_stack_utils.py @@ -0,0 +1,498 @@ +""" +CloudFormation cluster stack deletion utilities. + +This module provides utilities for managing CloudFormation stack deletion operations +with support for both CLI and SDK interfaces through a callback pattern. + +Public Interface: + delete_stack_with_confirmation() - Main orchestration function for stack deletion + StackNotFoundError - Exception raised when stack is not found + +All other functions are private implementation details and should not be used directly. 
+""" + +import boto3 +import click +import logging +from typing import List, Dict, Any, Optional, Tuple, Callable +from botocore.exceptions import ClientError +from sagemaker.hyperpod.cli.common_utils import ( + parse_comma_separated_list, + categorize_resources_by_type +) + + +class _StackNotFoundError(Exception): + """Exception raised when a CloudFormation stack is not found.""" + pass + + +# Make the exception available with the original name +StackNotFoundError = _StackNotFoundError + +MessageCallback = Callable[[str], None] +ConfirmCallback = Callable[[str], bool] +SuccessCallback = Callable[[str], None] + + +def _get_stack_resources(stack_name: str, region: str, logger: Optional[logging.Logger] = None) -> List[Dict[str, Any]]: + """Get all resources in a CloudFormation stack. + + Args: + stack_name: Name of the CloudFormation stack + region: AWS region for CloudFormation operations + logger: Optional logger for debug information + + Returns: + List of resource summaries from CloudFormation + + Raises: + _StackNotFoundError: When stack doesn't exist + ClientError: For other CloudFormation errors + """ + if logger: + logger.debug(f"Fetching resources for stack '{stack_name}' in region '{region}'") + + cf_client = boto3.client('cloudformation', region_name=region) + try: + resources_response = cf_client.list_stack_resources(StackName=stack_name) + resources = resources_response.get('StackResourceSummaries', []) + + if logger: + logger.debug(f"Found {len(resources)} resources in stack '{stack_name}'") + + return resources + except ClientError as e: + error_code = e.response['Error']['Code'] + if error_code == 'ValidationError' and "does not exist" in str(e): + raise _StackNotFoundError(f"Stack '{stack_name}' not found") + raise + + +def _validate_retain_resources(retain_list: List[str], existing_resources: List[Dict[str, Any]]) -> Tuple[List[str], List[str]]: + """Validate that retain resources exist in the stack. + + Args: + retain_list: List of logical resource IDs to retain + existing_resources: List of existing stack resources + + Returns: + Tuple of (valid_resources, invalid_resources) + """ + if not retain_list: + return [], [] + + existing_resource_names = {r.get('LogicalResourceId', '') for r in existing_resources} + valid_retain_resources = [] + invalid_retain_resources = [] + + for resource in retain_list: + if resource in existing_resource_names: + valid_retain_resources.append(resource) + else: + invalid_retain_resources.append(resource) + + return valid_retain_resources, invalid_retain_resources + + +def _categorize_stack_resources(resources: List[Dict[str, Any]]) -> Dict[str, List[str]]: + """Categorize CloudFormation resources by type using generic utility.""" + type_mappings = { + "EC2 Instances": ["AWS::EC2::Instance"], + "Networking": ["AWS::EC2::VPC", "AWS::EC2::Subnet", "AWS::EC2::SecurityGroup", + "AWS::EC2::InternetGateway", "AWS::EC2::RouteTable", "AWS::EC2::Route"], + "IAM": ["AWS::IAM::Role", "AWS::IAM::Policy", "AWS::IAM::InstanceProfile"], + "Storage": ["AWS::S3::Bucket", "AWS::EBS::Volume", "AWS::EFS::FileSystem"] + } + + return categorize_resources_by_type(resources, type_mappings) + + +def _compare_resource_states(original_resources: List[Dict[str, Any]], current_resources: List[Dict[str, Any]]) -> Tuple[set[str], set[str]]: + """Compare original and current resource states to identify changes. 
+ + Args: + original_resources: Resources before deletion attempt + current_resources: Resources after deletion attempt + + Returns: + Tuple of (deleted_resources, remaining_resources) + """ + original_names = {r['LogicalResourceId'] for r in original_resources} + current_names = {r['LogicalResourceId'] for r in current_resources} + + deleted_resources = original_names - current_names + remaining_resources = current_names + + return deleted_resources, remaining_resources + + +def _display_deletion_warning(categorized_resources: Dict[str, List[str]], message_callback: MessageCallback) -> None: + """Display warning about resources to be deleted.""" + total_count = sum(len(item_list) for item_list in categorized_resources.values()) + message_callback(f"\n⚠ WARNING: This will delete the following {total_count} resources:\n") + + for category, item_list in categorized_resources.items(): + if item_list: + message_callback(f"{category} ({len(item_list)}):") + for item in item_list: + message_callback(f" - {item}") + message_callback("") + + +def _display_invalid_resources_warning(invalid_resources: List[str], message_callback: MessageCallback) -> None: + """Display warning about invalid retain resources.""" + if not invalid_resources: + return + + message_callback(f"⚠️ Warning: The following {len(invalid_resources)} resources don't exist in the stack:") + for resource in invalid_resources: + message_callback(f" - {resource} (not found)") + message_callback("") + + +def _display_retention_info(retained_items: List[str], message_callback: MessageCallback) -> None: + """Display information about items that will be retained.""" + if retained_items: + message_callback(f"\nThe following {len(retained_items)} resources will be RETAINED:") + for item in retained_items: + message_callback(f" ✓ {item} (retained)") + + + + +def _handle_termination_protection_error(stack_name: str, region: str, message_callback: MessageCallback) -> None: + """Handle termination protection error.""" + message_callback("❌ Stack deletion blocked: Termination Protection is enabled") + message_callback("") + message_callback("To delete this stack, first disable termination protection:") + message_callback(f"aws cloudformation update-termination-protection --no-enable-termination-protection --stack-name {stack_name} --region {region}") + message_callback("") + message_callback("Then retry the delete command.") + + +def _handle_retention_limitation_error(stack_name: str, retain_resources: str, region: str, message_callback: MessageCallback) -> None: + """Handle CloudFormation retention limitation error.""" + message_callback("❌ CloudFormation limitation: --retain-resources only works on failed deletions") + message_callback("") + message_callback("💡 Recommended workflow:") + message_callback("1. First try deleting without --retain-resources:") + message_callback(f" hyp delete cluster-stack {stack_name} --region {region}") + message_callback("") + message_callback("2. If deletion fails, the stack will be in DELETE_FAILED state") + message_callback("3. Then retry with --retain-resources to keep specific resources:") + message_callback(f" hyp delete cluster-stack {stack_name} --retain-resources {retain_resources} --region {region}") + + +def _handle_generic_deletion_error(error_str: str, message_callback: MessageCallback) -> None: + """Handle generic deletion errors.""" + if "does not exist" in error_str: + message_callback("❌ Stack not found") + elif "AccessDenied" in error_str: + message_callback("❌ Access denied. 
Check AWS permissions") + else: + message_callback(f"❌ Error deleting stack: {error_str}") + + +def _handle_partial_deletion_failure(stack_name: str, region: str, original_resources: List[Dict[str, Any]], + retain_list: List[str], message_callback: MessageCallback) -> None: + """Handle partial deletion failures by showing what succeeded vs failed. + + Args: + stack_name: Name of the stack + region: AWS region + original_resources: Resources before deletion attempt + retain_list: List of resources that were supposed to be retained + message_callback: Function to call for outputting messages + """ + message_callback("✗ Stack deletion failed") + + try: + cf_client = boto3.client('cloudformation', region_name=region) + current_resources_response = cf_client.list_stack_resources(StackName=stack_name) + current_resources = current_resources_response.get('StackResourceSummaries', []) + + deleted_resources, remaining_resources = _compare_resource_states( + original_resources, current_resources + ) + + # Show what was successfully deleted + if deleted_resources: + message_callback("") + message_callback(f"Successfully deleted ({len(deleted_resources)}):") + for resource in deleted_resources: + message_callback(f" ✓ {resource}") + + # Show what failed to delete (excluding retained resources) + failed_resources = remaining_resources - set(retain_list) if retain_list else remaining_resources + if failed_resources: + message_callback("") + message_callback(f"Failed to delete ({len(failed_resources)}):") + for resource in failed_resources: + message_callback(f" ✗ {resource} (DependencyViolation: has dependent resources)") + + # Show retained resources + if retain_list: + message_callback("") + message_callback(f"Successfully retained as requested ({len(retain_list)}):") + for resource in retain_list: + message_callback(f" ✓ {resource} (retained)") + + message_callback("") + message_callback("💡 Note: Some resources may have dependencies preventing deletion") + message_callback(" Check the AWS CloudFormation console for detailed dependency information") + + except Exception: + # If we can't get current resources, show generic error + message_callback("Unable to determine which resources were deleted") + +def _parse_retain_resources(retain_resources_str: str) -> List[str]: + """Parse comma-separated retain resources string.""" + return parse_comma_separated_list(retain_resources_str) + + +def _perform_stack_deletion(stack_name: str, region: str, retain_list: List[str], + logger: Optional[logging.Logger] = None) -> None: + """Perform the actual CloudFormation stack deletion. + + This is a private low-level function that directly calls the CloudFormation delete_stack API. + Use delete_stack_with_confirmation() for the public interface. 
+ + Args: + stack_name: Name of the stack to delete + region: AWS region + retain_list: List of resources to retain during deletion + logger: Optional logger for debug information + + Raises: + ClientError: If deletion fails due to CloudFormation errors + Exception: For other deletion failures + """ + if logger: + logger.debug(f"Initiating deletion of stack '{stack_name}' in region '{region}'") + if retain_list: + logger.debug(f"Retaining resources: {retain_list}") + + cf_client = boto3.client('cloudformation', region_name=region) + + delete_params = {'StackName': stack_name} + if retain_list: + delete_params['RetainResources'] = retain_list + + cf_client.delete_stack(**delete_params) + + if logger: + logger.info(f"Stack '{stack_name}' deletion initiated successfully") + + + + +def _get_stack_resources_and_validate_retention(stack_name: str, region: str, retain_resources_str: str, + logger: Optional[logging.Logger] = None) -> Tuple[List[Dict[str, Any]], List[str], List[str]]: + """Get stack resources and validate retention list. + + Args: + stack_name: Name of the CloudFormation stack + region: AWS region + retain_resources_str: Comma-separated retain resources string + logger: Optional logger for debug information + + Returns: + Tuple of (all_resources, valid_retain_list, invalid_retain_list) + + Raises: + StackNotFoundError: When stack doesn't exist + """ + resources = _get_stack_resources(stack_name, region, logger) + if not resources: + raise _StackNotFoundError(f"No resources found in stack '{stack_name}'") + + retain_list = _parse_retain_resources(retain_resources_str) + valid_retain, invalid_retain = _validate_retain_resources(retain_list, resources) + + if logger and retain_list: + logger.debug(f"Retention validation - Valid: {len(valid_retain)}, Invalid: {len(invalid_retain)}") + + return resources, valid_retain, invalid_retain + + +def _handle_stack_deletion_error(error: Exception, stack_name: str, region: str, retain_resources: Optional[str] = None, + message_callback: Optional[MessageCallback] = None, + logger: Optional[logging.Logger] = None) -> bool: + """Handle various CloudFormation deletion errors with customizable output. 
+ + Args: + error: The exception that occurred + stack_name: Name of the stack being deleted + region: AWS region + retain_resources: Original retain resources string (for error messages) + message_callback: Function to call for outputting messages (default: click.echo) + logger: Optional logger for debug information + + Returns: + True if error was handled gracefully (don't re-raise), False if should re-raise + """ + if message_callback is None: + message_callback = click.echo + + error_str = str(error) + + if logger: + logger.debug(f"Handling deletion error for stack '{stack_name}': {error_str}") + + # Handle termination protection specifically + if "TerminationProtection is enabled" in error_str: + _handle_termination_protection_error(stack_name, region, message_callback) + return False # Should re-raise + + # Handle CloudFormation retain-resources limitation + # Always re-raise for SDK usage to ensure clear exceptions + if retain_resources and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in error_str: + _handle_retention_limitation_error(stack_name, retain_resources, region, message_callback) + return False # ensure SDK gets the exception + + # Handle other deletion errors + _handle_generic_deletion_error(error_str, message_callback) + return False # Should re-raise + + +def _display_stack_deletion_confirmation(resources: List[Dict[str, Any]], valid_retain_list: List[str], + invalid_retain_list: List[str], + message_callback: Optional[MessageCallback] = None, + confirm_callback: Optional[ConfirmCallback] = None, + logger: Optional[logging.Logger] = None) -> bool: + """Display deletion warnings and get user confirmation with customizable output. + + Args: + resources: All stack resources + valid_retain_list: Valid resources to retain + invalid_retain_list: Invalid resources that don't exist + message_callback: Function to call for outputting messages (default: click.echo) + confirm_callback: Function to call for confirmation (default: click.confirm) + logger: Optional logger for debug information + + Returns: + True if user confirms deletion, False otherwise + """ + if message_callback is None: + message_callback = click.echo + if confirm_callback is None: + confirm_callback = lambda msg: click.confirm("Continue?", default=False) + + if logger: + logger.debug(f"Displaying confirmation for {len(resources)} resources, {len(valid_retain_list)} to retain") + + # Show warning for invalid retain resources + _display_invalid_resources_warning(invalid_retain_list, message_callback) + + # Display deletion warning + resource_categories = _categorize_stack_resources(resources) + _display_deletion_warning(resource_categories, message_callback) + + # Show retention info + _display_retention_info(valid_retain_list, message_callback) + + return confirm_callback("Continue with deletion?") + + +def _handle_stack_deletion_partial_failure(stack_name: str, region: str, original_resources: List[Dict[str, Any]], + retain_list: List[str], message_callback: Optional[MessageCallback] = None) -> None: + """Handle partial deletion failures by showing what succeeded vs failed. 
+ + Args: + stack_name: Name of the stack + region: AWS region + original_resources: Resources before deletion attempt + retain_list: List of resources that were supposed to be retained + message_callback: Function to call for outputting messages (default: click.echo) + """ + if message_callback is None: + message_callback = click.echo + + _handle_partial_deletion_failure(stack_name, region, original_resources, retain_list, message_callback) + + + + +def delete_stack_with_confirmation(stack_name: str, region: str, retain_resources_str: str = "", + message_callback: Optional[MessageCallback] = None, + confirm_callback: Optional[ConfirmCallback] = None, + success_callback: Optional[SuccessCallback] = None, + logger: Optional[logging.Logger] = None) -> None: + """ + This is the main public interface for stack deletion, supporting both CLI and SDK + usage through customizable callback functions. It handles resource validation, + user confirmation, deletion execution, and comprehensive error handling. + + Args: + stack_name: Name of the stack to delete + region: AWS region + retain_resources_str: Comma-separated retain resources string + message_callback: Function to call for outputting messages (default: click.echo) + confirm_callback: Function to call for confirmation (default: click.confirm) + success_callback: Function to call on successful deletion (default: click.echo) + logger: Optional logger for debug information + + Raises: + StackNotFoundError: When stack doesn't exist + click.ClickException: For CLI usage + Exception: For SDK usage (depending on callback implementation) + + Example: + # CLI usage + delete_stack_with_confirmation( + stack_name="my-stack", + region="us-west-2", + message_callback=click.echo, + confirm_callback=lambda msg: click.confirm("Continue?", default=False) + ) + + # SDK usage + delete_stack_with_confirmation( + stack_name="my-stack", + region="us-west-2", + message_callback=logger.info, + confirm_callback=lambda msg: True # Auto-confirm + ) + """ + if message_callback is None: + message_callback = click.echo + if success_callback is None: + success_callback = lambda msg: click.echo(f"✓ {msg}") + + if logger: + logger.info(f"Starting deletion workflow for stack '{stack_name}' in region '{region}'") + + # 1. Get and validate resources + resources, valid_retain, invalid_retain = _get_stack_resources_and_validate_retention( + stack_name, region, retain_resources_str, logger + ) + + # 2. Display warnings and get confirmation + if not _display_stack_deletion_confirmation(resources, valid_retain, invalid_retain, + message_callback, confirm_callback, logger): + message_callback("Operation cancelled.") + return + + # 3. 
Perform deletion + try: + _perform_stack_deletion(stack_name, region, valid_retain, logger) + success_callback(f"Stack '{stack_name}' deletion initiated successfully") + except Exception as e: + # Handle deletion errors + should_handle_gracefully = _handle_stack_deletion_error( + e, stack_name, region, retain_resources_str, message_callback, logger + ) + + if should_handle_gracefully: + return # Exit gracefully for retention limitation error + + # For other errors, try to show partial failure info if possible + try: + _handle_stack_deletion_partial_failure(stack_name, region, resources, valid_retain, message_callback) + except Exception: + if logger: + logger.debug("Failed to show partial failure information") + + # Re-raise the original exception + raise diff --git a/src/sagemaker/hyperpod/cli/cluster_utils.py b/src/sagemaker/hyperpod/cli/cluster_utils.py new file mode 100644 index 00000000..cc7da3aa --- /dev/null +++ b/src/sagemaker/hyperpod/cli/cluster_utils.py @@ -0,0 +1,145 @@ +""" +Cluster utilities for EKS access validation and management. +""" + +import logging +from typing import Optional, Tuple, Dict, Any + +import boto3 +import botocore +from botocore.exceptions import ClientError + +logger = logging.getLogger(__name__) + + +def _get_current_aws_identity(session: boto3.Session) -> Tuple[str, str]: + """ + Get the current AWS identity (ARN and type). + + Args: + session: Boto3 session + + Returns: + Tuple of (principal_arn, identity_type) + """ + sts_client = session.client('sts') + identity = sts_client.get_caller_identity() + + arn = identity['Arn'] + + # Determine identity type + if ':user/' in arn: + identity_type = 'user' + elif ':role/' in arn: + identity_type = 'role' + elif ':assumed-role/' in arn: + identity_type = 'assumed-role' + # For assumed roles, we need to get the base role ARN + # arn:aws:sts::123456789012:assumed-role/MyRole/session-name + # becomes arn:aws:iam::123456789012:role/MyRole + parts = arn.split('/') + if len(parts) >= 3: + base_arn = arn.replace(':sts:', ':iam:').replace(':assumed-role/', ':role/').rsplit('/', 1)[0] + arn = base_arn + else: + identity_type = 'unknown' + + return arn, identity_type + + +def _check_access_entry_exists( + eks_client: botocore.client.BaseClient, + cluster_name: str, + principal_arn: str +) -> Tuple[bool, Optional[Dict[str, Any]], Optional[str]]: + """ + Check if the given principal has an access entry for the EKS cluster. + + Args: + eks_client: Boto3 EKS client + cluster_name: Name of the EKS cluster + principal_arn: ARN of the principal to check + + Returns: + Tuple of (has_access, access_entry_details, error_message) + """ + try: + response = eks_client.describe_access_entry( + clusterName=cluster_name, + principalArn=principal_arn + ) + return True, response.get('accessEntry'), None + + except ClientError as e: + error_code = e.response['Error']['Code'] + + if error_code == 'ResourceNotFoundException': + # No access entry found for this principal + return False, None, f"No access entry found for principal: {principal_arn}" + elif error_code == 'AccessDeniedException': + # User doesn't have permission to check access entries + return False, None, f"Access denied when checking access entries. You may not have eks:DescribeAccessEntry permission." + elif error_code == 'ClusterNotFoundException': + # Cluster doesn't exist + return False, None, f"EKS cluster '{cluster_name}' not found." 
+ else: + # Other error + return False, None, f"Error checking access entry: {e.response['Error']['Message']}" + + except Exception as e: + return False, None, f"Unexpected error checking access entry: {str(e)}" + + +def validate_eks_access_before_kubeconfig_update( + session: boto3.Session, + cluster_name: str, + eks_name: str +) -> Tuple[bool, str]: + """ + Validate that the current user has EKS access before attempting kubeconfig update. + + Args: + session: Boto3 session + cluster_name: Name of the HyperPod cluster (for error messages) + eks_name: Name of the EKS cluster + + Returns: + Tuple of (has_access, message) + """ + try: + # Get current AWS identity + principal_arn, identity_type = _get_current_aws_identity(session) + logger.debug(f"Current AWS identity: {principal_arn} (type: {identity_type})") + + # Create EKS client + eks_client = session.client('eks') + + # Check if the principal has an access entry + has_access, access_entry, error_msg = _check_access_entry_exists( + eks_client, eks_name, principal_arn + ) + + if has_access: + success_msg = f"✓ Access confirmed for {principal_arn}" + if access_entry: + kubernetes_groups = access_entry.get('kubernetesGroups', []) + username = access_entry.get('username', 'N/A') + success_msg += f"\n - Username: {username}" + success_msg += f"\n - Kubernetes Groups: {', '.join(kubernetes_groups) if kubernetes_groups else 'None'}" + return True, success_msg + else: + # Access validation failed - provide clear error message + error_message = ( + f"✗ Cannot connect to EKS cluster '{eks_name}': {error_msg}\n\n" + f"Your AWS identity '{principal_arn}' (type: {identity_type}) does not have an access entry " + f"for this EKS cluster.\n\n" + f"To resolve this issue:\n" + f"1. Contact your cluster administrator to add your identity to the EKS access entries\n" + f"2. Refer to this documentation to create an access entry: https://docs.aws.amazon.com/cli/latest/reference/eks/create-access-entry.html\n" + f"3. Verify your AWS credentials and region are correct\n" + f"4. Ensure you have the necessary EKS permissions (eks:DescribeAccessEntry)" + ) + return False, error_message + + except Exception as e: + return False, f"Unexpected error validating EKS access: {str(e)}" diff --git a/src/sagemaker/hyperpod/cli/commands/cluster.py b/src/sagemaker/hyperpod/cli/commands/cluster.py index 4f47dd3c..289a827a 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster.py @@ -14,8 +14,10 @@ import subprocess import json import sys +import signal import botocore.config from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Any, Dict, List, Optional, Tuple import boto3 @@ -55,6 +57,9 @@ set_logging_level, store_current_hyperpod_context, ) +from sagemaker.hyperpod.cli.cluster_utils import ( + validate_eks_access_before_kubeconfig_update, +) from sagemaker.hyperpod.cli.validators.cluster_validator import ( ClusterValidator, ) @@ -72,6 +77,8 @@ _hyperpod_telemetry_emitter, ) from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.cli.utils import convert_datetimes +from sagemaker_core.main.resources import Cluster RATE_LIMIT = 4 RATE_LIMIT_PERIOD = 1 # 1 second @@ -120,7 +127,7 @@ def list_cluster( debug: bool, namespace: Optional[List], ): - """List SageMaker Hyperpod Clusters with cluster metadata. + """List SageMaker Hyperpod Clusters with metadata. Example Usage: 1. 
List clusters with JSON output: hyperpod get-clusters -n hyperpod-ns-test-team @@ -191,30 +198,33 @@ def list_cluster( cluster_capacities: List[List[str]] = [] - counter = 0 - for cluster_name in cluster_names: - current_cluster_capacities_size = len(cluster_capacities) - rate_limited_operation( - cluster_name=cluster_name, - validator=validator, - sm_client=sm_client, - region=region, - temp_config_file=TEMP_KUBE_CONFIG_FILE, - cluster_capacities=cluster_capacities, - namespace=namespace, - ) - # cluster_capacities will only be updated when the cluster - # is a valid Hyperpod EKS cluster. This check avoid - # we skipped many Hyperpod Slurm clusters and didn't return - # any Hyperpod EKS clusters. - if len(cluster_capacities) > current_cluster_capacities_size: - counter += 1 - # Currently only support list <= 50 clusters - if counter >= 50: - logger.debug( - "The 'get-clusters' command has reached the maximum number of HyperPod clusters that can be listed, which is 50." - ) - break + # Process clusters in parallel with limited concurrency + if cluster_names: + with ThreadPoolExecutor(max_workers=len(cluster_names)) as executor: + futures = {} + counter = 0 + + for cluster_name in cluster_names[:50]: # Limit to 50 clusters + future = executor.submit( + rate_limited_operation, + cluster_name=cluster_name, + validator=validator, + sm_client=sm_client, + region=region, + temp_config_file=f"{TEMP_KUBE_CONFIG_FILE}_{cluster_name}", + namespace=namespace, + ) + futures[future] = cluster_name + + for future in as_completed(futures): + cluster_name = futures[future] + try: + result = future.result() + if result: # Only add if cluster processing was successful + cluster_capacities.extend(result) + counter += 1 + except Exception as e: + logger.error(f"Error processing cluster {cluster_name}: {e}") headers = [ "Cluster", @@ -233,7 +243,7 @@ def list_cluster( print(tabulate(cluster_capacities, headers=headers, tablefmt="presto")) elif output == OutputFormat.JSON.value: json_list = [dict(zip(headers, value)) for value in cluster_capacities] - _restructure_output(json_list, namespace) + json_list = _restructure_output(json_list, namespace) print(json.dumps(json_list, indent=4)) @@ -245,10 +255,42 @@ def rate_limited_operation( sm_client: BaseClient, region: Optional[str], temp_config_file: str, - cluster_capacities: List[List[str]], namespace: Optional[List[str]], -) -> None: +) -> Optional[List[List[str]]]: try: + cluster_capacities = [] # Initialize at the beginning + + # Get cluster details to check instance count + cluster_response = sm_client.describe_cluster(ClusterName=cluster_name) + cluster_status = cluster_response.get('ClusterStatus', 'Unknown') + + # Check if cluster has zero instances + instance_groups = cluster_response.get('InstanceGroups', []) + total_instances = sum( + group.get('CurrentCount', 0) for group in instance_groups + ) + + # If cluster has 0 instances, add it with 0 nodes + if total_instances == 0: + logger.info(f"Adding cluster {cluster_name} with 0 instances (status: {cluster_status})") + zero_instance_row = [ + cluster_name, + "N/A", # InstanceType + 0, # TotalNodes + 0, # AcceleratorDevicesAvailable + 0, # NodeHealthStatus=Schedulable + "N/A", # DeepHealthCheckStatus=Passed + ] + + # Add namespace columns with 0 values + if namespace: + for ns in namespace: + zero_instance_row.extend([0, 0]) # Total and Available accelerator devices + + cluster_capacities.append(zero_instance_row) + return cluster_capacities + + # Proceed with EKS validation for clusters with instances 
eks_cluster_arn = validator.validate_cluster_and_get_eks_arn( cluster_name, sm_client ) @@ -256,10 +298,10 @@ def rate_limited_operation( logger.warning( f"Cannot find EKS cluster behind {cluster_name}, continue..." ) - return + return None eks_cluster_name = get_name_from_arn(eks_cluster_arn) _update_kube_config(eks_cluster_name, region, temp_config_file) - k8s_client = KubernetesClient(is_get_capacity=True) + k8s_client = KubernetesClient(config_file=temp_config_file) nodes = k8s_client.list_node_with_temp_config( temp_config_file, SAGEMAKER_HYPERPOD_NAME_LABEL ) @@ -268,25 +310,27 @@ def rate_limited_operation( ns_nominal_quota = {} ns_quota_usage = {} - for ns in namespace: - sm_managed_namespace = k8s_client.get_sagemaker_managed_namespace(ns) - if sm_managed_namespace: - quota_allocation_id = sm_managed_namespace.metadata.labels[ - SAGEMAKER_QUOTA_ALLOCATION_LABEL - ] - cluster_queue_name = ( - HYPERPOD_NAMESPACE_PREFIX - + quota_allocation_id - + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX - ) - cluster_queue = k8s_client.get_cluster_queue(cluster_queue_name) - nominal_quota = _get_cluster_queue_nominal_quota(cluster_queue) - quota_usage = _get_cluster_queue_quota_usage(cluster_queue) - ns_nominal_quota[ns] = nominal_quota - ns_quota_usage[ns] = quota_usage - else: - ns_nominal_quota[ns] = {} - ns_quota_usage[ns] = {} + if namespace: + for ns in namespace: + sm_managed_namespace = k8s_client.get_sagemaker_managed_namespace(ns) + if sm_managed_namespace: + quota_allocation_id = sm_managed_namespace.metadata.labels[ + SAGEMAKER_QUOTA_ALLOCATION_LABEL + ] + cluster_queue_name = ( + HYPERPOD_NAMESPACE_PREFIX + + quota_allocation_id + + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX + ) + + cluster_queue = k8s_client.get_cluster_queue(cluster_queue_name) + nominal_quota = _get_cluster_queue_nominal_quota(cluster_queue) + quota_usage = _get_cluster_queue_quota_usage(cluster_queue) + ns_nominal_quota[ns] = nominal_quota + ns_quota_usage[ns] = quota_usage + else: + ns_nominal_quota[ns] = {} + ns_quota_usage[ns] = {} for instance_type, nodes_summary in nodes_info.items(): capacities = [ @@ -297,23 +341,26 @@ def rate_limited_operation( nodes_summary["schedulable"], nodes_summary["deep_health_check_passed"], ] - for ns in namespace: - capacities.append( - ns_nominal_quota.get(ns) - .get(instance_type, {}) - .get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A") - ) - capacities.append( - _get_available_quota( - ns_nominal_quota.get(ns), - ns_quota_usage.get(ns), - instance_type, - NVIDIA_GPU_RESOURCE_LIMIT_KEY, + if namespace: + for ns in namespace: + capacities.append( + ns_nominal_quota.get(ns) + .get(instance_type, {}) + .get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A") + ) + capacities.append( + _get_available_quota( + ns_nominal_quota.get(ns), + ns_quota_usage.get(ns), + instance_type, + NVIDIA_GPU_RESOURCE_LIMIT_KEY, + ) ) - ) cluster_capacities.append(capacities) + return cluster_capacities except Exception as e: logger.error(f"Error processing cluster {cluster_name}: {e}, continue...") + return None def _get_cluster_queue_nominal_quota(cluster_queue): @@ -379,23 +426,34 @@ def _get_hyperpod_clusters(sm_client: boto3.client) -> List[str]: def _restructure_output(summary_list, namespaces): - if not namespaces: - return + cluster_dict = dict() for node_summary in summary_list: - node_summary["Namespaces"] = {} - for ns in namespaces: - available_accelerators = node_summary[ - ns + AVAILABLE_ACCELERATOR_DEVICES_KEY - ] - total_accelerators = node_summary[ns + TOTAL_ACCELERATOR_DEVICES_KEY] - quota_accelerator_info = { - 
AVAILABLE_ACCELERATOR_DEVICES_KEY: available_accelerators, - TOTAL_ACCELERATOR_DEVICES_KEY: total_accelerators, + cluster_name = node_summary["Cluster"] + if cluster_name not in cluster_dict: + cluster_dict[cluster_name] = { + "Cluster": cluster_name, + "Instances": [] } - node_summary["Namespaces"][ns] = quota_accelerator_info - node_summary.pop(ns + AVAILABLE_ACCELERATOR_DEVICES_KEY, None) - node_summary.pop(ns + TOTAL_ACCELERATOR_DEVICES_KEY, None) + node_summary.pop("Cluster") + if namespaces: + node_summary["Namespaces"] = {} + for ns in namespaces: + available_accelerators = node_summary[ + ns + AVAILABLE_ACCELERATOR_DEVICES_KEY + ] + total_accelerators = node_summary[ns + TOTAL_ACCELERATOR_DEVICES_KEY] + quota_accelerator_info = { + AVAILABLE_ACCELERATOR_DEVICES_KEY: available_accelerators, + TOTAL_ACCELERATOR_DEVICES_KEY: total_accelerators, + } + node_summary["Namespaces"][ns] = quota_accelerator_info + node_summary.pop(ns + AVAILABLE_ACCELERATOR_DEVICES_KEY, None) + node_summary.pop(ns + TOTAL_ACCELERATOR_DEVICES_KEY, None) + cluster_dict[cluster_name]["Instances"].append(node_summary) + + return list(cluster_dict.values()) + def _aggregate_nodes_info( @@ -508,19 +566,34 @@ def set_cluster_context( """ if debug: set_logging_level(logger, logging.DEBUG) - validator = ClusterValidator() - botocore_config = botocore.config.Config( - user_agent_extra=get_user_agent_extra_suffix() - ) - session = boto3.Session(region_name=region) if region else boto3.Session() - if not validator.validate_aws_credential(session): - logger.error("Cannot connect to HyperPod cluster due to aws credentials error") - sys.exit(1) - + + timeout = 60 # 1 minute + + def timeout_handler(signum, frame): + raise TimeoutError(f"Operation timed out after {timeout} seconds") + + # Set up timeout + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(timeout) + try: + validator = ClusterValidator() + botocore_config = botocore.config.Config( + user_agent_extra=get_user_agent_extra_suffix() + ) + session = boto3.Session(region_name=region) if region else boto3.Session() + if not validator.validate_aws_credential(session): + logger.error("Cannot connect to HyperPod cluster due to aws credentials error") + sys.exit(1) + sm_client = get_sagemaker_client(session, botocore_config) hp_cluster_details = sm_client.describe_cluster(ClusterName=cluster_name) logger.debug("Fetched hyperpod cluster details") + + # Check if cluster is EKS-orchestrated + if "Orchestrator" not in hp_cluster_details or "Eks" not in hp_cluster_details.get("Orchestrator", {}): + raise ValueError(f"Cluster '{cluster_name}' is not EKS-orchestrated. 
HyperPod CLI only supports EKS-orchestrated clusters.") + store_current_hyperpod_context(hp_cluster_details) eks_cluster_arn = hp_cluster_details["Orchestrator"]["Eks"]["ClusterArn"] logger.debug( @@ -528,9 +601,40 @@ def set_cluster_context( ) eks_name = get_name_from_arn(eks_cluster_arn) + + # Proactively validate EKS access before attempting kubeconfig update + logger.debug("Validating EKS access entries before kubeconfig update...") + try: + has_access, message = validate_eks_access_before_kubeconfig_update( + session, cluster_name, eks_name + ) + + if has_access: + logger.debug(message) + else: + # Access validation failed - provide clear error message + logger.error(message) + sys.exit(1) + + except Exception as validation_error: + # If access validation fails unexpectedly, log warning but continue + # This ensures backward compatibility if the validation has issues + logger.warning( + f"Could not validate EKS access entries: {validation_error}. " + f"Proceeding with kubeconfig update..." + ) + _update_kube_config(eks_name, region, None) k8s_client = KubernetesClient() k8s_client.set_context(eks_cluster_arn, namespace) + + # Cancel the alarm if operation completes successfully + signal.alarm(0) + logger.info(f"Successfully connected to cluster {cluster_name}") + + except TimeoutError as e: + logger.error("Timed out - Please check credentials, setup configurations and try again") + sys.exit(1) except botocore.exceptions.NoRegionError: logger.error( f"Please ensure you configured AWS default region or use '--region' argument to specify the region" @@ -541,6 +645,9 @@ def set_cluster_context( f"Unexpected error happens when try to connect to cluster {cluster_name}. Error: {e}" ) sys.exit(1) + finally: + # Ensure alarm is cancelled in all cases + signal.alarm(0) @click.command() @@ -553,7 +660,7 @@ def get_cluster_context( debug: bool, ) -> Tuple[Any, str]: """ - Get all the context related to the current set Cluster + Get context related to the current set cluster. Args: debug (bool): Enable debug mode. @@ -579,12 +686,81 @@ def get_cluster_context( sys.exit(1) +@click.command("cluster") +@click.argument("cluster-name", required=True) +@click.option("--region", help="AWS region") +@click.option("--debug", is_flag=True, help="Enable debug logging") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "describe_cluster_cli") +def describe_cluster(cluster_name: str, debug: bool, region: str) -> None: + """Describe the status of a HyperPod cluster. + Shows detailed information about a SageMaker HyperPod cluster including its current status, + instance groups, orchestrator details, and configuration. 
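Editor's note: the timeout guard that `set_cluster_context` now wraps around its work can be shown as a standalone sketch. `SIGALRM` is Unix-only, the 60-second limit mirrors the value in the diff, and `work()` is a placeholder; this is illustrative, not the CLI implementation.

```python
# Sketch of a SIGALRM-based timeout wrapper (Unix only), assuming a 60-second limit.
import signal

TIMEOUT_SECONDS = 60  # matches the 1-minute limit used in set_cluster_context

def _timeout_handler(signum, frame):
    raise TimeoutError(f"Operation timed out after {TIMEOUT_SECONDS} seconds")

def run_with_timeout(work):
    """Run work() but raise TimeoutError if it exceeds TIMEOUT_SECONDS."""
    signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(TIMEOUT_SECONDS)
    try:
        return work()
    finally:
        # Always cancel the pending alarm, whether work() succeeded or raised.
        signal.alarm(0)

if __name__ == "__main__":
    print(run_with_timeout(lambda: "connected"))
```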
+ Usage Examples + # Describe a cluster + hyp describe cluster my-cluster-name + # Describe with specific region + hyp describe cluster my-cluster-name --region us-west-2 + """ + if debug: + set_logging_level(logger, logging.DEBUG) + + try: + botocore_config = botocore.config.Config( + user_agent_extra=get_user_agent_extra_suffix() + ) + session = boto3.Session(region_name=region) if region else boto3.Session() + sm_client = get_sagemaker_client(session, botocore_config) + + # Get cluster details using SageMaker client + cluster_dict = sm_client.describe_cluster(ClusterName=cluster_name) + + # Convert datetimes for display + cluster_dict = convert_datetimes(cluster_dict) + + logger.debug(f"Describing cluster name: {cluster_name}\ninfo: {json.dumps(cluster_dict, indent=2, default=str)}") + + click.echo(f"📋 Cluster Details for: {cluster_name}") + + # Highlight cluster status + cluster_status = cluster_dict.get('ClusterStatus', 'UNKNOWN') + click.echo(f"Status: ", nl=False) + click.secho(cluster_status) + + table_data = [] + for key, value in cluster_dict.items(): + if isinstance(value, (dict, list)): + formatted_value = json.dumps(value, indent=2, default=str) + else: + formatted_value = str(value) + table_data.append([key, formatted_value]) + + # Only display table if we have data + if table_data: + click.echo(tabulate(table_data, tablefmt="presto")) + else: + click.echo("No cluster data available") + + except Exception as e: + logger.error(f"Failed to describe cluster: {e}") + if debug: + logger.exception("Detailed error information:") + + if "does not exist" in str(e) or "not found" in str(e).lower(): + click.echo(f"❌ Cluster '{cluster_name}' not found") + elif "AccessDenied" in str(e): + click.echo("❌ Access denied. Check AWS permissions") + else: + click.echo(f"❌ Error describing cluster: {e}") + + sys.exit(1) + + @click.command() @click.option("--grafana", is_flag=True, help="Returns Grafana Dashboard URL") @click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL") @click.option("--list", is_flag=True, help="Returns list of available metrics") def get_monitoring(grafana: bool, prometheus: bool, list: bool) -> None: - """Get monitoring configurations for Hyperpod cluster""" + """Get monitoring configurations for Hyperpod cluster.""" try: if not any([grafana, prometheus, list]): print("Error: Please select at least one option") diff --git a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py new file mode 100644 index 00000000..2a278086 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py @@ -0,0 +1,375 @@ +""" +Command module for HyperPod cluster stack operations. 
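Editor's note: the two-column `presto` table that the new `hyp describe cluster` command prints can be reproduced with a few lines of `tabulate`. The payload below is fabricated for illustration; only the key/value formatting mirrors the diff.

```python
# Illustrative rendering of a DescribeCluster-style dict as a two-column table.
import json
from tabulate import tabulate

cluster = {  # fabricated example payload, not real API output
    "ClusterName": "my-cluster",
    "ClusterStatus": "InService",
    "InstanceGroups": [{"InstanceGroupName": "workers", "CurrentCount": 2}],
}

table_data = []
for key, value in cluster.items():
    if isinstance(value, (dict, list)):
        formatted_value = json.dumps(value, indent=2, default=str)
    else:
        formatted_value = str(value)
    table_data.append([key, formatted_value])

print(tabulate(table_data, tablefmt="presto"))
```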
+""" + +import ast +import logging +import click +import json +import os +from typing import Optional + +from sagemaker_core.main.resources import Cluster +from sagemaker_core.main.shapes import ClusterInstanceGroupSpecification + +from tabulate import tabulate +from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack +from sagemaker.hyperpod.common.telemetry import _hyperpod_telemetry_emitter +from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.common.utils import setup_logging +from sagemaker.hyperpod.cli.utils import convert_datetimes +from sagemaker.hyperpod.cli.init_utils import _filter_cli_metadata_fields +from sagemaker.hyperpod.cli.init_utils import load_config +from sagemaker.hyperpod.cli.constants.init_constants import TEMPLATES +from pathlib import Path +from sagemaker.hyperpod.cli.cluster_stack_utils import ( + StackNotFoundError, + delete_stack_with_confirmation +) + +logger = logging.getLogger(__name__) + + +def parse_status_list(ctx, param, value): + """Parse status list from string format like "['CREATE_COMPLETE', 'UPDATE_COMPLETE']" """ + if not value: + return None + + try: + # Handle both string representation and direct list + if isinstance(value, str): + # Parse string like "['item1', 'item2']" + parsed = ast.literal_eval(value) + if isinstance(parsed, list): + return parsed + else: + raise click.BadParameter(f"Expected list format, got: {type(parsed).__name__}") + return value + except (ValueError, SyntaxError) as e: + raise click.BadParameter(f"Invalid list format. Use: \"['STATUS1', 'STATUS2']\". Error: {e}") + + +@click.command("cluster-stack") +@click.argument("config-file", required=True) +@click.argument("stack-name", required=True) +@click.option("--region", help="AWS region") +@click.option("--template-version", type=click.INT, help="Version number of cluster creation template") +@click.option("--debug", is_flag=True, help="Enable debug logging") +def create_cluster_stack(config_file, region, template_version, debug): + """Create a new HyperPod cluster stack using the provided configuration. + + Creates a CloudFormation stack for a HyperPod cluster using settings from a YAML configuration file. + The stack will provision all necessary AWS resources for the cluster. + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: bash + + # Create cluster stack with config file + hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2 --template-version 1 + + # Create with debug logging + hyp create hyp-cluster cluster-config.yaml my-stack-name --debug + """ + try: + # Validate the config file path + if not os.path.exists(config_file): + logger.error(f"Config file not found: {config_file}") + return + + # Load config to get template and version + + config_dir = Path(config_file).parent + data, template, version = load_config(config_dir) + + # Get model from registry + registry = TEMPLATES[template]["registry"] + model_class = registry.get(str(version)) + + if model_class: + # Filter out CLI metadata fields + filtered_config = _filter_cli_metadata_fields(data) + + # Create model instance and domain + model_instance = model_class(**filtered_config) + config = model_instance.to_config(region=region) + + # Create the cluster stack + stack_id = HpClusterStack(**config).create(region, template_version) + + logger.info(f"Stack creation initiated successfully with ID: {stack_id}") + logger.info("You can monitor the stack creation in the AWS CloudFormation console.") + + except Exception as e: + logger.error(f"Failed to create cluster stack: {e}") + if debug: + logger.exception("Detailed error information:") + raise click.ClickException(str(e)) + + +@click.command("cluster-stack") +@click.argument("stack-name", required=True) +@click.option("--region", help="AWS region") +@click.option("--debug", is_flag=True, help="Enable debug logging") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "describe_cluster_stack_cli") +def describe_cluster_stack(stack_name: str, debug: bool, region: str) -> None: + """Describe the status of a HyperPod cluster stack. + + Shows detailed information about a CloudFormation stack including its current status, + resources, and configuration parameters. + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: bash + + # Describe a cluster stack + hyp describe hyp-cluster my-stack-name + + # Describe with specific region + hyp describe hyp-cluster my-stack-name --region us-west-2 + """ + logger = setup_logging(logging.getLogger(__name__), debug) + + try: + stack_info = HpClusterStack.describe(stack_name=stack_name, region=region) + + if not stack_info or 'Stacks' not in stack_info or not stack_info['Stacks']: + click.secho(f"❌ Stack '{stack_name}' not found", fg='red') + return + + stack = stack_info['Stacks'][0] + + logger.debug(f"Describing stack name: {stack_name}\ninfo: {json.dumps(stack_info, indent=2, default=str)}") + + click.echo(f"📋 Stack Details for: {stack_name}") + + # Highlight stack status + stack_status = stack.get('StackStatus', 'UNKNOWN') + click.echo(f"Status: ", nl=False) + click.secho(stack_status) + + table_data = [] + for key, value in stack.items(): + if isinstance(value, (dict, list)): + formatted_value = json.dumps(value, indent=2, default=str) + else: + formatted_value = str(value) + table_data.append([key, formatted_value]) + + # Calculate column widths + max_field_width = max(len(str(row[0])) for row in table_data) + max_value_width = max(len(str(row[1]).split('\n')[0]) for row in table_data) # First line only for width calc + + # Add headers with matching separators (presto format adds spaces around |) + field_header = "Field".ljust(max_field_width) + value_header = "Value".ljust(max_value_width) + click.echo(f" {field_header} | {value_header} ") + click.echo(f"-{'-' * max_field_width}-+-{'-' * max_value_width}-") + + click.echo(tabulate(table_data, tablefmt="presto")) + + except Exception as e: + logger.error(f"Failed to describe stack: {e}") + if debug: + logger.exception("Detailed error information:") + + if "does not exist" in str(e): + click.echo(f"❌ Stack '{stack_name}' not found") + elif "AccessDenied" in str(e): + click.echo("❌ Access denied. Check AWS permissions") + else: + click.echo(f"❌ Error describing stack: {e}") + + raise click.ClickException(str(e)) + + +@click.command("cluster-stack") +@click.option("--region", help="AWS region") +@click.option("--debug", is_flag=True, help="Enable debug logging") +@click.option("--status", + callback=parse_status_list, + help="Filter by stack status. Format: \"['CREATE_COMPLETE', 'UPDATE_COMPLETE']\"") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_cluster_stack_cli") +def list_cluster_stacks(region, debug, status): + """List all HyperPod cluster stacks. + + Displays a summary of all CloudFormation stacks related to HyperPod clusters + in the specified region or default region. + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: bash + + # List all cluster stacks + hyp list hyp-cluster + + # List stacks in specific region + hyp list hyp-cluster --region us-east-1 + """ + logger = setup_logging(logging.getLogger(__name__), debug) + + try: + stacks_info = HpClusterStack.list(region=region, stack_status_filter=status) + + if not stacks_info or 'StackSummaries' not in stacks_info: + click.secho("No stacks found", fg='yellow') + return + + stack_summaries = stacks_info['StackSummaries'] + + # Convert datetimes for display + stack_summaries = [convert_datetimes(stack) for stack in stack_summaries] + + logger.debug(f"Listing stacks in region: {region or 'default'}") + + click.echo(f"📋 HyperPod Cluster Stacks ({len(stack_summaries)} found)") + + if stack_summaries: + for i, stack in enumerate(stack_summaries, 1): + try: + click.echo(f"\n[{i}] Stack Details:") + + table_data = [] + for key, value in stack.items(): + table_data.append([key, str(value)]) + + click.echo(tabulate(table_data, headers=["Field", "Value"], tablefmt="presto")) + except Exception as e: + logger.error(f"Error processing stack {i}: {e}") + click.echo(f"❌ Error processing stack {i}: {stack.get('StackName', 'Unknown')}") + continue + else: + click.echo("No stacks found") + + except Exception as e: + logger.error(f"Failed to list stacks: {e}") + if debug: + logger.exception("Detailed error information:") + + if "AccessDenied" in str(e) or "Insufficient permissions" in str(e): + click.secho("❌ Access denied. Check AWS permissions", fg='red') + else: + click.secho(f"❌ Error listing stacks: {e}", fg='red') + + raise click.ClickException(str(e)) + + +@click.command("cluster-stack") +@click.argument("stack-name", required=True) +@click.option("--retain-resources", help="Comma-separated list of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks). Resource names are shown in failed deletion output, or use AWS CLI: 'aws cloudformation list-stack-resources --stack-name STACK_NAME --region REGION'") +@click.option("--region", required=True, help="AWS region") +@click.option("--debug", is_flag=True, help="Enable debug logging") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_cluster_stack_cli") +def delete_cluster_stack(stack_name: str, retain_resources: str, region: str, debug: bool) -> None: + """Delete a HyperPod cluster stack. + + Removes the specified CloudFormation stack and all associated AWS resources. + This operation cannot be undone. + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: bash + + # Delete a cluster stack + hyp delete cluster-stack my-stack-name --region us-west-2 + + # Delete with retained resources (only works on DELETE_FAILED stacks) + hyp delete cluster-stack my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models --region us-west-2 + hyp delete cluster-stack my-stack-name --region us-west-2 + + # Delete with retained resources (only works on DELETE_FAILED stacks) + hyp delete cluster-stack my-stack-name --retain-resources S3Bucket-TrainingData,EFSFileSystem-Models --region us-west-2 + """ + logger = setup_logging(logging.getLogger(__name__), debug) + + try: + # Use the high-level orchestration function with CLI-specific callbacks + delete_stack_with_confirmation( + stack_name=stack_name, + region=region, + retain_resources_str=retain_resources or "", + message_callback=click.echo, + confirm_callback=lambda msg: click.confirm("Continue?", default=False), + success_callback=lambda msg: click.echo(f"✓ {msg}") + ) + + except StackNotFoundError: + click.secho(f"❌ Stack '{stack_name}' not found", fg='red') + except click.ClickException: + # Re-raise ClickException for proper CLI error handling + raise + except Exception as e: + logger.error(f"Failed to delete stack: {e}") + if debug: + logger.exception("Detailed error information:") + raise click.ClickException(str(e)) + + +@click.command("cluster") +@click.option("--cluster-name", required=True, help="The name of the cluster to update") +@click.option("--instance-groups", help="Instance Groups JSON string") +@click.option("--instance-groups-to-delete", help="Instance Groups to delete JSON string") +@click.option("--region", help="Region") +@click.option("--node-recovery", help="Node Recovery (Automatic or None)") +@click.option("--debug", is_flag=True, help="Enable debug logging") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "update_cluster_cli") +def update_cluster( + cluster_name: str, + instance_groups: Optional[str], + instance_groups_to_delete: Optional[str], + region: Optional[str], + node_recovery: Optional[str], + debug: bool) -> None: + """Update an existing HyperPod cluster configuration. + + Modifies cluster settings such as instance groups and node recovery policies. + At least one update parameter must be provided. + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: bash + + # Update cluster with new instance groups + hyp update hyp-cluster --cluster-name my-cluster --instance-groups '{"group1": {...}}' + + # Update node recovery setting + hyp update hyp-cluster --cluster-name my-cluster --node-recovery Automatic + """ + """Update an existing HyperPod cluster configuration.""" + logger = setup_logging(logging.getLogger(__name__), debug) + + # Validate that at least one parameter is provided + if not any([instance_groups, instance_groups_to_delete, node_recovery]): + raise click.ClickException("At least one of --instance-groups, --instance-groups-to-delete, or --node-recovery must be provided") + + cluster = Cluster.get(cluster_name=cluster_name, region=region) + + # Prepare update parameters + update_params = {} + + # Convert instance_groups to list of ClusterInstanceGroupSpecification + if instance_groups: + if isinstance(instance_groups, str): + instance_groups = json.loads(instance_groups) + update_params['instance_groups'] = [ClusterInstanceGroupSpecification(**ig) for ig in instance_groups] + + # Convert instance_groups_to_delete to list of strings + if instance_groups_to_delete: + if isinstance(instance_groups_to_delete, str): + instance_groups_to_delete = json.loads(instance_groups_to_delete) + update_params['instance_groups_to_delete'] = instance_groups_to_delete + + # Add node_recovery if provided + if node_recovery: + update_params['node_recovery'] = node_recovery + + click.secho(f"Update Params: {update_params}") + cluster.update(**update_params) + + logger.info("Cluster has been updated") + click.secho(f"Cluster {cluster_name} has been updated") diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py index 35b44d02..f63cb590 100644 --- a/src/sagemaker/hyperpod/cli/commands/inference.py +++ b/src/sagemaker/hyperpod/cli/commands/inference.py @@ -10,50 +10,48 @@ from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint from sagemaker_core.resources import Endpoint +from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( + _hyperpod_telemetry_emitter, +) +from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.common.cli_decorators import handle_cli_exceptions +from sagemaker.hyperpod.common.utils import display_formatted_logs # CREATE @click.command("hyp-jumpstart-endpoint") -@click.option( - "--namespace", - type=click.STRING, - required=False, - default="default", - help="Optional. The namespace of the jumpstart model endpoint to create. Default set to 'default'", -) @click.option("--version", default="1.0", help="Schema version to use") +@click.option("--debug", default=False, help="Enable debug mode") @generate_click_command( schema_pkg="hyperpod_jumpstart_inference_template", registry=JS_REG, ) -def js_create(namespace, version, js_endpoint): +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_js_endpoint_cli") +@handle_cli_exceptions() +def js_create(version, debug, js_endpoint): """ Create a jumpstart model endpoint. """ - - js_endpoint.create(namespace=namespace) + click.echo(f"Using version: {version}") + js_endpoint.create(debug=debug) @click.command("hyp-custom-endpoint") -@click.option( - "--namespace", - type=click.STRING, - required=False, - default="default", - help="Optional. The namespace of the jumpstart model endpoint to create. 
Default set to 'default'", -) @click.option("--version", default="1.0", help="Schema version to use") +@click.option("--debug", default=False, help="Enable debug mode") @generate_click_command( schema_pkg="hyperpod_custom_inference_template", registry=C_REG, ) -def custom_create(namespace, version, custom_endpoint): +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_custom_endpoint_cli") +@handle_cli_exceptions() +def custom_create(version, debug, custom_endpoint): """ Create a custom model endpoint. """ - - custom_endpoint.create(namespace=namespace) - + click.echo(f"Using version: {version}") + custom_endpoint.create(debug=debug) + # INVOKE @click.command("hyp-custom-endpoint") @@ -76,13 +74,15 @@ def custom_create(namespace, version, custom_endpoint): default="application/json", help="Optional. The content type of the request to invoke. Default set to 'application/json'", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "invoke_custom_endpoint_cli") +@handle_cli_exceptions() def custom_invoke( endpoint_name: str, body: str, content_type: Optional[str] ): """ - Invoke a model endpoint. + Invoke a custom model endpoint. """ try: payload = json.dumps(json.loads(body)) @@ -128,13 +128,14 @@ def custom_invoke( default="default", help="Optional. The namespace of the jumpstart model endpoint to list. Default set to 'default'", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_js_endpoints_cli") +@handle_cli_exceptions() def js_list( namespace: Optional[str], ): """ - List jumpstart model endpoints with provided namespace. + List all Hyperpod Jumpstart model endpoints. """ - endpoints = HPJumpStartEndpoint.model_construct().list(namespace) data = [ep.model_dump() for ep in endpoints] @@ -170,13 +171,14 @@ def js_list( default="default", help="Optional. The namespace of the custom model endpoint to list. Default set to 'default'", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_custom_endpoints_cli") +@handle_cli_exceptions() def custom_list( namespace: Optional[str], ): """ - List custom model endpoints with provided namespace. + List all Hyperpod custom model endpoints. """ - endpoints = HPEndpoint.model_construct().list(namespace) data = [ep.model_dump() for ep in endpoints] @@ -226,15 +228,16 @@ def custom_list( required=False, help="Optional. If set to `True`, the full json will be displayed", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_js_endpoint_cli") +@handle_cli_exceptions() def js_describe( name: str, namespace: Optional[str], full: bool ): """ - Describe a jumpstart model endpoint with provided name and namespace. + Describe a Hyperpod Jumpstart model endpoint. 
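Editor's note: the payload handling in `custom_invoke` (parse the user-supplied `--body`, then re-serialize it compactly) is easy to exercise on its own. The `ValueError` wrapper below is an assumption for the sketch; the CLI's own error handling may differ.

```python
# Sketch of the body normalization performed before invoking an endpoint.
import json

def normalize_body(body: str) -> str:
    """Parse the user-supplied JSON and re-serialize it compactly."""
    try:
        return json.dumps(json.loads(body))
    except json.JSONDecodeError as exc:
        raise ValueError(f"--body must be valid JSON: {exc}") from exc

print(normalize_body('{"inputs": "What is machine learning?"}'))
```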
""" - my_endpoint = HPJumpStartEndpoint.model_construct().get(name, namespace) data = my_endpoint.model_dump() @@ -246,15 +249,27 @@ def js_describe( if not isinstance(data, dict): click.echo("Invalid data received: expected a dictionary.") return - + + click.echo("\nDeployment (should be completed in 1-5 min):") + status = data.get("status") or {} metadata = data.get("metadata") or {} model = data.get("model") or {} server = data.get("server") or {} tls = data.get("tlsConfig") or {} + raw_state = status.get("deploymentStatus", {}) \ + .get("deploymentObjectOverallState", "") or "" + if raw_state == "DeploymentComplete": + fg = "green" + elif raw_state == "DeploymentInProgress": + fg = "yellow" + else: + fg = "red" + colored_state = click.style(raw_state, fg=fg, bold=True) + summary = [ - ("Deployment State:", status.get("deploymentStatus", {}).get("deploymentObjectOverallState", "")), + ("Status:", colored_state), ("Metadata Name:", metadata.get("name", "")), ("Namespace:", metadata.get("namespace", "")), ("Label:", metadata.get("label", "")), @@ -266,27 +281,16 @@ def js_describe( ] click.echo(tabulate(summary, tablefmt="plain")) - click.echo("\nSageMaker Endpoint:") - status = data.get("status") or {} - endpoints = status.get("endpoints") or {} - sagemaker_info = endpoints.get("sagemaker") - if not sagemaker_info: - click.secho(" ", fg="yellow") - else: - ep_rows = [ - ("State:", data.get("status", {}).get("endpoints", {}).get("sagemaker", {}).get("state")), - ("Name:", data.get("sageMakerEndpoint", {}).get("name")), - ("ARN:", data.get("status", {}).get("endpoints", {}).get("sagemaker", {}).get("endpointArn")), - ] - click.echo(tabulate(ep_rows, tablefmt="plain")) - - click.echo("\nConditions:") + click.echo("\nDeployment Status Conditions:") status = data.get("status") if isinstance(data, dict) else {} - status = status or {} - conds = status.get("conditions", []) + status = status or {} - if isinstance(conds, list) and conds: + deployment_status = status.get("deploymentStatus") or {} + dep_status_inner = deployment_status.get("status") or {} + dep_conds = dep_status_inner.get("conditions") or [] + + if isinstance(dep_conds, list) and dep_conds: headers = ["TYPE", "STATUS", "LAST TRANSITION", "LAST UPDATE", "MESSAGE"] rows = [ [ @@ -296,22 +300,45 @@ def js_describe( c.get("lastUpdateTime", ""), c.get("message") or "" ] - for c in conds if isinstance(c, dict) + for c in dep_conds if isinstance(c, dict) ] click.echo(tabulate(rows, headers=headers, tablefmt="github")) else: click.echo(" ") - click.echo("\nDeploymentStatus Conditions:") + click.echo() + click.echo(click.style("─" * 60, fg="white")) + + click.echo("\nSageMaker Endpoint (takes ~10 min to create):") + status = data.get("status") or {} + endpoints = status.get("endpoints") or {} + sagemaker_info = endpoints.get("sagemaker") - status = data.get("status") if isinstance(data, dict) else {} - status = status or {} + if not sagemaker_info: + click.secho(" ", fg="yellow") + else: + raw_state = sagemaker_info.get("state", "") or "" + if raw_state == "CreationCompleted": + fg = "green" + elif raw_state == "CreationInProgress": + fg = "yellow" + else: + fg = "red" + colored_state = click.style(raw_state, fg=fg, bold=True) + ep_rows = [ + ("Status:", colored_state), + ("Name:", data.get("sageMakerEndpoint", {}).get("name")), + ("ARN:", sagemaker_info.get("endpointArn")), + ] + click.echo(tabulate(ep_rows, tablefmt="plain")) - deployment_status = status.get("deploymentStatus") or {} - dep_status_inner = deployment_status.get("status") 
or {} - dep_conds = dep_status_inner.get("conditions") or [] + click.echo("\nSagemaker Endpoint Status Conditions:") - if isinstance(dep_conds, list) and dep_conds: + status = data.get("status") if isinstance(data, dict) else {} + status = status or {} + conds = status.get("conditions", []) + + if isinstance(conds, list) and conds: headers = ["TYPE", "STATUS", "LAST TRANSITION", "LAST UPDATE", "MESSAGE"] rows = [ [ @@ -321,7 +348,7 @@ def js_describe( c.get("lastUpdateTime", ""), c.get("message") or "" ] - for c in dep_conds if isinstance(c, dict) + for c in conds if isinstance(c, dict) ] click.echo(tabulate(rows, headers=headers, tablefmt="github")) else: @@ -350,15 +377,16 @@ def js_describe( required=False, help="Optional. If set to `True`, the full json will be displayed", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_custom_endpoint_cli") +@handle_cli_exceptions() def custom_describe( name: str, namespace: Optional[str], full: bool ): """ - Describe a custom model endpoint with provided name and namespace. + Describe a Hyperpod custom model endpoint. """ - my_endpoint = HPEndpoint.model_construct().get(name, namespace) data = my_endpoint.model_dump() @@ -371,7 +399,8 @@ def custom_describe( click.echo("Invalid data received: expected a dictionary.") return - # Safe access blocks + click.echo("\nDeployment (should be completed in 1-5 min):") + status = data.get("status") or {} metadata = data.get("metadata") or {} metrics = data.get("metrics") or {} @@ -385,8 +414,18 @@ def custom_describe( model_port = worker.get("modelInvocationPort") or {} cloudwatch = data.get("autoScalingSpec", {}).get("cloudWatchTrigger") or {} + raw_state = status.get("deploymentStatus", {}) \ + .get("deploymentObjectOverallState", "") or "" + if raw_state == "DeploymentComplete": + fg = "green" + elif raw_state == "DeploymentInProgress": + fg = "yellow" + else: + fg = "red" + colored_state = click.style(raw_state, fg=fg, bold=True) + summary = [ - ("Deployment State:", status.get("deploymentStatus", {}).get("deploymentObjectOverallState", "")), + ("Deployment State:", colored_state), ("Metadata Name:", metadata.get("name", "")), ("Namespace:", metadata.get("namespace", "")), ("Label:", metadata.get("label", "")), @@ -425,22 +464,16 @@ def custom_describe( click.echo(tabulate(summary, tablefmt="plain")) - click.echo("\nSageMaker Endpoint:") - sm_endpoints = status.get("endpoints") or {} - sagemaker_info = sm_endpoints.get("sagemaker") - if not sagemaker_info: - click.secho(" ", fg="yellow") - else: - ep_rows = [ - ("State:", sm_endpoints.get("sagemaker", {}).get("state", "")), - ("Name:", data.get("sageMakerEndpoint", {}).get("name", "")), - ("ARN:", sm_endpoints.get("sagemaker", {}).get("endpointArn", "")), - ] - click.echo(tabulate(ep_rows, tablefmt="plain")) + click.echo("\nDeployment Status Conditions:") - click.echo("\nConditions:") - conds = status.get("conditions", []) - if isinstance(conds, list) and conds: + status = data.get("status") if isinstance(data, dict) else {} + status = status or {} + + deployment_status = status.get("deploymentStatus") or {} + dep_status_inner = deployment_status.get("status") or {} + dep_conds = dep_status_inner.get("conditions") or [] + + if isinstance(dep_conds, list) and dep_conds: headers = ["TYPE", "STATUS", "LAST TRANSITION", "LAST UPDATE", "MESSAGE"] rows = [ [ @@ -450,17 +483,45 @@ def custom_describe( c.get("lastUpdateTime", ""), c.get("message") or "" ] - for c in conds if isinstance(c, dict) + for c in dep_conds if isinstance(c, dict) ] 
click.echo(tabulate(rows, headers=headers, tablefmt="github")) else: click.echo(" ") - click.echo("\nDeploymentStatus Conditions:") - deployment_status = status.get("deploymentStatus") or {} - dep_status_inner = deployment_status.get("status") or {} - dep_conds = dep_status_inner.get("conditions") or [] - if isinstance(dep_conds, list) and dep_conds: + click.echo() + click.echo(click.style("─" * 60, fg="white")) + + click.echo("\nSageMaker Endpoint (takes ~10 min to create):") + status = data.get("status") or {} + endpoints = status.get("endpoints") or {} + sagemaker_info = endpoints.get("sagemaker") + + if not sagemaker_info: + click.secho(" ", fg="yellow") + else: + raw_state = sagemaker_info.get("state", "") or "" + if raw_state == "CreationCompleted": + fg = "green" + elif raw_state == "CreationInProgress": + fg = "yellow" + else: + fg = "red" + colored_state = click.style(raw_state, fg=fg, bold=True) + ep_rows = [ + ("Status:", colored_state), + ("Name:", data.get("sageMakerEndpoint", {}).get("name")), + ("ARN:", sagemaker_info.get("endpointArn")), + ] + click.echo(tabulate(ep_rows, tablefmt="plain")) + + click.echo("\nSagemaker Endpoint Status Conditions:") + + status = data.get("status") if isinstance(data, dict) else {} + status = status or {} + conds = status.get("conditions", []) + + if isinstance(conds, list) and conds: headers = ["TYPE", "STATUS", "LAST TRANSITION", "LAST UPDATE", "MESSAGE"] rows = [ [ @@ -470,7 +531,7 @@ def custom_describe( c.get("lastUpdateTime", ""), c.get("message") or "" ] - for c in dep_conds if isinstance(c, dict) + for c in conds if isinstance(c, dict) ] click.echo(tabulate(rows, headers=headers, tablefmt="github")) else: @@ -491,13 +552,17 @@ def custom_describe( default="default", help="Optional. The namespace of the jumpstart model endpoint to delete. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_js_endpoint_cli") +@handle_cli_exceptions() def js_delete( name: str, namespace: Optional[str], ): """ - Delete a jumpstart model endpoint with provided name and namespace. + Delete a Hyperpod Jumpstart model endpoint. """ + # Auto-detects the endpoint type and operation + # 0Provides 404 message: "❓ JumpStart endpoint 'missing-name' not found..." my_endpoint = HPJumpStartEndpoint.model_construct().get(name, namespace) my_endpoint.delete() @@ -516,12 +581,14 @@ def js_delete( default="default", help="Optional. The namespace of the custom model endpoint to delete. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_custom_endpoint_cli") +@handle_cli_exceptions() def custom_delete( name: str, namespace: Optional[str], ): """ - Delete a custom model endpoint with provided name and namespace. + Delete a Hyperpod custom model endpoint. """ my_endpoint = HPEndpoint.model_construct().get(name, namespace) my_endpoint.delete() @@ -535,14 +602,23 @@ def custom_delete( default="default", help="Optional. The namespace of the jumpstart model to list pods for. Default set to 'default'.", ) +@click.option( + "--endpoint-name", + type=click.STRING, + required=False, + help="Optional. The name of the jumpstart endpoint to list pods.", +) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_js_endpoint_cli") +@handle_cli_exceptions() def js_list_pods( namespace: Optional[str], + endpoint_name: Optional[str], ): """ - Get specific pod log for jumpstart model endpoint. + List all pods related to jumpstart model endpoint. 
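Editor's note: both describe commands now color the deployment and SageMaker endpoint states (complete is green, in progress is yellow, anything else red). Consolidating that inline logic into one helper, as below, is this sketch's own choice; the diff applies it inline per command.

```python
# Sketch of the state-to-color mapping used when printing endpoint status.
import click

def colored_state(raw_state: str) -> str:
    """Map an endpoint state string to a bold, colored label."""
    if raw_state in ("DeploymentComplete", "CreationCompleted"):
        fg = "green"
    elif raw_state in ("DeploymentInProgress", "CreationInProgress"):
        fg = "yellow"
    else:
        fg = "red"
    return click.style(raw_state, fg=fg, bold=True)

for state in ("DeploymentComplete", "CreationInProgress", "DeploymentFailed"):
    click.echo(colored_state(state))
```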
""" my_endpoint = HPJumpStartEndpoint.model_construct() - pods = my_endpoint.list_pods(namespace=namespace) + pods = my_endpoint.list_pods(namespace=namespace, endpoint_name=endpoint_name) click.echo(pods) @@ -554,14 +630,23 @@ def js_list_pods( default="default", help="Optional. The namespace of the custom model to list pods for. Default set to 'default'.", ) +@click.option( + "--endpoint-name", + type=click.STRING, + required=False, + help="Optional. The name of the custom model endpoint to list pods.", +) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_custom_endpoint_cli") +@handle_cli_exceptions() def custom_list_pods( namespace: Optional[str], + endpoint_name: Optional[str], ): """ - Get specific pod log for custom model endpoint. + List all pods related to custom model endpoint. """ my_endpoint = HPEndpoint.model_construct() - pods = my_endpoint.list_pods(namespace=namespace) + pods = my_endpoint.list_pods(namespace=namespace, endpoint_name=endpoint_name) click.echo(pods) @@ -585,6 +670,8 @@ def custom_list_pods( default="default", help="Optional. The namespace of the jumpstart model to get logs for. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_logs_js_endpoint") +@handle_cli_exceptions() def js_get_logs( pod_name: str, container: Optional[str], @@ -595,7 +682,10 @@ def js_get_logs( """ my_endpoint = HPJumpStartEndpoint.model_construct() logs = my_endpoint.get_logs(pod=pod_name, container=container, namespace=namespace) - click.echo(logs) + + # Use common log display utility for consistent formatting across all job types + container_info = f" (container: {container})" if container else "" + display_formatted_logs(logs, title=f"JumpStart Endpoint Logs for {pod_name}{container_info}") @click.command("hyp-custom-endpoint") @@ -618,6 +708,8 @@ def js_get_logs( default="default", help="Optional. The namespace of the custom model to get logs for. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_logs_custom_endpoint") +@handle_cli_exceptions() def custom_get_logs( pod_name: str, container: Optional[str], @@ -628,7 +720,10 @@ def custom_get_logs( """ my_endpoint = HPEndpoint.model_construct() logs = my_endpoint.get_logs(pod=pod_name, container=container, namespace=namespace) - click.echo(logs) + + # Use common log display utility for consistent formatting across all job types + container_info = f" (container: {container})" if container else "" + display_formatted_logs(logs, title=f"Custom Endpoint Logs for {pod_name}{container_info}") @click.command("hyp-jumpstart-endpoint") @@ -638,11 +733,13 @@ def custom_get_logs( required=True, help="Required. The time frame to get logs for.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_js_operator_logs") +@handle_cli_exceptions() def js_get_operator_logs( since_hours: float, ): """ - Get operator logs for jumpstart model endpoint in the set time frame. + Get operator logs for jumpstart model endpoint. """ my_endpoint = HPJumpStartEndpoint.model_construct() logs = my_endpoint.get_operator_logs(since_hours=since_hours) @@ -656,11 +753,13 @@ def js_get_operator_logs( required=True, help="Required. The time frame get logs for.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_custom_operator_logs") +@handle_cli_exceptions() def custom_get_operator_logs( since_hours: float, ): """ - Get operator logs for custom model endpoint in the set time frame. + Get operator logs for custom model endpoint. 
""" my_endpoint = HPEndpoint.model_construct() logs = my_endpoint.get_operator_logs(since_hours=since_hours) diff --git a/src/sagemaker/hyperpod/cli/commands/init.py b/src/sagemaker/hyperpod/cli/commands/init.py new file mode 100644 index 00000000..66ce7068 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/commands/init.py @@ -0,0 +1,387 @@ +import click +import yaml +import sys +from pathlib import Path +from datetime import datetime +from jinja2 import Template +import shutil +from sagemaker.hyperpod.cli.constants.init_constants import ( + USAGE_GUIDE_TEXT_CFN, + USAGE_GUIDE_TEXT_CRD, + CFN +) +from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack +from sagemaker.hyperpod.cli.init_utils import ( + generate_click_command, + save_config_yaml, + TEMPLATES, + load_config, + load_config_and_validate, + validate_config_against_model, + filter_validation_errors_for_user_input, + display_validation_results, + build_config_from_schema, + save_template, + get_default_version_for_template, + create_from_k8s_yaml +) +from sagemaker.hyperpod.common.utils import get_aws_default_region +from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( + _hyperpod_telemetry_emitter, +) +from sagemaker.hyperpod.common.telemetry.constants import Feature + +@click.command("init") +@click.argument("template", type=click.Choice(list(TEMPLATES.keys()))) +@click.argument("directory", type=click.Path(file_okay=False), default=".") +@click.option("--version", "-v", default=None, help="Schema version") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_template_cli") +def init( + template: str, + directory: str, + version: str, +): + """ + Initialize a TEMPLATE scaffold in DIRECTORY. + + This command creates a complete project scaffold for the specified template type. + It performs the following steps: + + 1. Checks if the directory already contains a config.yaml and handles existing configurations + 2. Creates the target directory if it doesn't exist + 3. Generates a config.yaml file with schema-based default values + 4. Creates a template file (.jinja) for the specified template type + 5. Adds a README.md with usage instructions + + The generated files provide a starting point for configuring and submitting + jobs to SageMaker HyperPod clusters orchestrated by Amazon EKS. 
+ """ + dir_path = Path(directory).resolve() + config_file = dir_path / "config.yaml" + skip_readme = False + + # 1) Inspect existing config.yaml + try: + if config_file.is_file(): + try: + existing = yaml.safe_load(config_file.read_text()) or {} + existing_template = existing.get("template") + except Exception as e: + click.echo("Could not parse existing config.yaml: %s", e) + existing_template = None + + if existing_template == template: + click.echo(f"⚠️ config.yaml already initialized as '{template}'.") + if not click.confirm("Override?", default=False): + click.echo("Aborting init.") + return + click.echo("Overriding config.yaml...") + skip_readme = True + else: + click.echo(f"⚠️ Directory already initialized as '{existing_template}'.") + click.secho(f"⚠️ It is highly unrecommended to initiate this directory with a different template.", fg="red") + click.echo(f"⚠️ Recommended path is create a new folder and then init with '{template}'.") + if not click.confirm(f"Do you want to re-initialize this directory with {template}?", default=False): + click.echo("Aborting init.") + return + click.echo(f"Re-initializing {existing_template} → {template}…") + + else: + click.echo(f"Initializing new scaffold for '{template}'…") + except Exception as e: + click.secho("💥 Initialization aborted due to error: %s", e, fg="red") + sys.exit(1) + + # 2) Ensure directory exists + try: + dir_path.mkdir(parents=True, exist_ok=True) + except Exception as e: + click.secho(f"❌ Could not create directory {dir_path}: {e}", fg="red") + sys.exit(1) + + # 3) Build config dict + comment map, then write config.yaml + try: + # Determine version: use user-provided version or default to latest + if version is None: + version = get_default_version_for_template(template) + + # Use the common function to build config from schema + full_cfg, comment_map = build_config_from_schema(template, version) + + save_config_yaml( + prefill=full_cfg, + comment_map=comment_map, + directory=str(dir_path), + ) + + # 4) Generate template + save_template(template, dir_path, version) + + except Exception as e: + click.secho(f"💥 Could not write config.yaml or template: {e}", fg="red") + sys.exit(1) + + # 5) Write README.md + if not skip_readme: + try: + readme_path = dir_path / "README.md" + with open(readme_path, "w") as f: + if TEMPLATES[template]["schema_type"] == CFN: + f.write(USAGE_GUIDE_TEXT_CFN) + else: + f.write(USAGE_GUIDE_TEXT_CRD) + except Exception as e: + click.secho("⚠️ README.md generation failed: %s", e, fg="yellow") + + # Convert to relative path for cleaner display + relative_path = Path(directory) if directory != "." else Path("./") + + click.secho( + f"✔️ {template} for schema version={version!r} is initialized in {relative_path}", + fg="green", + ) + click.echo( + click.style( + "🚀 Welcome!\n" + f"📘 See {relative_path}/README.md for usage.\n", + fg="green", + ) + ) + + +@click.command("reset") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_reset_cli") +def reset(): + """ + Reset the current directory's config.yaml to an "empty" scaffold: + all schema keys set to default values (but keeping the template and version). 
+ """ + dir_path = Path(".").resolve() + + # 1) Load and validate config + data, template, version = load_config(dir_path) + + # 2) Build config with default values from schema + full_cfg, comment_map = build_config_from_schema(template, version) + # 3) Overwrite config.yaml + try: + save_config_yaml( + prefill=full_cfg, + comment_map=comment_map, + directory=str(dir_path), + ) + click.secho("✔️ config.yaml reset: all fields set to default values.", fg="green") + except Exception as e: + click.secho(f"💥 Could not reset config.yaml: {e}", fg="red") + sys.exit(1) + + # 4) Regenerate the k8s Jinja template + if save_template(template, dir_path): + click.secho(f"✔️ {template} is regenerated.", fg="green") + + +@click.command("configure") +@generate_click_command() +@click.pass_context +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_configure_cli") +def configure(ctx, model_config): + """ + Update any subset of fields in ./config.yaml by passing -- flags. + + This command allows you to modify specific configuration fields without having + to regenerate the entire config or fix unrelated validation issues. Only the + fields you explicitly provide will be validated, making it easy to update + configurations incrementally. + + Examples: + + # Update a single field + hyp configure --hyperpod-cluster-name my-new-cluster + + # Update multiple fields at once + hyp configure --stack-name my-stack --create-fsx-stack: False + + # Update complex fields with JSON object + hyp configure --availability-zone-ids '["id1", "id2"]' + + """ + # 1) Load existing config without validation + dir_path = Path(".").resolve() + data, template, version = load_config(dir_path) + + # 2) Determine which fields the user actually provided + # Use Click's parameter source tracking to identify command-line provided parameters + user_input_fields = set() + + if ctx and hasattr(ctx, 'params') and model_config: + # Check which parameters were provided via command line (not defaults) + for param_name, param_value in ctx.params.items(): + # Skip if the parameter source indicates it came from default + param_source = ctx.get_parameter_source(param_name) + if param_source and param_source.name == 'COMMANDLINE': + user_input_fields.add(param_name) + + if not user_input_fields: + click.secho("⚠️ No arguments provided to configure.", fg="yellow") + return + + # 3) Build merged config with user input + full_cfg, comment_map = build_config_from_schema( + template=template, + version=version, + model_config=model_config, + existing_config=data, + user_provided_fields=user_input_fields + ) + + # 4) Validate the merged config, but only check user-provided fields + all_validation_errors = validate_config_against_model(full_cfg, template, version) + user_input_errors = filter_validation_errors_for_user_input(all_validation_errors, user_input_fields) + + is_valid = display_validation_results( + user_input_errors, + success_message="User input is valid!" 
if user_input_errors else "config.yaml updated successfully.", + error_prefix="Invalid input arguments:" + ) + + if not is_valid: + click.secho("❌ config.yaml was not updated due to invalid input.", fg="red") + sys.exit(1) + + # 5) Write out the updated config.yaml (only if user input is valid) + try: + save_config_yaml( + prefill=full_cfg, + comment_map=comment_map, + directory=str(dir_path), + ) + except Exception as e: + click.secho(f"💥 Could not update config.yaml: {e}", fg="red") + sys.exit(1) + + +@click.command("validate") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_validate_cli") +def validate(): + """ + Validate this directory's config.yaml against the appropriate schema. + """ + dir_path = Path(".").resolve() + load_config_and_validate(dir_path) + + +@click.command(name="_default_create") +@click.option("--region", "-r", default=None, help="Region to create cluster stack for, default to your region in aws configure. Not available for other templates.") +@click.option("--template-version", type=click.INT, help="Version number of cluster creation template. Not available for other templates.") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_create_cli") +def _default_create(region, template_version): + """ + Validate configuration and render template files for deployment. + + This command performs the following operations: + + 1. Loads and validates the config.yaml file in the current directory + 2. Determines the template type (CFN for CloudFormation or CRD for Kubernetes) + 3. Locates the appropriate Jinja template file: + - cfn_params.jinja for CloudFormation templates + - k8s.jinja for Kubernetes CRD templates + 4. Validates the configuration using the appropriate schema: + - HpClusterStack validation for CFN templates + - Registry-based validation for CRD templates + 5. Renders the Jinja template with configuration values + 6. Creates a timestamped directory under run/ (e.g., run/20240116T143022/) + 7. Copies the validated config.yaml to the run directory + 8. Writes the rendered output: + - cfn_params.yaml for CloudFormation templates + - k8s.yaml for Kubernetes templates + + The generated files in the run directory can be used for actual deployment + to SageMaker HyperPod clusters or CloudFormation stacks. + + Prerequisites: + - Must be run in a directory initialized with 'hyp init' + - config.yaml and the appropriate template file must exist + """ + dir_path = Path('.').resolve() + config_file = dir_path / 'config.yaml' + + # 1) Load config to determine template type + data, template, version = load_config_and_validate(dir_path) + + # Check if region flag is used for non-cluster-stack templates + if region and template != "cluster-stack": + click.secho(f"❌ --region flag is only available for cluster-stack template, not for {template}.", fg="red") + sys.exit(1) + + # 2) Determine correct jinja file based on template type + info = TEMPLATES[template] + schema_type = info["schema_type"] + if schema_type == CFN: + jinja_file = dir_path / 'cfn_params.jinja' + else: + jinja_file = dir_path / 'k8s.jinja' + + # 3) Ensure files exist + if not config_file.is_file() or not jinja_file.is_file(): + click.secho(f"❌ Missing config.yaml or {jinja_file.name}. 
Run `hyp init` first.", fg="red") + sys.exit(1) + + try: + template_source = jinja_file.read_text() + tpl = Template(template_source) + rendered = tpl.render(**data) + except Exception as e: + click.secho(f"❌ Failed to render template: {e}", fg="red") + sys.exit(1) + + # 6) Prepare run/ directory and write files + run_root = dir_path / 'run' + run_root.mkdir(exist_ok=True) + timestamp = datetime.now().strftime('%Y%m%dT%H%M%S') + out_dir = run_root / timestamp + out_dir.mkdir() + + try: + shutil.copy(config_file, out_dir / 'config.yaml') + output_file = 'cfn_params.yaml' if schema_type == CFN else 'k8s.yaml' + with open(out_dir / output_file, 'w', encoding='utf-8') as f: + f.write(rendered) + # Use relative path for cleaner display + relative_out_dir = Path("run") / timestamp + click.secho(f"✔️ Submitted! Files written to {relative_out_dir}", fg="green") + except Exception as e: + click.secho(f"❌ Failed to write run files: {e}", fg="red") + sys.exit(1) + + # 7) Make the downstream call + try : + if region is None: + region = get_aws_default_region() + # Only show region message for cluster-stack template + if template == "cluster-stack": + click.secho(f"Submitting to default region: {region}.", fg="yellow") + + # Unified pattern for all templates + dir_path = Path(".").resolve() + data, template, version = load_config(dir_path) + registry = TEMPLATES[template]["registry"] + model = registry.get(str(version)) + if model: + # Filter out CLI metadata fields before passing to model + from sagemaker.hyperpod.cli.init_utils import _filter_cli_metadata_fields + filtered_config = _filter_cli_metadata_fields(data) + template_model = model(**filtered_config) + + # Pass region to to_domain for cluster stack template + if template == "cluster-stack": + config = template_model.to_config(region=region) + HpClusterStack(**config).create(region, template_version) + else: + # Create from k8s.yaml + k8s_file = out_dir / 'k8s.yaml' + create_from_k8s_yaml(str(k8s_file)) + + + except Exception as e: + click.secho(f"❌ Failed to submit the command: {e}", fg="red") + sys.exit(1) \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 6f285576..9788cf1f 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -1,17 +1,14 @@ import click -import logging -import os -import yaml -import shutil -import subprocess -from pathlib import Path from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob from sagemaker.hyperpod.common.config import Metadata -import tempfile -from typing import List, Dict, Any, Optional, Callable, get_args, get_origin, Literal from sagemaker.hyperpod.cli.training_utils import generate_click_command -from importlib.metadata import entry_points from hyperpod_pytorch_job_template.registry import SCHEMA_REGISTRY +from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( + _hyperpod_telemetry_emitter, +) +from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.common.cli_decorators import handle_cli_exceptions +from sagemaker.hyperpod.common.utils import display_formatted_logs @click.command("hyp-pytorch-job") @@ -21,39 +18,13 @@ schema_pkg="hyperpod_pytorch_job_template", registry=SCHEMA_REGISTRY, ) -def pytorch_create(version, debug, config): - """Create a PyTorch job""" - try: - click.echo(f"Using version: {version}") - job_name = config.get("name") - namespace = 
config.get("namespace") - spec = config.get("spec") - - # Prepare metadata - metadata_kwargs = {"name": job_name} - if namespace: - metadata_kwargs["namespace"] = namespace - - # Prepare job kwargs - job_kwargs = { - "metadata": Metadata(**metadata_kwargs), - "replica_specs": spec.get("replica_specs"), - } - - # Add nproc_per_node if present - if "nproc_per_node" in spec: - job_kwargs["nproc_per_node"] = spec.get("nproc_per_node") - - # Add run_policy if present - if "run_policy" in spec: - job_kwargs["run_policy"] = spec.get("run_policy") - - # Create job - job = HyperPodPytorchJob(**job_kwargs) - job.create(debug=debug) - - except Exception as e: - raise click.UsageError(f"Failed to create job: {str(e)}") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_pytorchjob_cli") +@handle_cli_exceptions() +def pytorch_create(version, debug, job): + """Create a PyTorch job.""" + click.echo(f"Using version: {version}") + # Create job + job.create(debug=debug) @click.command("hyp-pytorch-job") @@ -63,74 +34,72 @@ def pytorch_create(version, debug, config): default="default", help="Optional. The namespace to list jobs from. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pytorchjobs_cli") +@handle_cli_exceptions() def list_jobs(namespace: str): - """List all HyperPod PyTorch jobs""" - try: - jobs = HyperPodPytorchJob.list(namespace=namespace) - - if not jobs: - click.echo("No jobs found.") - return - - # Define headers and widths - headers = ["NAME", "NAMESPACE", "STATUS", "AGE"] - widths = [30, 20, 15, 15] - - # Print header - header = "".join(f"{h:<{w}}" for h, w in zip(headers, widths)) - click.echo("\n" + header) - click.echo("-" * sum(widths)) - - # Print each job - for job in jobs: - # Get status from conditions - status = "Unknown" - age = "N/A" + """List all HyperPod PyTorch jobs.""" + jobs = HyperPodPytorchJob.list(namespace=namespace) + + if not jobs: + click.echo("No jobs found.") + return + + # Define headers and widths + headers = ["NAME", "NAMESPACE", "STATUS", "AGE"] + widths = [30, 20, 15, 15] + + # Print header + header = "".join(f"{h:<{w}}" for h, w in zip(headers, widths)) + click.echo("\n" + header) + click.echo("-" * sum(widths)) + + # Print each job + for job in jobs: + # Get status from conditions + status = "Unknown" + age = "N/A" + if job.status and job.status.conditions: + for condition in reversed(job.status.conditions): + if condition.status == "True": + status = condition.type + break + + # Calculate age if job.status and job.status.conditions: - for condition in reversed(job.status.conditions): - if condition.status == "True": - status = condition.type - break - - # Calculate age - if job.status and job.status.conditions: - # Find the 'Created' condition to get the start time - created_condition = next( - (c for c in job.status.conditions if c.type == "Created"), None + # Find the 'Created' condition to get the start time + created_condition = next( + (c for c in job.status.conditions if c.type == "Created"), None + ) + if created_condition and created_condition.lastTransitionTime: + from datetime import datetime, timezone + + start_time = datetime.fromisoformat( + created_condition.lastTransitionTime.replace("Z", "+00:00") ) - if created_condition and created_condition.lastTransitionTime: - from datetime import datetime, timezone - - start_time = datetime.fromisoformat( - created_condition.lastTransitionTime.replace("Z", "+00:00") - ) - now = datetime.now(timezone.utc) - delta = now - start_time - if delta.days > 
0: - age = f"{delta.days}d" + now = datetime.now(timezone.utc) + delta = now - start_time + if delta.days > 0: + age = f"{delta.days}d" + else: + hours = delta.seconds // 3600 + if hours > 0: + age = f"{hours}h" else: - hours = delta.seconds // 3600 - if hours > 0: - age = f"{hours}h" - else: - minutes = (delta.seconds % 3600) // 60 - age = f"{minutes}m" - - # Format row - row = "".join( - [ - f"{job.metadata.name:<{widths[0]}}", - f"{job.metadata.namespace:<{widths[1]}}", - f"{status:<{widths[2]}}", - f"{age:<{widths[3]}}", - ] - ) - click.echo(row) - - click.echo() # Add empty line at the end + minutes = (delta.seconds % 3600) // 60 + age = f"{minutes}m" - except Exception as e: - raise click.UsageError(f"Failed to list jobs: {str(e)}") + # Format row + row = "".join( + [ + f"{job.metadata.name:<{widths[0]}}", + f"{job.metadata.namespace:<{widths[1]}}", + f"{status:<{widths[2]}}", + f"{age:<{widths[3]}}", + ] + ) + click.echo(row) + + click.echo() # Add empty line at the end @click.command("hyp-pytorch-job") @@ -143,95 +112,95 @@ def list_jobs(namespace: str): default="default", help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorchjob_cli") +@handle_cli_exceptions() def pytorch_describe(job_name: str, namespace: str): - """Describe a HyperPod PyTorch job""" - try: - job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + """Describe a HyperPod PyTorch job.""" + job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + + if job is None: + raise Exception(f"Job {job_name} not found in namespace {namespace}") + + # Print basic info + click.echo("\nJob Details:") + click.echo("=" * 80) + click.echo(f"Name: {job.metadata.name}") + click.echo(f"Namespace: {job.metadata.namespace}") + click.echo(f"Labels: {job.metadata.labels}") + click.echo(f"Annotations: {job.metadata.annotations}") + + # Print Spec details + click.echo("\nSpec:") + click.echo("-" * 80) + click.echo(f"Processes per Node: {getattr(job, 'nprocPerNode', 'N/A')}") + + # Print Replica Specs + for replica in job.replicaSpecs: + click.echo(f"\nReplica Spec:") + click.echo(f" Name: {getattr(replica, 'name', 'N/A')}") + click.echo(f" Replicas: {getattr(replica, 'replicas', 'N/A')}") + click.echo(f" Spares: {getattr(replica, 'spares', 'N/A')}") + + # Container details + if ( + hasattr(replica, "template") + and hasattr(replica.template, "spec") + and hasattr(replica.template.spec, "containers") + ): + for container in replica.template.spec.containers: + click.echo("\n Container:") + click.echo( + f" Name: {getattr(container, 'name', 'N/A')}" + ) + click.echo( + f" Image: {getattr(container, 'image', 'N/A')}" + ) + click.echo( + f" Image Pull Policy: {getattr(container, 'imagePullPolicy', 'N/A')}" + ) + if container.resources: + click.echo(" Resources:") + if container.resources.limits: + click.echo(f" Limits: {container.resources.limits}") + if container.resources.requests: + click.echo( + f" Requests: {container.resources.requests}" + ) - if job is None: - raise click.UsageError(f"Job {job_name} not found in namespace {namespace}") - - # Print basic info - click.echo("\nJob Details:") - click.echo("=" * 80) - click.echo(f"Name: {job.metadata.name}") - click.echo(f"Namespace: {job.metadata.namespace}") - - # Print Spec details - click.echo("\nSpec:") - click.echo("-" * 80) - click.echo(f"Processes per Node: {getattr(job, 'nprocPerNode', 'N/A')}") - - # Print Replica Specs - for replica in job.replicaSpecs: - 
click.echo(f"\nReplica Spec:") - click.echo(f" Name: {getattr(replica, 'name', 'N/A')}") - click.echo(f" Replicas: {getattr(replica, 'replicas', 'N/A')}") - click.echo(f" Spares: {getattr(replica, 'spares', 'N/A')}") - - # Container details - if ( - hasattr(replica, "template") - and hasattr(replica.template, "spec") - and hasattr(replica.template.spec, "containers") - ): - for container in replica.template.spec.containers: - click.echo("\n Container:") - click.echo( - f" Name: {getattr(container, 'name', 'N/A')}" - ) - click.echo( - f" Image: {getattr(container, 'image', 'N/A')}" - ) - click.echo( - f" Image Pull Policy: {getattr(container, 'imagePullPolicy', 'N/A')}" - ) - if container.resources: - click.echo(" Resources:") - if container.resources.limits: - click.echo(f" Limits: {container.resources.limits}") - if container.resources.requests: - click.echo( - f" Requests: {container.resources.requests}" - ) - - # Print Run Policy - click.echo("\nRun Policy:") - click.echo("-" * 80) - if hasattr(job, "runPolicy"): - click.echo( - f"Clean Pod Policy: {getattr(job.runPolicy, 'cleanPodPolicy', 'N/A')}" - ) - click.echo( - f"TTL Seconds After Finished: {getattr(job.runPolicy, 'ttlSecondsAfterFinished', 'N/A')}" - ) - else: - click.echo("Run Policy: N/A") - - # Print Status - click.echo("\nStatus:") - click.echo("-" * 80) - if job.status: - if job.status.conditions: - click.echo("Conditions:") - for condition in job.status.conditions: - click.echo( - f" Type: {getattr(condition, 'type', 'N/A')}" - ) - click.echo( - f" Status: {getattr(condition, 'status', 'N/A')}" - ) - click.echo( - f" Last Transition: {getattr(condition, 'lastTransitionTime', 'N/A')}" - ) - if condition.message: - click.echo(f" Message: {condition.message}") - click.echo() - else: - click.echo("No status information available") - - except Exception as e: - raise click.UsageError(f"Failed to describe job: {str(e)}") + # Print Run Policy + click.echo("\nRun Policy:") + click.echo("-" * 80) + if hasattr(job, "runPolicy"): + click.echo( + f"Clean Pod Policy: {getattr(job.runPolicy, 'cleanPodPolicy', 'N/A')}" + ) + click.echo( + f"TTL Seconds After Finished: {getattr(job.runPolicy, 'ttlSecondsAfterFinished', 'N/A')}" + ) + else: + click.echo("Run Policy: N/A") + + # Print Status + click.echo("\nStatus:") + click.echo("-" * 80) + if job.status: + if job.status.conditions: + click.echo("Conditions:") + for condition in job.status.conditions: + click.echo( + f" Type: {getattr(condition, 'type', 'N/A')}" + ) + click.echo( + f" Status: {getattr(condition, 'status', 'N/A')}" + ) + click.echo( + f" Last Transition: {getattr(condition, 'lastTransitionTime', 'N/A')}" + ) + if condition.message: + click.echo(f" Message: {condition.message}") + click.echo() + else: + click.echo("No status information available") @click.command("hyp-pytorch-job") @@ -244,17 +213,12 @@ def pytorch_describe(job_name: str, namespace: str): default="default", help="Optional. The namespace of the job. 
Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_pytorchjob_cli") +@handle_cli_exceptions() def pytorch_delete(job_name: str, namespace: str): - """Delete a HyperPod PyTorch job""" - try: - job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) - job.delete() - - if job is None: - raise click.UsageError(f"Job {job_name} not found in namespace {namespace}") - - except Exception as e: - raise click.UsageError(f"Failed to describe job: {str(e)}") + """Delete a HyperPod PyTorch job.""" + job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + job.delete() @click.command("hyp-pytorch-job") @@ -269,35 +233,33 @@ def pytorch_delete(job_name: str, namespace: str): default="default", help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_pytorchjob_cli") +@handle_cli_exceptions() def pytorch_list_pods(job_name: str, namespace: str): - """List all HyperPod PyTorch pods corresponding to the job""" - try: - job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) - pods = job.list_pods() + """List all HyperPod PyTorch pods related to the job.""" + job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + pods = job.list_pods() - if not pods: - click.echo(f"\nNo pods found for job: {job_name}") - return + if not pods: + click.echo(f"\nNo pods found for job: {job_name}") + return - # Define headers and widths - headers = ["POD NAME", "NAMESPACE"] - widths = [50, 20] + # Define headers and widths + headers = ["POD NAME", "NAMESPACE"] + widths = [50, 20] - # Print header - click.echo(f"\nPods for job: {job_name}") - header = "".join(f"{h:<{w}}" for h, w in zip(headers, widths)) - click.echo("\n" + header) - click.echo("-" * sum(widths)) + # Print header + click.echo(f"\nPods for job: {job_name}") + header = "".join(f"{h:<{w}}" for h, w in zip(headers, widths)) + click.echo("\n" + header) + click.echo("-" * sum(widths)) - # Print each pod - for pod in pods: - row = "".join([f"{pod:<{widths[0]}}", f"{namespace:<{widths[1]}}"]) - click.echo(row) + # Print each pod + for pod in pods: + row = "".join([f"{pod:<{widths[0]}}", f"{namespace:<{widths[1]}}"]) + click.echo(row) - click.echo() - - except Exception as e: - raise click.UsageError(f"Failed to list jobs: {str(e)}") + click.echo() @click.command("hyp-pytorch-job") @@ -315,33 +277,62 @@ def pytorch_list_pods(job_name: str, namespace: str): default="default", help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorchjob_logs_from_pod_cli") +@handle_cli_exceptions() def pytorch_get_logs(job_name: str, pod_name: str, namespace: str): - """Get specific logs from pod corresponding to the job""" + """Get specific pod log for Hyperpod Pytorch job.""" + click.echo("Listing logs for pod: " + pod_name) + job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + logs = job.get_logs_from_pod(pod_name=pod_name) + + # Use common log display utility for consistent formatting across all job types + display_formatted_logs(logs, title=f"Pod Logs for {pod_name}") + + +@click.command("hyp-pytorch-job") +@click.option( + "--since-hours", + type=click.FLOAT, + required=True, + help="Required. 
The time frame to get logs for.", +) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorch_operator_logs") +@handle_cli_exceptions() +def pytorch_get_operator_logs(since_hours: float): + """Get operator logs for pytorch training jobs.""" + logs = HyperPodPytorchJob.get_operator_logs(since_hours=since_hours) + + # Use common log display utility for consistent formatting across all job types + display_formatted_logs(logs, title="PyTorch Operator Logs") + + +@click.command("hyp-pytorch-job", + help="""Execute commands in pods associated with a HyperPod PyTorch job. + +Usage Format: + hyp exec --job-name [-p ] [--all-pods] -- """) +@click.option("--job-name", required=True, help="Required. The name of the job to execute the command within.") +@click.option("--pod", "-p", help="The name of the pod to execute the command in. (Required: specify either --pod or --all-pods)") +@click.option("--all-pods", is_flag=True, help="Execute command in all pods associated with the job. (Required: specify either --pod or --all-pods)") +@click.option("--namespace", "-n", default="default", help="Optional. The namespace of the job.") +@click.option("--container", help="Optional. The container name to execute the command in.") +@click.argument("command", nargs=-1, required=True) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "exec_pytorchjob_cli") +def pytorch_exec(job_name: str, pod: str, all_pods: bool, namespace: str, container: str, command: tuple): + """Execute commands in pods associated with a HyperPod PyTorch job.""" + if (all_pods and pod) or not (all_pods or pod): + raise click.UsageError("Must specify exactly one of the following: --all-pods, --pod") + try: - click.echo("Listing logs for pod: " + pod_name) job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) - logs = job.get_logs_from_pod(pod_name=pod_name) - - if not logs: - click.echo("No logs available.") - return - - # Split logs into lines and display them - log_lines = logs.split("\n") - for line in log_lines: - if line.strip(): # Skip empty lines - # Color coding based on log level - if "ERROR" in line.upper(): - click.secho(line, fg="red") - elif "WARNING" in line.upper(): - click.secho(line, fg="yellow") - elif "INFO" in line.upper(): - click.secho(line, fg="green") - else: - click.echo(line) - - click.echo("\nEnd of logs") - click.echo("=" * 80) - + output = job.exec_command(list(command), pod, all_pods, container) + if output: + click.echo(output) + else: + click.echo("Command executed successfully (no output)") + except ValueError as e: + # User input validation errors + raise click.UsageError(str(e)) except Exception as e: - raise click.UsageError(f"Failed to list jobs: {str(e)}") + # Other errors (API, network, etc.) 
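+        # are surfaced as a Click UsageError so the user sees a clean, actionable message instead of a raw traceback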
+ raise click.UsageError(f"Failed to execute command: {str(e)}") diff --git a/src/sagemaker/hyperpod/cli/common_utils.py b/src/sagemaker/hyperpod/cli/common_utils.py new file mode 100644 index 00000000..e706eb13 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/common_utils.py @@ -0,0 +1,121 @@ +import sys +from typing import Mapping, Type, List, Dict, Any +import click +import pkgutil +import json + +JUMPSTART_SCHEMA = "hyperpod_jumpstart_inference_template" +CUSTOM_SCHEMA = "hyperpod_custom_inference_template" +JUMPSTART_COMMAND = "hyp-jumpstart-endpoint" +CUSTOM_COMMAND = "hyp-custom-endpoint" +PYTORCH_SCHEMA="hyperpod_pytorch_job_template" +PYTORCH_COMMAND="hyp-pytorch-job" + + +def extract_version_from_args(registry: Mapping[str, Type], schema_pkg: str, default: str) -> str: + if "--version" not in sys.argv: + return default + + idx = sys.argv.index("--version") + if idx + 1 >= len(sys.argv): + return default + + requested_version = sys.argv[idx + 1] + invoked_command = next( + (arg for arg in sys.argv if arg.startswith('hyp-')), + None + ) + + # Check if schema validation is needed + needs_validation = ( + (schema_pkg == JUMPSTART_SCHEMA and invoked_command == JUMPSTART_COMMAND) or + (schema_pkg == CUSTOM_SCHEMA and invoked_command == CUSTOM_COMMAND) or + (schema_pkg == PYTORCH_SCHEMA and invoked_command == PYTORCH_COMMAND) + ) + + if registry is not None and requested_version not in registry: + if needs_validation: + raise click.ClickException(f"Unsupported schema version: {requested_version}") + else: + return default + + return requested_version + + +def get_latest_version(registry: Mapping[str, Type]) -> str: + """ + Get the latest version from the schema registry. + """ + if not registry: + raise ValueError("Schema registry is empty") + + # Sort versions and return the last (highest) one + sorted_versions = sorted(registry.keys(), key=lambda v: [int(x) for x in v.split('.')]) + return sorted_versions[-1] + + +def load_schema_for_version( + version: str, + base_package: str, +) -> dict: + """ + Load schema.json from the top-level .vX_Y_Z package. + """ + ver_pkg = f"{base_package}.v{version.replace('.', '_')}" + raw = pkgutil.get_data(ver_pkg, "schema.json") + if raw is None: + raise click.ClickException( + f"Could not load schema.json for version {version} " + f"(looked in package {ver_pkg})" + ) + return json.loads(raw) + + +def parse_comma_separated_list(value: str) -> List[str]: + """ + Parse a comma-separated string into a list of strings. + Generic utility that can be reused across commands. + + Args: + value: Comma-separated string like "item1,item2,item3" + + Returns: + List of trimmed strings + """ + if not value: + return [] + return [item.strip() for item in value.split(",") if item.strip()] + + +def categorize_resources_by_type(resources: List[Dict[str, Any]], + type_mappings: Dict[str, List[str]]) -> Dict[str, List[str]]: + """ + Generic function to categorize resources by type. 
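+
+    Example (illustrative only; hypothetical resource IDs and category names)::
+
+        >>> categorize_resources_by_type(
+        ...     [{"ResourceType": "AWS::EC2::VPC", "LogicalResourceId": "MyVpc"},
+        ...      {"ResourceType": "AWS::S3::Bucket", "LogicalResourceId": "MyBucket"}],
+        ...     {"Networking": ["AWS::EC2::"], "Storage": ["AWS::S3::"]},
+        ... )
+        {'Networking': ['MyVpc'], 'Storage': ['MyBucket']}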
+ + Args: + resources: List of resource dictionaries with 'ResourceType' and 'LogicalResourceId' + type_mappings: Dictionary mapping category names to lists of resource types + + Returns: + Dictionary of category -> list of resource names + """ + categorized = {category: [] for category in type_mappings.keys()} + categorized["Other"] = [] + + for resource in resources: + resource_type = resource.get("ResourceType", "") + logical_id = resource.get("LogicalResourceId", "") + + # Find which category this resource type belongs to + category_found = False + for category, types in type_mappings.items(): + if any(resource_type.startswith(rt) for rt in types): + categorized[category].append(logical_id) + category_found = True + break + + if not category_found: + categorized["Other"].append(logical_id) + + # Remove empty categories + return {k: v for k, v in categorized.items() if v} diff --git a/src/sagemaker/hyperpod/cli/constants/command_constants.py b/src/sagemaker/hyperpod/cli/constants/command_constants.py index c086179c..3fc96606 100644 --- a/src/sagemaker/hyperpod/cli/constants/command_constants.py +++ b/src/sagemaker/hyperpod/cli/constants/command_constants.py @@ -44,6 +44,7 @@ SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX = "-clusterqueue" SAGEMAKER_TRAINING_LAUNCHER_DIR = str(Path(__file__).parent.parent / "sagemaker_hyperpod_recipes") NVIDIA_GPU_RESOURCE_LIMIT_KEY = "nvidia.com/gpu" +NEURON_RESOURCE_LIMIT_KEY = "aws.amazon.com/neurondevice" AVAILABLE_ACCELERATOR_DEVICES_KEY = "AvailableAcceleratorDevices" TOTAL_ACCELERATOR_DEVICES_KEY = "TotalAcceleratorDevices" USER_NAME_LABEL_KEY = "sagemaker.user/created-by" diff --git a/src/sagemaker/hyperpod/cli/constants/init_constants.py b/src/sagemaker/hyperpod/cli/constants/init_constants.py new file mode 100644 index 00000000..3168484d --- /dev/null +++ b/src/sagemaker/hyperpod/cli/constants/init_constants.py @@ -0,0 +1,356 @@ +from hyperpod_jumpstart_inference_template.registry import SCHEMA_REGISTRY as JS_EP_REG, TEMPLATE_REGISTRY as JS_EP_TEMPLATE_REG +from hyperpod_custom_inference_template.registry import SCHEMA_REGISTRY as CUSTOM_EP_REG, TEMPLATE_REGISTRY as CUSTOM_EP_TEMPLATE_REG +from hyperpod_pytorch_job_template.registry import SCHEMA_REGISTRY as PYTORCH_JOB_REG, TEMPLATE_REGISTRY as PYTORCH_JOB_TEMPLATE_REG +from hyperpod_cluster_stack_template.registry import SCHEMA_REGISTRY as CLUSTER_REG, TEMPLATE_REGISTRY as CLUSTER_TEMPLATE_REG + +import sys + +# Here is the list of existing templates supported +# You can onboard new template by adding the mapping here + +CRD = "crd" +CFN = "cfn" +TEMPLATES = { + "hyp-jumpstart-endpoint": { + "registry": JS_EP_REG, + "template_registry": JS_EP_TEMPLATE_REG, + "schema_pkg": "hyperpod_jumpstart_inference_template", + "schema_type": CRD, + 'type': "jinja" + }, + "hyp-custom-endpoint": { + "registry": CUSTOM_EP_REG, + "template_registry": CUSTOM_EP_TEMPLATE_REG, + "schema_pkg": "hyperpod_custom_inference_template", + "schema_type": CRD, + 'type': "jinja" + }, + "hyp-pytorch-job": { + "registry": PYTORCH_JOB_REG, + "template_registry": PYTORCH_JOB_TEMPLATE_REG, + "schema_pkg": "hyperpod_pytorch_job_template", + "schema_type": CRD, + 'type': "jinja" + }, + "cluster-stack": { + "registry": CLUSTER_REG, + "template_registry": CLUSTER_TEMPLATE_REG, + "schema_pkg": "hyperpod_cluster_stack_template", + "schema_type": CFN, + 'type': "jinja" + } +} + +# K8s Kind to class mapping for create_from_k8s_yaml +K8S_KIND_MAPPING = { + "InferenceEndpointConfig": { + "class_path": 
"sagemaker.hyperpod.inference.hp_endpoint.HPEndpoint", + "metadata_handling": "separate" # metadata handled separately + }, + "JumpStartModel": { + "class_path": "sagemaker.hyperpod.inference.hp_jumpstart_endpoint.HPJumpStartEndpoint", + "metadata_handling": "separate" + }, + "HyperPodPyTorchJob": { + "class_path": "sagemaker.hyperpod.training.hyperpod_pytorch_job.HyperPodPytorchJob", + "metadata_handling": "combined" # metadata combined with spec + } +} + + +def _get_handler_from_template_version(template_name, version, handler_name): + """Dynamically import handler from a specific version of a template""" + try: + template_info = TEMPLATES[template_name] + registry = template_info["registry"] + + if version not in registry: + return None + + model_class = registry[version] + module = sys.modules[model_class.__module__] + return getattr(module, handler_name) + except (ImportError, AttributeError): + return None + + +# Template.field to handler mapping - avoids conflicts and works reliably +SPECIAL_FIELD_HANDLERS = { + 'hyp-pytorch-job.1.0.volume': _get_handler_from_template_version("hyp-pytorch-job", "1.0", "VOLUME_TYPE_HANDLER"), + 'hyp-pytorch-job.1.1.volume': _get_handler_from_template_version("hyp-pytorch-job", "1.1", "VOLUME_TYPE_HANDLER"), +} + +USAGE_GUIDE_TEXT_CFN = """# SageMaker HyperPod CLI - Initialization Workflow + +This document explains the initialization workflow and related commands for the SageMaker HyperPod CLI. + +## Table of Contents +- [Init Command](#init-command) +- [Configure Command](#configure-command) +- [Reset Command](#reset-command) +- [Validate Command](#validate-command) +- [Create Command](#create-command) + +## Init Command + +The `init` command creates a scaffold for your HyperPod cluster stack configuration. It generates a `config.yaml` file, a CFN template (`cfn_params.jinja`), and a README with usage instructions. + +### Basic Usage + +```bash +hyp init +``` + +Example: +```bash +hyp init cluster-stack +``` + +This creates the following files in your current directory: +``` +├── config.yaml # Configuration file with default values +├── cfn_params.jinja # Cloudformation template with placeholders +└── README.md # Usage instructions +``` + +### Specifying a Directory + +You can specify a target directory for initialization: + +```bash +hyp init cluster-stack +cd +``` + +### Edge Cases + +**Re-initializing the same template:** +``` +hyp init cluster-stack +⚠️ config.yaml already initialized as 'cluster-stack'. +Overwrite? [y/N]: +``` + +**Initializing with a different template:** +``` +hyp init hyp-custom-endpoint +⚠️ Directory already initialized as 'cluster-stack'. +⚠️ It is highly unrecommended to initiate this directory with a different template. +⚠️ Recommended path is create a new folder and then init with 'hyp-custom-endpoint'. +If you insist, re-init as 'hyp-custom-endpoint' instead? [y/N]: +``` + +## Configure Command + +The `configure` command updates specific fields in your `config.yaml` file without modifying other values. + +```bash +hyp configure \ + --stack-name my-stack \ + --create-fsx-stack: False +``` + +## Reset Command + +The `reset` command resets your `config.yaml` to default values while preserving the template type and namespace. + +```bash +hyp reset +``` + +## Validate Command + +The `validate` command checks your `config.yaml` against the JSON schema to ensure all required fields are present and valid. 
+ +```bash +hyp validate +``` + +## Create Command + +The `create` command processes your configuration and creates the cluster stack. It injects values from `config.yaml` into the `cfn_params.jinja` template and creates a timestamped record in the `runs` directory. + +```bash +hyp create +``` + +After submission, your directory structure will look like: +``` +├── config.yaml +├── cfn_params.jinja +├── README.md +└── runs/ + └── 2025-07-16T15-22-03Z/ + ├── config.yaml # Copy of the config used for this run + └── cfn_params.yaml # Generated Cloudformation template +``` + +## Workflow Example + +A typical workflow might look like: + +1. Initialize a new endpoint configuration: + ```bash + hyp init cluster-stack + ``` + +2. Configure required parameters: + ```bash + hyp configure \ + --stack-name my-stack \ + --create-fsx-stack: False + ``` + +3. Validate the configuration: + ```bash + hyp validate + ``` + +4. Create the cluster stack request: + ```bash + hyp create + ``` + +5. Check the status of your cluster stack: + ```bash + hyp list cluster-stack + ``` +""" + +USAGE_GUIDE_TEXT_CRD = """# SageMaker HyperPod CLI - Initialization Workflow + +This document explains the initialization workflow and related commands for the SageMaker HyperPod CLI. + +## Table of Contents +- [Init Command](#init-command) +- [Configure Command](#configure-command) +- [Reset Command](#reset-command) +- [Validate Command](#validate-command) +- [Create Command](#create-command) + +## Init Command + +The `init` command creates a scaffold for your HyperPod endpoint configuration. It generates a `config.yaml` file, a Kubernetes template (`k8s.jinja`), and a README with usage instructions. + +### Basic Usage + +```bash +hyp init +``` + +Example: +```bash +hyp init hyp-jumpstart-endpoint +``` + +This creates the following files in your current directory: +``` +├── config.yaml # Configuration file with default values +├── k8s.jinja # Kubernetes template with placeholders +└── README.md # Usage instructions +``` + +### Specifying a Directory + +You can specify a target directory for initialization: + +```bash +hyp init hyp-jumpstart-endpoint +cd +``` + +### Edge Cases + +**Re-initializing the same template:** +``` +hyp init hyp-jumpstart-endpoint +⚠️ config.yaml already initialized as 'hyp-jumpstart-endpoint'. +Overwrite? [y/N]: +``` + +**Initializing with a different template:** +``` +hyp init hyp-custom-endpoint +⚠️ Directory already initialized as 'hyp-jumpstart-endpoint'. +⚠️ It is highly unrecommended to initiate this directory with a different template. +⚠️ Recommended path is create a new folder and then init with 'hyp-custom-endpoint'. +If you insist, re-init as 'hyp-custom-endpoint' instead? [y/N]: +``` + +## Configure Command + +The `configure` command updates specific fields in your `config.yaml` file without modifying other values. + +```bash +hyp configure \ + --instance-type ml.g5.12xlarge \ + --model-version 2.0.4 +``` + +## Reset Command + +The `reset` command resets your `config.yaml` to default values while preserving the template type and namespace. + +```bash +hyp reset +``` + +## Validate Command + +The `validate` command checks your `config.yaml` against the JSON schema to ensure all required fields are present and valid. + +```bash +hyp validate +``` + +## Create Command + +The `create` command processes your configuration and creates the endpoint. It injects values from `config.yaml` into the `k8s.jinja` template and creates a timestamped record in the `runs` directory. 
+ +```bash +hyp create +``` + +After submission, your directory structure will look like: +``` +├── config.yaml +├── k8s.jinja +├── README.md +└── runs/ + └── 2025-07-16T15-22-03Z/ + ├── config.yaml # Copy of the config used for this run + └── k8s.yaml # Generated Kubernetes manifest +``` + +## Workflow Example + +A typical workflow might look like: + +1. Initialize a new endpoint configuration: + ```bash + hyp init hyp-jumpstart-endpoint + ``` + +2. Configure required parameters: + ```bash + hyp configure \ + --model-id meta-textgeneration-llama-3-70b \ + --instance-type ml.g5.8xlarge \ + --endpoint-name my-llama-endpoint + ``` + +3. Validate the configuration: + ```bash + hyp validate + ``` + +4. Create the endpoint creation request: + ```bash + hyp create + ``` + +5. Check the status of your endpoint: + ```bash + hyp list hyp-jumpstart-endpoint + ``` +""" diff --git a/src/sagemaker/hyperpod/cli/constants/pytorch_constants.py b/src/sagemaker/hyperpod/cli/constants/pytorch_constants.py index 0d76d1d7..be24743b 100644 --- a/src/sagemaker/hyperpod/cli/constants/pytorch_constants.py +++ b/src/sagemaker/hyperpod/cli/constants/pytorch_constants.py @@ -13,3 +13,4 @@ PYTORCH_CUSTOM_OBJECT_GROUP = "kubeflow.org" PYTORCH_CUSTOM_OBJECT_PLURAL = "pytorchjobs" PYTORCH_CUSTOM_OBJECT_VERSION = "v1" +HYPERPOD_PYTORCH_CRD_NAME = "hyperpodpytorchjobs.sagemaker.amazonaws.com" diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py index 24b05a83..872c21ee 100644 --- a/src/sagemaker/hyperpod/cli/hyp_cli.py +++ b/src/sagemaker/hyperpod/cli/hyp_cli.py @@ -4,10 +4,13 @@ import os import subprocess from pydantic import BaseModel, ValidationError, Field -from typing import Optional +from typing import Optional, Union +from importlib.metadata import version, PackageNotFoundError from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \ - get_monitoring + get_monitoring, describe_cluster +from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \ + list_cluster_stacks, update_cluster, delete_cluster_stack from sagemaker.hyperpod.cli.commands.training import ( pytorch_create, list_jobs, @@ -15,6 +18,8 @@ pytorch_delete, pytorch_list_pods, pytorch_get_logs, + pytorch_get_operator_logs, + pytorch_exec, ) from sagemaker.hyperpod.cli.commands.inference import ( js_create, @@ -34,79 +39,158 @@ custom_get_operator_logs, ) +from sagemaker.hyperpod.cli.commands.init import ( + init, + reset, + configure, + validate, + _default_create +) -@click.group() -def cli(): - pass +def get_package_version(package_name): + try: + return version(package_name) + except PackageNotFoundError: + return "Not installed" -class CLICommand(click.Group): +def print_version(ctx, param, value): + if not value or ctx.resilient_parsing: + return + + hyp_version = get_package_version("sagemaker-hyperpod") + pytorch_template_version = get_package_version("hyperpod-pytorch-job-template") + custom_inference_version = get_package_version("hyperpod-custom-inference-template") + jumpstart_inference_version = get_package_version("hyperpod-jumpstart-inference-template") + + click.echo(f"hyp version: {hyp_version}") + click.echo(f"hyperpod-pytorch-job-template version: {pytorch_template_version}") + click.echo(f"hyperpod-custom-inference-template version: {custom_inference_version}") + click.echo(f"hyperpod-jumpstart-inference-template version: {jumpstart_inference_version}") + ctx.exit() + + 
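The `print_version` helper above is wired in just below as an eager option on the root `cli` group: because the option sets `is_eager=True` and `expose_value=False`, the callback runs while arguments are still being parsed, prints the version report, and exits before any subcommand is resolved. A minimal, self-contained sketch of that Click pattern (hypothetical package and command names, not part of this change set) could look like:

```python
import click
from importlib.metadata import PackageNotFoundError, version


def _print_version(ctx, param, value):
    # Eager callbacks fire during argument parsing; bail out when the flag is
    # absent or Click is only doing resilient parsing (e.g. shell completion).
    if not value or ctx.resilient_parsing:
        return
    try:
        pkg_version = version("example-package")  # hypothetical distribution name
    except PackageNotFoundError:
        pkg_version = "Not installed"
    click.echo(f"example-cli version: {pkg_version}")
    ctx.exit()  # stop here; no subcommand is invoked


@click.group()
@click.option("--version", is_flag=True, callback=_print_version,
              expose_value=False, is_eager=True,
              help="Show version information and exit.")
def example_cli():
    """Toy CLI group demonstrating an eager --version flag."""


if __name__ == "__main__":
    example_cli()
```

Invoking `example-cli --version` would then short-circuit normal dispatch, which is the same behavior the `hyp` root group relies on here.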
+@click.group(context_settings={'max_content_width': 200}) +@click.option('--version', is_flag=True, callback=print_version, expose_value=False, is_eager=True, help='Show version information') +def cli(): pass -@cli.group(cls=CLICommand) +class CLICommand(click.Group): + def __init__(self, *args, default_cmd: Union[str, None] = None, **kwargs): + super().__init__(*args, **kwargs) + self.default_cmd = default_cmd + + def parse_args(self, ctx, args): + # Only inject default subcommand when: + # - user didn't name a subcommand, and + # - user didn't ask for help + if self.default_cmd: + # any non-flag token that is a known subcommand? + has_subcmd = any((not a.startswith("-")) and (a in self.commands) for a in args) + asked_for_help = any(a in ("-h", "--help") for a in args) + if (not has_subcmd) and (not asked_for_help): + args = [self.default_cmd] + args + return super().parse_args(ctx, args) + + +@cli.group(cls=CLICommand, default_cmd='_default_create') def create(): - """Create a jumpstart model endpoint, a custom model endpoint, or a pytorch job.""" + """ + Create endpoints, pytorch jobs or cluster stacks. + + If only used as 'hyp create' without [OPTIONS] COMMAND [ARGS] during init experience, + then it will validate configuration and render template files for deployment. + The generated files in the run directory can be used for actual deployment + to SageMaker HyperPod clusters or CloudFormation stacks. + + Prerequisites for directly calling 'hyp create': + - Must be run in a directory initialized with 'hyp init' + - config.yaml and the appropriate template file must exist + """ pass @cli.group(cls=CLICommand) def list(): - """List all jumpstart model endpoints, custom model endpoints, or pytorch jobs.""" + """List endpoints, pytorch jobs or cluster stacks.""" pass @cli.group(cls=CLICommand) def describe(): - """Describe a jumpstart model endpoint, a custom model endpoint, or a pytorch job.""" + """Describe endpoints, pytorch jobs or cluster stacks.""" pass +@cli.group(cls=CLICommand) +def update(): + """Update an existing HyperPod cluster configuration.""" + pass @cli.group(cls=CLICommand) def delete(): - """Delete a jumpstart model endpoint, a custom model endpoint, or a pytorch job.""" + """Delete endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def list_pods(): - """List all pods for jumpstart model endpoint, custom model endpoint or pytorch jobs.""" + """List pods for endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def get_logs(): - """Get specific pod logs for a jumpstart model endpoint, custom model endpoint or pytorch job.""" + """Get pod logs for endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def invoke(): - """Invoke a jumpstart model endpoint or a custom model endpoint.""" + """Invoke model endpoints.""" pass @cli.group(cls=CLICommand) def get_operator_logs(): - """Get operator logs for jumpstart model endpoint, or custom model endpoint.""" + """Get operator logs for endpoints.""" pass +@cli.group(cls=CLICommand) +def exec(): + """Execute commands in pods for endpoints or pytorch jobs.""" + pass + + +cli.add_command(init) +cli.add_command(reset) +cli.add_command(configure) +cli.add_command(validate) + create.add_command(pytorch_create) create.add_command(js_create) create.add_command(custom_create) +_default_create.hidden = True +create.add_command(_default_create) list.add_command(list_jobs) list.add_command(js_list) list.add_command(custom_list) +list.add_command(list_cluster_stacks) describe.add_command(pytorch_describe) 
describe.add_command(js_describe) describe.add_command(custom_describe) +describe.add_command(describe_cluster_stack) +describe.add_command(describe_cluster) + +update.add_command(update_cluster) delete.add_command(pytorch_delete) delete.add_command(js_delete) delete.add_command(custom_delete) +delete.add_command(delete_cluster_stack) list_pods.add_command(pytorch_list_pods) list_pods.add_command(js_list_pods) @@ -116,6 +200,7 @@ def get_operator_logs(): get_logs.add_command(js_get_logs) get_logs.add_command(custom_get_logs) +get_operator_logs.add_command(pytorch_get_operator_logs) get_operator_logs.add_command(js_get_operator_logs) get_operator_logs.add_command(custom_get_operator_logs) @@ -126,7 +211,9 @@ def get_operator_logs(): cli.add_command(set_cluster_context) cli.add_command(get_cluster_context) cli.add_command(get_monitoring) +# cli.add_command(create_cluster_stack) # Not supported yet +exec.add_command(pytorch_exec) if __name__ == "__main__": cli() diff --git a/src/sagemaker/hyperpod/cli/inference_utils.py b/src/sagemaker/hyperpod/cli/inference_utils.py index 4fd76193..eb38da16 100644 --- a/src/sagemaker/hyperpod/cli/inference_utils.py +++ b/src/sagemaker/hyperpod/cli/inference_utils.py @@ -2,25 +2,21 @@ import pkgutil import click from typing import Callable, Optional, Mapping, Type - - -def load_schema_for_version(version: str, schema_pkg: str) -> dict: - ver_pkg = f"{schema_pkg}.v{version.replace('.', '_')}" - raw = pkgutil.get_data(ver_pkg, "schema.json") - if raw is None: - raise click.ClickException(f"Could not load schema.json for version {version}") - return json.loads(raw) +import sys +from sagemaker.hyperpod.cli.common_utils import extract_version_from_args, get_latest_version, load_schema_for_version def generate_click_command( *, - version_key: Optional[str] = None, schema_pkg: str = "hyperpod_jumpstart_inference_template", registry: Mapping[str, Type] = None, ) -> Callable: if registry is None: raise ValueError("You must pass a registry mapping version→Model") + default_version = get_latest_version(registry) + version = extract_version_from_args(registry, schema_pkg, default_version) + def decorator(func: Callable) -> Callable: # Parser for the single JSON‐dict env var flag def _parse_json_flag(ctx, param, value): @@ -33,8 +29,8 @@ def _parse_json_flag(ctx, param, value): # 1) the wrapper click actually invokes def wrapped_func(*args, **kwargs): - namespace = kwargs.pop("namespace", None) - version = version_key or kwargs.pop("version", "1.0") + pop_version = kwargs.pop("version", default_version) + debug = kwargs.pop("debug", False) Model = registry.get(version) if Model is None: @@ -42,47 +38,31 @@ def wrapped_func(*args, **kwargs): flat = Model(**kwargs) domain = flat.to_domain() - return func(namespace, version, domain) + return func(version, debug, domain) # 2) inject the special JSON‐env flag before everything else - wrapped_func = click.option( - "--env", - callback=_parse_json_flag, - type=str, - default=None, - help=( - "JSON object of environment variables, e.g. " - '\'{"VAR1":"foo","VAR2":"bar"}\'' - ), - metavar="JSON", - )(wrapped_func) - - wrapped_func = click.option( - "--dimensions", - callback=_parse_json_flag, - type=str, - default=None, - help=("JSON object of dimensions, e.g. " '\'{"VAR1":"foo","VAR2":"bar"}\''), - metavar="JSON", - )(wrapped_func) - - wrapped_func = click.option( - "--resources-limits", - callback=_parse_json_flag, - help='JSON object of resource limits, e.g. 
\'{"cpu":"2","memory":"4Gi"}\'', - metavar="JSON", - )(wrapped_func) - - wrapped_func = click.option( - "--resources-requests", - callback=_parse_json_flag, - help='JSON object of resource requests, e.g. \'{"cpu":"1","memory":"2Gi"}\'', - metavar="JSON", - )(wrapped_func) + schema = load_schema_for_version(version, schema_pkg) + props = schema.get("properties", {}) + + json_flags = { + "env": ("JSON object of environment variables, e.g. " '\'{"VAR1":"foo","VAR2":"bar"}\''), + "dimensions": ("JSON object of dimensions, e.g. " '\'{"VAR1":"foo","VAR2":"bar"}\''), + "resources_limits": ('JSON object of resource limits, e.g. \'{"cpu":"2","memory":"4Gi"}\''), + "resources_requests": ('JSON object of resource requests, e.g. \'{"cpu":"1","memory":"2Gi"}\''), + } + + for flag_name, help_text in json_flags.items(): + if flag_name in props: + wrapped_func = click.option( + f"--{flag_name.replace('_', '-')}", + callback=_parse_json_flag, + type=str, + default=None, + help=help_text, + metavar="JSON", + )(wrapped_func) # 3) auto-inject all schema.json fields - schema = load_schema_for_version(version_key or "1.0", schema_pkg) - props = schema.get("properties", {}) reqs = set(schema.get("required", [])) for name, spec in reversed(list(props.items())): @@ -118,4 +98,4 @@ def wrapped_func(*args, **kwargs): return wrapped_func - return decorator + return decorator \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/init_utils.py b/src/sagemaker/hyperpod/cli/init_utils.py new file mode 100644 index 00000000..1fb43d09 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/init_utils.py @@ -0,0 +1,566 @@ +import importlib +import json +import logging +import pkgutil +import click +from typing import Callable, Tuple, get_origin, get_args +import os +import yaml +import sys +from pathlib import Path +from sagemaker.hyperpod.cli.type_handler_utils import convert_cli_value, to_click_type, is_complex_type, DEFAULT_TYPE_HANDLER +from pydantic import ValidationError +from typing import List, Any +from sagemaker.hyperpod.cli.constants.init_constants import ( + TEMPLATES, + CRD, + CFN, + SPECIAL_FIELD_HANDLERS +) + +log = logging.getLogger() + +def save_template(template: str, directory_path: Path, version: str = None) -> bool: + """ + Save the appropriate template based on the template type and version. + Template content is loaded directly from the template registry. 
+ """ + try: + template_info = TEMPLATES[template] + + # Use provided version or get latest + if version is None: + version = _get_latest_version_from_registry(template) + + # Get template content from registry + template_registry = template_info["template_registry"] + template_content = template_registry.get(str(version)) + + if not template_content: + raise Exception(f"No template found for version {version}") + + if template_info["schema_type"] == CRD: + _save_k8s_jinja(directory=str(directory_path), content=template_content) + elif template_info["schema_type"] == CFN: + _save_cfn_jinja(directory=str(directory_path), content=template_content) + return True + except Exception as e: + click.secho(f"⚠️ Template generation failed: {e}", fg="yellow") + return False + +def _save_cfn_jinja(directory: str, content: str): + Path(directory).mkdir(parents=True, exist_ok=True) + path = os.path.join(directory, "cfn_params.jinja") + + with open(path, "w", encoding="utf-8") as f: + f.write(content) + return path + +def _save_k8s_jinja(directory: str, content: str): + Path(directory).mkdir(parents=True, exist_ok=True) + path = os.path.join(directory, "k8s.jinja") + with open(path, "w", encoding="utf-8") as f: + f.write(content) + return path + + +def _filter_cli_metadata_fields(config_data: dict) -> dict: + """ + Filter out CLI metadata fields that should not be passed to Pydantic models. + + Args: + config_data: Configuration data dictionary + + Returns: + Filtered dictionary without CLI metadata fields + """ + return { + k: v for k, v in config_data.items() + if k not in ('template', 'version') and v is not None + } + + +def _get_latest_version_from_registry(template: str) -> str: + """ + Get the latest version available in the registry for a given template. + + Args: + template: Template name + + Returns: + Latest version string (e.g., "1.0", "2.0") + """ + template_info = TEMPLATES.get(template) + if not template_info: + raise click.ClickException(f"Unknown template: {template}") + + registry = template_info.get("registry") + if not registry: + raise click.ClickException(f"No registry found for template: {template}") + + # Get all available versions and return the latest + available_versions = list(registry.keys()) + if not available_versions: + raise click.ClickException(f"No versions available in registry for template: {template}") + + # Sort versions to get the latest (assuming semantic versioning) + # Convert to tuples for proper version comparison (e.g., "1.0" -> (1, 0)) + def version_key(v): + try: + return tuple(map(int, v.split('.'))) + except ValueError: + # Fallback for non-numeric versions + return (0, 0) + + latest_version = max(available_versions, key=version_key) + return str(latest_version) + + +def get_default_version_for_template(template: str) -> str: + """ + Get the default version for a template (latest available). 
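+    (For example, a registry exposing versions "1.0" and "1.1" resolves to "1.1".)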
+ + Args: + template: Template name + + Returns: + Default version string + """ + # Check if template exists first + if template not in TEMPLATES: + raise click.ClickException(f"Unknown template: {template}") + + try: + return _get_latest_version_from_registry(template) + except Exception: + raise click.ClickException(f"Could not get the latest version for template: {template}") + + +def _load_schema_for_version(version: str, schema_pkg: str) -> dict: + ver_pkg = f"{schema_pkg}.v{str(version).replace('.', '_')}" + raw = pkgutil.get_data(ver_pkg, "schema.json") + if raw is None: + raise click.ClickException(f"Could not load schema.json for version {version}") + return json.loads(raw) + + +def _get_handler_for_field(template_name, field_name): + """Get appropriate handler for a field using template.field mapping.""" + if template_name and field_name: + scoped_key = f"{template_name}.{field_name}" + handler = SPECIAL_FIELD_HANDLERS.get(scoped_key, DEFAULT_TYPE_HANDLER) + return handler + + return DEFAULT_TYPE_HANDLER + + +def _get_click_option_config(handler, field_type, default=None, required=False, help_text=""): + """Get Click option configuration for any handler.""" + # Handle PydanticUndefined for Click compatibility + from pydantic_core import PydanticUndefined + if default is PydanticUndefined: + default = None + + config = { + "multiple": handler.get('needs_multiple_option', False), + "help": help_text, + } + + # Add defaults and requirements + if default is not None: + config["default"] = default + config["show_default"] = True + # Always set type, callback overrides when needed + config["type"] = to_click_type(field_type) + + # Add callback for special handlers or complex types + if handler != DEFAULT_TYPE_HANDLER or is_complex_type(field_type): + config["callback"] = handler['parse_strings'] + + if is_complex_type(field_type): + config["metavar"] = "JSON" + + return {k: v for k, v in config.items() if v is not None} + + +def generate_click_command() -> Callable: + """ + Decorator that: + - injects -- for every property in the current template's schema (detected from config.yaml) + - only works for configure command, returns minimal decorator for others + """ + + # Only execute full decorator logic for configure command + is_configure_command = len(sys.argv) > 1 and sys.argv[1] == "configure" + + if not is_configure_command: + # Return a minimal decorator that doesn't add any options + def decorator(func: Callable) -> Callable: + return func + return decorator + + config_file = Path(".").resolve() / "config.yaml" + if not config_file.is_file(): + click.secho("❌ No config.yaml found. Run 'hyp init