|
2 | 2 | from typing import Optional, Literal, List, Any |
3 | 3 |
|
class ClusterStackBase(BaseModel):
    """Optional settings for a cluster stack, each with a declared default.

    Every field is ``Optional`` and is declared through ``Field`` with a
    default value and a human-readable description. List-like and JSON
    values are carried as plain strings (comma-separated or JSON-encoded,
    as the individual descriptions state); nothing in this class parses or
    validates them.

    NOTE(review): ``BaseModel`` and ``Field`` are presumably imported from
    pydantic at the top of the module -- the import is not visible here.
    """

    # --- General -----------------------------------------------------------
    stage: Optional[str] = Field(
        "gamma", description="Deployment stage (gamma, prod)"
    )
    enable_hp_inference_feature: Optional[str] = Field(
        None, description="Feature flag for enabling HP inference"
    )
    custom_bucket_name: Optional[str] = Field(
        None, description="Custom S3 bucket name for templates"
    )
    resource_name_prefix: Optional[str] = Field(
        "sagemaker-hyperpod-eks",
        description="Prefix to be used for all resources",
    )

    # --- Networking --------------------------------------------------------
    vpc_cidr: Optional[str] = Field(
        None, description="The IP range (CIDR notation) for the VPC"
    )
    availability_zone_ids: Optional[str] = Field(
        None, description="List of AZs to deploy subnets in"
    )
    vpc_id: Optional[str] = Field(None, description="The ID of the VPC")
    nat_gateway_ids: Optional[str] = Field(
        None, description="Comma-separated list of NAT Gateway IDs"
    )
    # Defaults to an empty string rather than None.
    security_group_id: Optional[str] = Field(
        "", description="The ID of the security group"
    )

    # --- EKS ---------------------------------------------------------------
    # Kept as a string so the version text is preserved exactly as written.
    kubernetes_version: Optional[str] = Field(
        "1.31", description="The Kubernetes version"
    )
    node_provisioning_mode: Optional[str] = Field(
        None, description="The node provisioning mode"
    )
    eks_cluster_name: Optional[str] = Field(
        "eks", description="The name of the EKS cluster"
    )
    eks_private_subnet_ids: Optional[str] = Field(
        None, description="Comma-delimited list of private subnet IDs"
    )
    security_group_ids: Optional[str] = Field(
        None, description="The Id of your cluster security group"
    )
    private_route_table_ids: Optional[str] = Field(
        None, description="Comma-separated list of private route table IDs"
    )

    # --- S3 / lifecycle script / Helm --------------------------------------
    s3_bucket_name: Optional[str] = Field(
        None, description="The name of the S3 bucket"
    )
    github_raw_url: Optional[str] = Field(
        "https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh",
        description="The raw GitHub URL for the lifecycle script",
    )
    helm_repo_url: Optional[str] = Field(
        "https://github.com/aws/sagemaker-hyperpod-cli.git",
        description="The URL of the Helm repo",
    )
    helm_repo_path: Optional[str] = Field(
        "helm_chart/HyperPodHelmChart",
        description="The path to the HyperPod Helm chart",
    )
    helm_operators: Optional[str] = Field(
        "", description="The configuration of HyperPod Helm chart"
    )
    namespace: Optional[str] = Field(
        "kube-system",
        description="The namespace to deploy the HyperPod Helm chart",
    )
    helm_release: Optional[str] = Field(
        "hyperpod-dependencies", description="The name of the Helm release"
    )

    # --- HyperPod cluster --------------------------------------------------
    hyperpod_cluster_name: Optional[str] = Field(
        "hp-cluster", description="Name of SageMaker HyperPod Cluster"
    )
    node_recovery: Optional[str] = Field(
        "Automatic", description="Instance recovery setting"
    )
    sagemaker_iam_role_name: Optional[str] = Field(
        "iam-role", description="The name of the IAM role"
    )
    private_subnet_ids: Optional[str] = Field(
        None, description="Comma-separated list of private subnet IDs"
    )
    # NOTE(review): the description says "file name of lifecycle script" but
    # the default reads like a bucket name -- confirm this value is intended.
    on_create_path: Optional[str] = Field(
        "sagemaker-hyperpod-eks-bucket",
        description="The file name of lifecycle script",
    )
    # The default is a JSON array (as a string) describing two instance groups.
    instance_group_settings: Optional[str] = Field(
        '[{"InstanceCount":1,"InstanceGroupName":"ig-1","InstanceStorageConfigs":[],"InstanceType":"ml.t3.medium","ThreadsPerCore":1},{"InstanceCount":1,"InstanceGroupName":"ig-2","InstanceStorageConfigs":[],"InstanceType":"ml.t3.medium","ThreadsPerCore":1}]',
        description="JSON array string containing instance group configurations",
    )
    rig_settings: Optional[str] = Field(
        "",
        description="JSON array string containing restricted instance group configurations",
    )
    rig_s3_bucket_name: Optional[str] = Field(
        None, description="The name of the S3 bucket for RIG resources"
    )
    tags: Optional[str] = Field(
        "",
        description="Custom tags for managing the SageMaker HyperPod cluster",
    )

    # --- FSx ---------------------------------------------------------------
    fsx_subnet_id: Optional[str] = Field(
        "", description="The subnet id for FSx"
    )
    fsx_availability_zone_id: Optional[str] = Field(
        None, description="The availability zone for FSx"
    )
    per_unit_storage_throughput: Optional[int] = Field(
        250, description="Per unit storage throughput"
    )
    data_compression_type: Optional[str] = Field(
        None, description="Data compression type"
    )
    # NOTE(review): typed as float, so a version written "2.10" would become
    # 2.1; a string type may be safer if such versions must be representable.
    file_system_type_version: Optional[float] = Field(
        2.15, description="File system type version"
    )
    storage_capacity: Optional[int] = Field(
        1200, description="Storage capacity in GiB"
    )
    fsx_file_system_id: Optional[str] = Field(
        "", description="Existing FSx file system ID"
    )

    # --- Per-stack creation toggles (all default to True) ------------------
    create_vpc_stack: Optional[bool] = Field(
        True, description="Boolean to Create VPC Stack"
    )
    create_security_group_stack: Optional[bool] = Field(
        True, description="Boolean to Create Security Group Stack"
    )
    create_eks_cluster_stack: Optional[bool] = Field(
        True, description="Boolean to Create EKS Cluster Stack"
    )
    create_s3_bucket_stack: Optional[bool] = Field(
        True, description="Boolean to Create S3 Bucket Stack"
    )
    create_s3_endpoint_stack: Optional[bool] = Field(
        True, description="Boolean to Create S3 Endpoint Stack"
    )
    create_life_cycle_script_stack: Optional[bool] = Field(
        True, description="Boolean to Create Life Cycle Script Stack"
    )
    create_sagemaker_iam_role_stack: Optional[bool] = Field(
        True, description="Boolean to Create SageMaker IAM Role Stack"
    )
    create_helm_chart_stack: Optional[bool] = Field(
        True, description="Boolean to Create Helm Chart Stack"
    )
    create_hyperpod_cluster_stack: Optional[bool] = Field(
        True, description="Boolean to Create HyperPod Cluster Stack"
    )
    create_fsx_stack: Optional[bool] = Field(
        True, description="Boolean to Create FSx Stack"
    )
0 commit comments