Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions helm_chart/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
HyperPodHelmChartForRIG/charts/*/templates/
HyperPodHelmChartForRIG/charts/*.tgz
HyperPodHelmChart/charts/*.tgz
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,5 @@ spec:
operator: Exists
- effect: NoExecute
operator: Exists
- key: CriticalAddonsOnly
operator: Exists
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ mpiOperator:
## Tolerations for pod assignment
## Ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: sagemaker.amazonaws.com/node-health-status
operator: "Equal"
value: "Unschedulable"
Expand All @@ -35,4 +37,4 @@ mpiOperator:
imagePullPolicy: IfNotPresent

## Apply extra labels to all created resources
extraLabels: {}
extraLabels: {}
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,8 @@ spec:
timeoutSeconds: 3
securityContext:
allowPrivilegeEscalation: false
tolerations:
- key: CriticalAddonsOnly
operator: Exists
serviceAccountName: training-operator
terminationGracePeriodSeconds: 10
terminationGracePeriodSeconds: 10
2 changes: 2 additions & 0 deletions helm_chart/HyperPodHelmChart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ nvidia-device-plugin:
operator: Equal
value: Unschedulable
effect: NoSchedule
- key: CriticalAddonsOnly
operator: Exists

neuron-device-plugin:
devicePlugin:
Expand Down
48 changes: 48 additions & 0 deletions helm_chart/HyperPodHelmChartForRIG/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
apiVersion: v2
name: hyperpod-helm-chart-for-rig
description: A Helm chart for Kubernetes

# Chart kind. An 'application' chart ships templates that can be packaged and
# deployed; a 'library' chart only provides helpers to other charts and cannot
# be deployed on its own.
type: application

# Version of the chart itself. Follows Semantic Versioning
# (https://semver.org/); bump it on every change to the chart or its
# templates, including appVersion changes.
version: 0.1.0

# Version of the application being deployed. Not required to follow Semantic
# Versioning; kept in quotes so it is always read as a string.
appVersion: '1.16.0'

# Sub-charts of this chart. 'file://' repositories are resolved relative to
# this chart's directory; the others are remote chart repositories.
# A 'condition' names a values path that switches the dependency on or off.
# NOTE(review): the RIG values.yaml defines neither
# 'neuron-device-plugin.devicePlugin.enabled' nor 'mpi_operator.enabled'
# (its key is 'mpi-operator') - confirm these condition paths are intended.
dependencies:
  - name: training-operators
    version: '0.1.0'
    repository: 'file://charts/training-operators'
  - name: nvidia-device-plugin
    version: '0.16.1'
    repository: 'https://nvidia.github.io/k8s-device-plugin'
    condition: nvidia-device-plugin.devicePlugin.enabled
  - name: aws-efa-k8s-device-plugin
    version: '0.5.3'
    repository: 'https://aws.github.io/eks-charts/'
    condition: aws-efa-k8s-device-plugin.devicePlugin.enabled
  - name: neuron-device-plugin
    version: '0.1.0'
    repository: 'file://charts/neuron-device-plugin'
    condition: neuron-device-plugin.devicePlugin.enabled
  - name: mpi-operator
    version: '0.1.0'
    repository: 'file://charts/mpi-operator'
    condition: mpi_operator.enabled
  - name: coredns
    version: '0.1.0'
    repository: 'file://charts/coredns'
196 changes: 196 additions & 0 deletions helm_chart/HyperPodHelmChartForRIG/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# OVERRIDE values for HyperPodHelmChart for RIG.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# Top-level keys are the dependency (sub-chart) names - see Chart.yaml and install_rig_dependencies.sh
#
# Note:
#
# The format is `add-on: {keys: ...}`.
# Helm will AUTOMATICALLY SCOPE the add-on values when resolving each dependency.
# In other words, what is exposed to each dependency template is
# {{ .Values.tolerations }}, NOT {{ .Values.add-on.tolerations }}.
# Scheduling overrides handed to the coredns sub-chart.
coredns:
  tolerations:
    # No operator/value given for the control-plane taint.
    - key: node-role.kubernetes.io/control-plane
      effect: NoSchedule
    - key: CriticalAddonsOnly
      operator: Exists
    # Tolerate the RestrictedNode=Worker:NoSchedule taint.
    - key: 'sagemaker.amazonaws.com/RestrictedNode'
      operator: Equal
      value: 'Worker'
      effect: NoSchedule
  # Only nodes carrying this label are selected.
  nodeSelector:
    'sagemaker.amazonaws.com/instance-group-type': 'Restricted'

# Scheduling overrides handed to the mpi-operator sub-chart.
mpi-operator:
  tolerations:
    # Tolerate the node-health-status=Unschedulable:NoSchedule taint.
    - key: 'sagemaker.amazonaws.com/node-health-status'
      operator: Equal
      value: 'Unschedulable'
      effect: NoSchedule
    # Tolerate the RestrictedNode=Worker:NoSchedule taint.
    - key: 'sagemaker.amazonaws.com/RestrictedNode'
      operator: Equal
      value: 'Worker'
      effect: NoSchedule
  # Only nodes carrying this label are selected.
  nodeSelector:
    'sagemaker.amazonaws.com/instance-group-type': 'Restricted'

# Scheduling overrides handed to the neuron-device-plugin sub-chart.
neuron-device-plugin:
  tolerations:
    - key: CriticalAddonsOnly
      operator: Exists
    - key: 'aws.amazon.com/neuron'
      operator: Exists
      effect: NoSchedule
    # Tolerate the node-health-status=Unschedulable:NoSchedule taint.
    - key: 'sagemaker.amazonaws.com/node-health-status'
      operator: Equal
      value: Unschedulable
      effect: NoSchedule
    # Tolerate the RestrictedNode=Worker:NoSchedule taint.
    - key: 'sagemaker.amazonaws.com/RestrictedNode'
      operator: Equal
      value: 'Worker'
      effect: NoSchedule
  # Empty selector: no node label is required here, unlike the other add-ons
  # in this file. NOTE(review): confirm this is intentional.
  nodeSelector: {}

# Scheduling overrides handed to the training-operators sub-chart.
training-operators:
  tolerations:
    # Tolerate the RestrictedNode=Worker:NoSchedule taint.
    - key: 'sagemaker.amazonaws.com/RestrictedNode'
      operator: Equal
      value: 'Worker'
      effect: NoSchedule
  # Only nodes carrying this label are selected.
  nodeSelector:
    'sagemaker.amazonaws.com/instance-group-type': 'Restricted'

# Overrides for externally-maintained Helm charts
# Values for the aws-efa-k8s-device-plugin dependency.
# NOTE(review): nesting below was reconstructed from a flattened listing -
# verify against the chart's own values schema.
aws-efa-k8s-device-plugin:
  # Matches the 'aws-efa-k8s-device-plugin.devicePlugin.enabled' condition
  # declared for this dependency in Chart.yaml.
  devicePlugin:
    enabled: true
  # Instance types listed for the plugin.
  supportedInstanceLabels:
    values:
      - ml.c5n.9xlarge
      - ml.c5n.18xlarge
      - ml.g5.8xlarge
      - ml.g5.12xlarge
      - ml.g5.16xlarge
      - ml.g5.24xlarge
      - ml.g5.48xlarge
      - ml.g6.8xlarge
      - ml.g6.12xlarge
      - ml.g6.16xlarge
      - ml.g6.24xlarge
      - ml.g6.48xlarge
      - ml.g6e.8xlarge
      - ml.g6e.12xlarge
      - ml.g6e.16xlarge
      - ml.g6e.24xlarge
      - ml.g6e.48xlarge
      - ml.gr6.8xlarge
      - ml.i3en.large
      - ml.i3en.xlarge
      - ml.i3en.2xlarge
      - ml.i3en.3xlarge
      - ml.i3en.6xlarge
      - ml.i3en.12xlarge
      - ml.i3en.24xlarge
      - ml.m7i.large
      - ml.m7i.xlarge
      - ml.m7i.2xlarge
      - ml.m7i.4xlarge
      - ml.m7i.8xlarge
      - ml.m7i.12xlarge
      - ml.m7i.16xlarge
      - ml.m7i.24xlarge
      - ml.m7i.48xlarge
      - ml.p4d.24xlarge
      - ml.p4de.24xlarge
      - ml.p5.48xlarge
      - ml.p5e.48xlarge
      - ml.p5en.48xlarge
      - ml.r7i.large
      - ml.r7i.xlarge
      - ml.r7i.2xlarge
      - ml.r7i.4xlarge
      - ml.r7i.8xlarge
      - ml.r7i.12xlarge
      - ml.r7i.16xlarge
      - ml.r7i.24xlarge
      - ml.r7i.48xlarge
      - ml.trn1.32xlarge
      - ml.trn1n.32xlarge
      - ml.trn2.48xlarge
  tolerations:
    - key: CriticalAddonsOnly
      operator: Exists
    - key: aws.amazon.com/efa
      operator: Exists
      effect: NoSchedule
    # Tolerate the node-health-status=Unschedulable:NoSchedule taint.
    - key: sagemaker.amazonaws.com/node-health-status
      operator: Equal
      value: 'Unschedulable'
      effect: NoSchedule
    # Tolerate the RestrictedNode=Worker:NoSchedule taint.
    - key: 'sagemaker.amazonaws.com/RestrictedNode'
      operator: Equal
      value: 'Worker'
      effect: NoSchedule
  # Only nodes carrying this label are selected.
  nodeSelector:
    'sagemaker.amazonaws.com/instance-group-type': 'Restricted'

# Values for the nvidia-device-plugin dependency.
# NOTE(review): nesting below was reconstructed from a flattened listing -
# verify against the chart's own values schema.
nvidia-device-plugin:
  # Matches the 'nvidia-device-plugin.devicePlugin.enabled' condition
  # declared for this dependency in Chart.yaml.
  devicePlugin:
    enabled: true
  allowDefaultNamespace: true
  namespaceOverride: 'kube-system'
  # Hard scheduling requirement: the node's instance-type label must be one
  # of the values listed.
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: node.kubernetes.io/instance-type
                operator: In
                values:
                  - ml.g5.xlarge
                  - ml.g5.2xlarge
                  - ml.g5.4xlarge
                  - ml.g5.8xlarge
                  - ml.g5.12xlarge
                  - ml.g5.16xlarge
                  - ml.g5.24xlarge
                  - ml.g5.48xlarge
                  - ml.g6.xlarge
                  - ml.g6.2xlarge
                  - ml.g6.4xlarge
                  - ml.g6.8xlarge
                  - ml.g6.16xlarge
                  - ml.g6.12xlarge
                  - ml.g6.24xlarge
                  - ml.g6.48xlarge
                  - ml.g6e.xlarge
                  - ml.g6e.2xlarge
                  - ml.g6e.4xlarge
                  - ml.g6e.8xlarge
                  - ml.g6e.12xlarge
                  - ml.g6e.16xlarge
                  - ml.g6e.24xlarge
                  - ml.g6e.48xlarge
                  - ml.gr6.4xlarge
                  - ml.gr6.8xlarge
                  - ml.p4d.24xlarge
                  - ml.p4de.24xlarge
                  - ml.p5.48xlarge
                  - ml.p5e.48xlarge
                  - ml.p5en.48xlarge
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    # Tolerate the node-health-status=Unschedulable:NoSchedule taint.
    - key: sagemaker.amazonaws.com/node-health-status
      operator: Equal
      value: Unschedulable
      effect: NoSchedule
    # Tolerate the RestrictedNode=Worker:NoSchedule taint.
    - key: 'sagemaker.amazonaws.com/RestrictedNode'
      operator: Equal
      value: 'Worker'
      effect: NoSchedule
  # Only nodes carrying this label are selected.
  nodeSelector:
    'sagemaker.amazonaws.com/instance-group-type': 'Restricted'
84 changes: 84 additions & 0 deletions helm_chart/install_rig_dependencies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/bin/bash

# Chart directory whose sub-charts (charts/<name>) and values.yaml are read.
SRC_DIR="HyperPodHelmChart"
# Chart directory that receives the generated sub-charts under charts/.
OUTPUT_DIR="HyperPodHelmChartForRIG"

# Add-ons to generate, one "<scope>,<namespace>,<name>" entry each:
#   scope     - "eks": the Deployment is fetched from the cluster with kubectl.
#               Any other value (here "hp"): rendered with `helm template`
#               from $SRC_DIR/charts/<name>.
#   namespace - namespace passed to kubectl for "eks" add-ons.
#   name      - Kubernetes resource name and chart directory name.
add_ons=(
"eks,kube-system,coredns"
"hp,kube-system,mpi-operator"
"hp,kube-system,neuron-device-plugin"
"hp,kube-system,training-operators"
)

# Turn Kubernetes manifests read on stdin into a Helm template on stdout.
#
# Keeps only Deployment/DaemonSet documents, prefixes their name with "rig-",
# and replaces the pod spec's nodeSelector and tolerations with toYaml
# expressions that read the sub-chart's "nodeSelector" and "tolerations" values.
_render_rig_template() {
    # yq first writes placeholder strings; sed then swaps each placeholder for
    # a newline followed by the template expression, so the rendered YAML block
    # lands on its own lines below the key.
    yq 'select(.kind == "Deployment" or .kind == "DaemonSet")' - | yq e '
        .metadata.name = "rig-" + .metadata.name |
        .spec.template.spec.nodeSelector = "NODESELECTORS" |
        .spec.template.spec.tolerations = "TOLERATIONS"
    ' - | sed \
        -e 's/NODESELECTORS/\n{{ toYaml (index .Values "nodeSelector") | indent 8 }}/' \
        -e 's/TOLERATIONS/\n{{ toYaml (index .Values "tolerations" ) | indent 8 }}/'
}

# Regenerate $OUTPUT_DIR/charts/<name> for every listed add-on.
#
# Arguments:
#   $1 - indirect array reference (e.g. add_ons[@]) whose elements have the
#        form "<scope>,<namespace>,<name>".
# Globals read: SRC_DIR, OUTPUT_DIR.
# Returns: non-zero as soon as a source manifest cannot be obtained.
fetch_yaml_and_enable_overrides() {
    local resources=("${!1}")
    local resource scope namespace name chart_dir manifest

    # Start from a clean output tree. ":?" aborts instead of running
    # "rm -rf /charts" should OUTPUT_DIR ever be empty.
    rm -rf "${OUTPUT_DIR:?}/charts"
    # Recreate the parent, otherwise the first "cp -r" below has nowhere to go.
    mkdir -p "$OUTPUT_DIR/charts"

    for resource in "${resources[@]}"; do
        IFS=',' read -r scope namespace name <<< "$resource"
        echo "Processing $scope add-on called $name in namespace $namespace..."

        chart_dir="$OUTPUT_DIR/charts/$name"

        # Seed from the source chart when one exists; add-ons without a source
        # chart get a fresh directory from the mkdir below.
        if [ -d "$SRC_DIR/charts/$name" ]; then
            cp -r "$SRC_DIR/charts/$name" "$chart_dir"
        fi
        # Drop copied templates and packaged archives; a single generated
        # template replaces them.
        rm -rf "$chart_dir/templates"
        rm -f "$chart_dir"/*.tgz
        mkdir -p "$chart_dir/templates"

        if [ "$scope" = "eks" ]; then
            # Add-on is read from the live cluster.
            if ! manifest=$(kubectl get deployment "$name" -n "$namespace" -o yaml); then
                echo "Error: could not fetch deployment $name from namespace $namespace." >&2
                return 1
            fi

            # Write the chart metadata for this add-on here.
            # appVersion is quoted so it is read as a string, not a float.
            printf '%s\n' \
                "apiVersion: v2" \
                "name: $name" \
                "version: 0.1.0" \
                'appVersion: "1.0"' \
                "description: A Helm chart for setting up $name in RIG Workers" \
                > "$chart_dir/Chart.yaml"
        else
            # Add-on is rendered from the source chart with both values files.
            if ! manifest=$(helm template dependencies "$SRC_DIR/charts/$name" \
                    -f "$SRC_DIR/values.yaml" \
                    -f "$SRC_DIR/charts/$name/values.yaml" --debug); then
                echo "Error: could not render chart $SRC_DIR/charts/$name." >&2
                return 1
            fi
        fi

        printf '%s\n' "$manifest" | _render_rig_template \
            > "$chart_dir/templates/$name.yml"
    done
}

# Fail fast when a required CLI is missing: yq and helm are always used, and
# kubectl is needed for the "eks" add-on entries.
for tool in yq helm kubectl; do
    if ! command -v "$tool" &> /dev/null; then
        echo "Error: $tool is required but not installed." >&2
        exit 1
    fi
done

# Generate the sub-chart templates; stop rather than offer to install a
# chart built from a failed generation step.
if ! fetch_yaml_and_enable_overrides add_ons[@]; then
    echo "Error: failed to generate templates in $OUTPUT_DIR." >&2
    exit 1
fi

chart_path="./$OUTPUT_DIR"

# This needs to be run after any dependency template change before "helm <template | install>"
helm dependencies update "$chart_path" || exit 1

# Render the full chart to a file and show it so the user can review what
# would be installed.
helm template rig-dependencies "$chart_path" --namespace kube-system \
    -f "$chart_path/values.yaml" > rig-dependencies.yaml || exit 1
cat rig-dependencies.yaml
echo
# -r keeps backslashes in the answer literal.
read -r -p "🚀 Do you want to install this Helm chart? [y/N]: " confirm

# Only a single "y" or "Y" confirms; anything else (including EOF) cancels.
if [[ "$confirm" =~ ^[Yy]$ ]]; then
    echo "🔧 Installing Helm chart..."
    helm install rig-dependencies "$chart_path" --namespace kube-system \
        -f "$chart_path/values.yaml" || exit 1
else
    echo "❌ Installation cancelled."
fi

echo "Templates generated in $OUTPUT_DIR"
echo ""
echo ""
Loading