From 30dac6af541d3db16bf635742d25754a9f5402c1 Mon Sep 17 00:00:00 2001 From: Chris Chan Date: Mon, 12 May 2025 21:24:18 +0000 Subject: [PATCH 1/4] Add CriticalAddonsOnly toleration for all dependencies --- .../templates/health-monitoring-agent.yaml | 2 ++ helm_chart/HyperPodHelmChart/charts/mpi-operator/values.yaml | 4 +++- .../Deployment/training-operator-kubeflow-Deployment.yaml | 5 ++++- helm_chart/HyperPodHelmChart/values.yaml | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml index 342d8437..a9fa8ed1 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml @@ -164,3 +164,5 @@ spec: operator: Exists - effect: NoExecute operator: Exists + - key: CriticalAddonsOnly + operator: Exists diff --git a/helm_chart/HyperPodHelmChart/charts/mpi-operator/values.yaml b/helm_chart/HyperPodHelmChart/charts/mpi-operator/values.yaml index c21fb48c..02f4dc6a 100644 --- a/helm_chart/HyperPodHelmChart/charts/mpi-operator/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/mpi-operator/values.yaml @@ -22,6 +22,8 @@ mpiOperator: ## Tolerations for pod assignment ## Ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ tolerations: + - key: CriticalAddonsOnly + operator: Exists - key: sagemaker.amazonaws.com/node-health-status operator: "Equal" value: "Unschedulable" @@ -35,4 +37,4 @@ mpiOperator: imagePullPolicy: IfNotPresent ## Apply extra labels to all created resources -extraLabels: {} \ No newline at end of file +extraLabels: {} diff --git a/helm_chart/HyperPodHelmChart/charts/training-operators/templates/Deployment/training-operator-kubeflow-Deployment.yaml 
b/helm_chart/HyperPodHelmChart/charts/training-operators/templates/Deployment/training-operator-kubeflow-Deployment.yaml index 3ef20b51..567f3680 100644 --- a/helm_chart/HyperPodHelmChart/charts/training-operators/templates/Deployment/training-operator-kubeflow-Deployment.yaml +++ b/helm_chart/HyperPodHelmChart/charts/training-operators/templates/Deployment/training-operator-kubeflow-Deployment.yaml @@ -54,5 +54,8 @@ spec: timeoutSeconds: 3 securityContext: allowPrivilegeEscalation: false + tolerations: + - key: CriticalAddonsOnly + operator: Exists serviceAccountName: training-operator - terminationGracePeriodSeconds: 10 \ No newline at end of file + terminationGracePeriodSeconds: 10 diff --git a/helm_chart/HyperPodHelmChart/values.yaml b/helm_chart/HyperPodHelmChart/values.yaml index edd5c2fd..a8fea98a 100644 --- a/helm_chart/HyperPodHelmChart/values.yaml +++ b/helm_chart/HyperPodHelmChart/values.yaml @@ -180,6 +180,8 @@ nvidia-device-plugin: operator: Equal value: Unschedulable effect: NoSchedule + - key: CriticalAddonsOnly + operator: Exists neuron-device-plugin: devicePlugin: From e9725a244cbac4e8f523867e4e97f8e13a1ccb96 Mon Sep 17 00:00:00 2001 From: Chris Chan Date: Wed, 14 May 2025 15:15:04 +0000 Subject: [PATCH 2/4] Add installation script for RIG Worker dependencies --- helm_chart/.gitignore | 3 + helm_chart/HyperPodHelmChartForRIG/Chart.yaml | 52 +++++ .../HyperPodHelmChartForRIG/values.yaml | 206 ++++++++++++++++++ helm_chart/install_rig_dependencies.sh | 72 ++++++ 4 files changed, 333 insertions(+) create mode 100644 helm_chart/.gitignore create mode 100644 helm_chart/HyperPodHelmChartForRIG/Chart.yaml create mode 100644 helm_chart/HyperPodHelmChartForRIG/values.yaml create mode 100644 helm_chart/install_rig_dependencies.sh diff --git a/helm_chart/.gitignore b/helm_chart/.gitignore new file mode 100644 index 00000000..ff55b968 --- /dev/null +++ b/helm_chart/.gitignore @@ -0,0 +1,3 @@ +HyperPodHelmChartForRIG/charts/*/templates/ 
+HyperPodHelmChartForRIG/charts/*.tgz +HyperPodHelmChart/charts/*.tgz diff --git a/helm_chart/HyperPodHelmChartForRIG/Chart.yaml b/helm_chart/HyperPodHelmChartForRIG/Chart.yaml new file mode 100644 index 00000000..199823a4 --- /dev/null +++ b/helm_chart/HyperPodHelmChartForRIG/Chart.yaml @@ -0,0 +1,52 @@ +apiVersion: v2 +name: hyperpod-helm-chart-for-rig +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.16.0" + +dependencies: + - name: training-operators + version: "0.1.0" + repository: "file://charts/training-operators" + - name: nvidia-device-plugin + version: "0.16.1" + repository: https://nvidia.github.io/k8s-device-plugin + condition: nvidia-device-plugin.devicePlugin.enabled + - name: aws-efa-k8s-device-plugin + version: "0.5.3" + repository: https://aws.github.io/eks-charts/ + condition: aws-efa-k8s-device-plugin.devicePlugin.enabled + - name: neuron-device-plugin + version: "0.1.0" + repository: "file://charts/neuron-device-plugin" + condition: neuron-device-plugin.devicePlugin.enabled + - name: health-monitoring-agent + version: "0.1.0" + repository: "file://charts/health-monitoring-agent" + condition: health_monitoring_agent.enabled + - name: mpi-operator + version: "0.1.0" + repository: "file://charts/mpi-operator" + condition: mpi_operator.enabled + - name: coredns + version: "0.1.0" + repository: "file://charts/coredns" diff --git a/helm_chart/HyperPodHelmChartForRIG/values.yaml b/helm_chart/HyperPodHelmChartForRIG/values.yaml new file mode 100644 index 00000000..10bd9550 --- /dev/null +++ b/helm_chart/HyperPodHelmChartForRIG/values.yaml @@ -0,0 +1,206 @@ +# OVERRIDE values for HyperPodHelmChart for RIG. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +# value_safe_name of dependencies - see install_rig_dependencies.sh +# +# Note: +# +# The format is add-on: {keys:...} . 
+# Helm will AUTOMATICALLY SCOPE the add-on values when resolving each dependency +# In other words, what is exposed to each dependency template is +# {{ .Values.tolerations }} , NOT {{ .Values.add-on.tolerations }} +coredns: + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + - key: CriticalAddonsOnly + operator: Exists + - effect: NoSchedule + operator: Equal # "Exists" must not be combined with a value + key: "sagemaker.amazonaws.com/RestrictedNode" + value: "Worker" + nodeSelector: + "sagemaker.amazonaws.com/instance-group-type": "Restricted" + +health-monitoring-agent: + tolerations: + - effect: NoSchedule + operator: Exists + - effect: NoExecute + operator: Exists + - effect: NoSchedule + operator: Equal + key: "sagemaker.amazonaws.com/RestrictedNode" + value: "Worker" + nodeSelector: {} + +mpi-operator: + tolerations: + - key: "sagemaker.amazonaws.com/node-health-status" + operator: "Equal" + value: "Unschedulable" + effect: "NoSchedule" + - effect: NoSchedule + operator: Equal + key: "sagemaker.amazonaws.com/RestrictedNode" + value: "Worker" + nodeSelector: + "sagemaker.amazonaws.com/instance-group-type": "Restricted" + +neuron-device-plugin: + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - key: "aws.amazon.com/neuron" + operator: Exists + effect: NoSchedule + - key: "sagemaker.amazonaws.com/node-health-status" + operator: Equal + value: Unschedulable + effect: NoSchedule + - effect: NoSchedule + operator: Equal + key: "sagemaker.amazonaws.com/RestrictedNode" + value: "Worker" + nodeSelector: {} + +training-operators: + tolerations: + - effect: NoSchedule + operator: Equal + key: "sagemaker.amazonaws.com/RestrictedNode" + value: "Worker" + nodeSelector: + "sagemaker.amazonaws.com/instance-group-type": "Restricted" + +# Overrides for externally-maintained Helm charts +aws-efa-k8s-device-plugin: + devicePlugin: + enabled: true + supportedInstanceLabels: + values: + - ml.c5n.9xlarge + - ml.c5n.18xlarge + - ml.g5.8xlarge + - ml.g5.12xlarge + - 
ml.g5.16xlarge + - ml.g5.24xlarge + - ml.g5.48xlarge + - ml.g6.8xlarge + - ml.g6.12xlarge + - ml.g6.16xlarge + - ml.g6.24xlarge + - ml.g6.48xlarge + - ml.g6e.8xlarge + - ml.g6e.12xlarge + - ml.g6e.16xlarge + - ml.g6e.24xlarge + - ml.g6e.48xlarge + - ml.gr6.8xlarge + - ml.i3en.large + - ml.i3en.xlarge + - ml.i3en.2xlarge + - ml.i3en.3xlarge + - ml.i3en.6xlarge + - ml.i3en.12xlarge + - ml.i3en.24xlarge + - ml.m7i.large + - ml.m7i.xlarge + - ml.m7i.2xlarge + - ml.m7i.4xlarge + - ml.m7i.8xlarge + - ml.m7i.12xlarge + - ml.m7i.16xlarge + - ml.m7i.24xlarge + - ml.m7i.48xlarge + - ml.p4d.24xlarge + - ml.p4de.24xlarge + - ml.p5.48xlarge + - ml.p5e.48xlarge + - ml.p5en.48xlarge + - ml.r7i.large + - ml.r7i.xlarge + - ml.r7i.2xlarge + - ml.r7i.4xlarge + - ml.r7i.8xlarge + - ml.r7i.12xlarge + - ml.r7i.16xlarge + - ml.r7i.24xlarge + - ml.r7i.48xlarge + - ml.trn1.32xlarge + - ml.trn1n.32xlarge + - ml.trn2.48xlarge + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - effect: NoSchedule + key: aws.amazon.com/efa + operator: Exists + - key: sagemaker.amazonaws.com/node-health-status + operator: "Equal" + value: "Unschedulable" + effect: "NoSchedule" + - effect: NoSchedule + operator: Equal # "Exists" must not be combined with a value + key: "sagemaker.amazonaws.com/RestrictedNode" + value: "Worker" + nodeSelector: {} + +nvidia-device-plugin: + devicePlugin: + enabled: true + allowDefaultNamespace: true + namespaceOverride: "kube-system" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - ml.g5.xlarge + - ml.g5.2xlarge + - ml.g5.4xlarge + - ml.g5.8xlarge + - ml.g5.12xlarge + - ml.g5.16xlarge + - ml.g5.24xlarge + - ml.g5.48xlarge + - ml.g6.xlarge + - ml.g6.2xlarge + - ml.g6.4xlarge + - ml.g6.8xlarge + - ml.g6.16xlarge + - ml.g6.12xlarge + - ml.g6.24xlarge + - ml.g6.48xlarge + - ml.g6e.xlarge + - ml.g6e.2xlarge + - ml.g6e.4xlarge + - ml.g6e.8xlarge + - ml.g6e.12xlarge + - 
ml.g6e.16xlarge + - ml.g6e.24xlarge + - ml.g6e.48xlarge + - ml.gr6.4xlarge + - ml.gr6.8xlarge + - ml.p4d.24xlarge + - ml.p4de.24xlarge + - ml.p5.48xlarge + - ml.p5e.48xlarge + - ml.p5en.48xlarge + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: sagemaker.amazonaws.com/node-health-status + operator: Equal + value: Unschedulable + effect: NoSchedule + - effect: NoSchedule + operator: Equal # "Exists" must not be combined with a value + key: "sagemaker.amazonaws.com/RestrictedNode" + value: "Worker" + nodeSelector: {} diff --git a/helm_chart/install_rig_dependencies.sh b/helm_chart/install_rig_dependencies.sh new file mode 100644 index 00000000..f47ceaf2 --- /dev/null +++ b/helm_chart/install_rig_dependencies.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +SRC_DIR="HyperPodHelmChart" +OUTPUT_DIR="HyperPodHelmChartForRIG" + +# Format: "scope,namespace,name" (scope is "eks" or "hp") +add_ons=( + "eks,kube-system,coredns" + "hp,kube-system,mpi-operator" + "hp,kube-system,neuron-device-plugin" + "hp,kube-system,health-monitoring-agent" + "hp,kube-system,training-operators" +) + +fetch_yaml_and_enable_overrides() { + local resources=("${!1}") + + rm -rf $OUTPUT_DIR/charts && mkdir -p $OUTPUT_DIR/charts # recreate so the first cp -r below has a parent dir + + for resource in "${resources[@]}"; do + IFS=',' read -r scope namespace name <<< "$resource" + echo "Processing $scope add-on called $name in namespace $namespace..." 
+ + value_safe_name=${name//-/_} # Convert hyphens to underscores + cp -r $SRC_DIR/charts/$name $OUTPUT_DIR/charts/$name + rm -rf $OUTPUT_DIR/charts/$name/templates + rm -f $OUTPUT_DIR/charts/$name/*.tgz + mkdir -p $OUTPUT_DIR/charts/$name/templates + + if [ "$scope" = "eks" ]; then + kubectl get deployment $name -n $namespace -o yaml | \ + yq 'select(.kind == "Deployment" or .kind == "DaemonSet")' - | yq e " + .spec.template.spec.nodeSelector = \"NODESELECTORS\" | + .spec.template.spec.tolerations = \"TOLERATIONS\" + " - | \ + sed "s/NODESELECTORS/\n{{ toYaml (index .Values \"nodeSelector\") | indent 8 }}/" | + sed "s/TOLERATIONS/\n{{ toYaml (index .Values \"tolerations\" ) | indent 8 }}/" \ + > $OUTPUT_DIR/charts/$name/templates/$name.yml + + + cat << EOF > $OUTPUT_DIR/charts/$name/Chart.yaml +apiVersion: v2 +name: $name +version: 0.1.0 +appVersion: "1.0" +description: A Helm chart for setting up $name in RIG Workers +EOF + + + else + helm template $name $SRC_DIR/charts/$name -f $SRC_DIR/values.yaml -f $SRC_DIR/charts/$name/values.yaml --debug | \ + yq 'select(.kind == "Deployment" or .kind == "DaemonSet")' - | yq e " + .spec.template.spec.nodeSelector = \"NODESELECTORS\" | + .spec.template.spec.tolerations = \"TOLERATIONS\" + " - | \ + sed "s/NODESELECTORS/\n{{ toYaml (index .Values \"nodeSelector\") | indent 8 }}/" | + sed "s/TOLERATIONS/\n{{ toYaml (index .Values \"tolerations\" ) | indent 8 }}/" \ + > $OUTPUT_DIR/charts/$name/templates/$name.yml + fi + done +} + +if ! command -v yq &> /dev/null; then + echo "Error: yq is required but not installed." + exit 1 +fi +fetch_yaml_and_enable_overrides add_ons[@] +helm dependencies update ./HyperPodHelmChartForRIG # This needs to be run after any dependency template change before "helm