Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4249c1f
Add installation script for RIG Worker dependencies
chnnmz May 12, 2025
152b3c6
Apply *.nonrig.yaml separate from Helm chart
chnnmz Jun 9, 2025
2f2affc
Correct image override for aws-node
chnnmz Jun 9, 2025
393ed27
Temporarily only enable coredns and aws-node
chnnmz Jun 12, 2025
2aa3372
Temporarily only enable coredns and aws-node and nvidia in Chart.yaml
chnnmz Jun 12, 2025
edc13bb
Convert CoreDNS to a Daemonset so every RIG will have CoreDNS
chnnmz Jun 18, 2025
cdd484f
Only use maxUnavailable for rollingUpdate CoreDNS
chnnmz Jun 18, 2025
8fde2de
Update tolerations and nodeSelector
chnnmz Jun 18, 2025
caa62df
Try using helm hook weight to order deployments
chnnmz Jun 18, 2025
f833031
Patch training-operators so we guarantee it exists
chnnmz Jun 20, 2025
1fd8ac0
Re-enable aws-node and coredns
chnnmz Jun 20, 2025
cce23a3
Uncomment more
chnnmz Jun 20, 2025
cc60e43
Warn user about CNI start up + Add more info about yq v4 required + P…
chnnmz Jun 23, 2025
b638a5d
Minor comment about training-operators patch command
chnnmz Jun 24, 2025
0543363
Override images only if below certain version for CNI
chnnmz Jun 24, 2025
38401b1
Modify to upgrade --install
chnnmz Jun 24, 2025
cb5c999
Update error message if add-on isn't enabled
chnnmz Jun 25, 2025
dc9f974
Update initContainer too for CNI
chnnmz Jun 25, 2025
31abeed
Add EFA patch for tolerations (daemonset)
chnnmz Jun 25, 2025
dea32ce
For EFA, need to override image since not done like CNI
chnnmz Jun 26, 2025
70d677d
Assert supported region (especially important for EFA image override …
chnnmz Jun 26, 2025
2149e8f
Minor cleanup
chnnmz Jun 26, 2025
2c72178
Add warning messages and confirmation about re-installing. Still need…
chnnmz Jun 27, 2025
84a48d3
Add annotations for each patch/deployment/installation
chnnmz Jun 27, 2025
e26e9d1
Skip patching if we already see the RIG annotation
chnnmz Jun 27, 2025
d2453d6
Add notes about HMA patching
chnnmz Jun 27, 2025
38cda1b
Merge branch 'aws:main' into rig-dev
chnnmz Jul 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions helm_chart/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# templates/ directories of the RIG chart's subcharts are not tracked.
# NOTE(review): presumably these are generated by the RIG install script — confirm.
HyperPodHelmChartForRIG/charts/*/templates/
# Packaged chart archives (*.tgz) inside either chart's charts/ directory.
HyperPodHelmChartForRIG/charts/*.tgz
HyperPodHelmChart/charts/*.tgz
26 changes: 26 additions & 0 deletions helm_chart/HyperPodHelmChartForRIG/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
# HyperPod: keep *.nonrig manifests out of the packaged chart.
# NOTE(review): per the change history these are applied separately from the
# Helm chart — confirm against the install script.
*.nonrig.yaml
*.nonrig.yml
33 changes: 33 additions & 0 deletions helm_chart/HyperPodHelmChartForRIG/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
apiVersion: v2
name: hyperpod-helm-chart-for-rig
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"

# Subcharts pulled in by this chart. Each entry is one mapping in the list;
# the per-dependency keys must be nested under their "- name:" item so they
# do not collide with the chart-level "version" above.
dependencies:
  # Fetched from the NVIDIA chart repository; only enabled when the values
  # flag nvidia-device-plugin.devicePlugin.enabled is true.
  - name: nvidia-device-plugin
    version: "0.16.1"
    repository: https://nvidia.github.io/k8s-device-plugin
    condition: nvidia-device-plugin.devicePlugin.enabled
  # Local subchart resolved from charts/coredns; it has no condition key.
  - name: coredns
    version: "0.1.0"
    repository: "file://charts/coredns"
205 changes: 205 additions & 0 deletions helm_chart/HyperPodHelmChartForRIG/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
# OVERRIDE values for HyperPodHelmChart for RIG.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# value_safe_name of dependencies - see install_rig_dependencies.sh
#
# Note:
#
# The format is add-on: {keys:...}.
# Helm will AUTOMATICALLY SCOPE the add-on values when resolving each dependency.
# In other words, what is exposed to each dependency template is
# {{ .Values.tolerations }}, NOT {{ .Values.add-on.tolerations }}


# This is a special case. We will not use values.yaml to override the values for this add-on.
# It will still be rendered by Helm using special logic.
# See install_rig_dependencies.sh
#aws-cni:


# Overrides scoped to the coredns subchart (exposed to it as .Values.tolerations
# and .Values.nodeSelector).
coredns:
  tolerations:
    # NoSchedule taint keyed on the control-plane node role.
    - effect: NoSchedule
      key: node-role.kubernetes.io/control-plane
    # Any taint keyed CriticalAddonsOnly, regardless of value or effect.
    - key: CriticalAddonsOnly
      operator: Exists
    # The RestrictedNode=Worker:NoSchedule taint.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  # Only select nodes labelled as belonging to a Restricted instance group.
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"

# Overrides scoped to the mpi-operator add-on.
mpi-operator:
  tolerations:
    # The node-health-status=Unschedulable:NoSchedule taint.
    - key: "sagemaker.amazonaws.com/node-health-status"
      operator: "Equal"
      value: "Unschedulable"
      effect: "NoSchedule"
    # The RestrictedNode=Worker:NoSchedule taint.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  # Only select nodes labelled as belonging to a Restricted instance group.
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"

# Overrides scoped to the neuron-device-plugin add-on.
neuron-device-plugin:
  tolerations:
    # Any taint keyed CriticalAddonsOnly, regardless of value or effect.
    - key: CriticalAddonsOnly
      operator: Exists
    # Any NoSchedule taint keyed aws.amazon.com/neuron.
    - key: "aws.amazon.com/neuron"
      operator: Exists
      effect: NoSchedule
    # The node-health-status=Unschedulable:NoSchedule taint.
    - key: "sagemaker.amazonaws.com/node-health-status"
      operator: Equal
      value: Unschedulable
      effect: NoSchedule
    # The RestrictedNode=Worker:NoSchedule taint.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  # Only select nodes labelled as belonging to a Restricted instance group.
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"

# Overrides scoped to the training-operators add-on.
training-operators:
  tolerations:
    # The RestrictedNode=Worker:NoSchedule taint.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  # Only select nodes labelled as belonging to a Restricted instance group.
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"

# Overrides for externally-maintained Helm charts
aws-efa-k8s-device-plugin:
  devicePlugin:
    enabled: true
  # Instance-type label values listed for the EFA device plugin.
  # NOTE(review): nesting under supportedInstanceLabels.values is reconstructed
  # from the flattened source — confirm against the upstream chart's values.
  supportedInstanceLabels:
    values:
      - ml.c5n.9xlarge
      - ml.c5n.18xlarge
      - ml.g5.8xlarge
      - ml.g5.12xlarge
      - ml.g5.16xlarge
      - ml.g5.24xlarge
      - ml.g5.48xlarge
      - ml.g6.8xlarge
      - ml.g6.12xlarge
      - ml.g6.16xlarge
      - ml.g6.24xlarge
      - ml.g6.48xlarge
      - ml.g6e.8xlarge
      - ml.g6e.12xlarge
      - ml.g6e.16xlarge
      - ml.g6e.24xlarge
      - ml.g6e.48xlarge
      - ml.gr6.8xlarge
      - ml.i3en.large
      - ml.i3en.xlarge
      - ml.i3en.2xlarge
      - ml.i3en.3xlarge
      - ml.i3en.6xlarge
      - ml.i3en.12xlarge
      - ml.i3en.24xlarge
      - ml.m7i.large
      - ml.m7i.xlarge
      - ml.m7i.2xlarge
      - ml.m7i.4xlarge
      - ml.m7i.8xlarge
      - ml.m7i.12xlarge
      - ml.m7i.16xlarge
      - ml.m7i.24xlarge
      - ml.m7i.48xlarge
      - ml.p4d.24xlarge
      - ml.p4de.24xlarge
      - ml.p5.48xlarge
      - ml.p5e.48xlarge
      - ml.p5en.48xlarge
      - ml.r7i.large
      - ml.r7i.xlarge
      - ml.r7i.2xlarge
      - ml.r7i.4xlarge
      - ml.r7i.8xlarge
      - ml.r7i.12xlarge
      - ml.r7i.16xlarge
      - ml.r7i.24xlarge
      - ml.r7i.48xlarge
      - ml.trn1.32xlarge
      - ml.trn1n.32xlarge
      - ml.trn2.48xlarge
  tolerations:
    # Any taint keyed CriticalAddonsOnly, regardless of value or effect.
    - key: CriticalAddonsOnly
      operator: Exists
    # Any NoSchedule taint keyed aws.amazon.com/efa.
    - effect: NoSchedule
      key: aws.amazon.com/efa
      operator: Exists
    # The node-health-status=Unschedulable:NoSchedule taint.
    - key: sagemaker.amazonaws.com/node-health-status
      operator: "Equal"
      value: "Unschedulable"
      effect: "NoSchedule"
    # The RestrictedNode=Worker:NoSchedule taint.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  # Only select nodes labelled as belonging to a Restricted instance group.
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"

# Overrides scoped to the nvidia-device-plugin dependency.
nvidia-device-plugin:
  # devicePlugin.enabled is the flag referenced by the dependency's
  # "condition" in Chart.yaml.
  devicePlugin:
    enabled: true
  allowDefaultNamespace: true
  namespaceOverride: "kube-system"
  # Require the node's instance-type label to be one of the listed types.
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: node.kubernetes.io/instance-type
                operator: In
                values:
                  - ml.g5.xlarge
                  - ml.g5.2xlarge
                  - ml.g5.4xlarge
                  - ml.g5.8xlarge
                  - ml.g5.12xlarge
                  - ml.g5.16xlarge
                  - ml.g5.24xlarge
                  - ml.g5.48xlarge
                  - ml.g6.xlarge
                  - ml.g6.2xlarge
                  - ml.g6.4xlarge
                  - ml.g6.8xlarge
                  - ml.g6.12xlarge
                  - ml.g6.16xlarge
                  - ml.g6.24xlarge
                  - ml.g6.48xlarge
                  - ml.g6e.xlarge
                  - ml.g6e.2xlarge
                  - ml.g6e.4xlarge
                  - ml.g6e.8xlarge
                  - ml.g6e.12xlarge
                  - ml.g6e.16xlarge
                  - ml.g6e.24xlarge
                  - ml.g6e.48xlarge
                  - ml.gr6.4xlarge
                  - ml.gr6.8xlarge
                  - ml.p4d.24xlarge
                  - ml.p4de.24xlarge
                  - ml.p5.48xlarge
                  - ml.p5e.48xlarge
                  - ml.p5en.48xlarge
  tolerations:
    # Any NoSchedule taint keyed nvidia.com/gpu.
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    # The node-health-status=Unschedulable:NoSchedule taint.
    - key: sagemaker.amazonaws.com/node-health-status
      operator: Equal
      value: Unschedulable
      effect: NoSchedule
    # The RestrictedNode=Worker:NoSchedule taint.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  # Only select nodes labelled as belonging to a Restricted instance group.
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"
Loading
Loading