aws · chnnmz · May 12, 2025 · Jun 9, 2025 · Jun 9, 2025 · Jun 12, 2025
@@ -0,0 +1,3 @@
+HyperPodHelmChartForRIG/charts/*/templates/
+HyperPodHelmChartForRIG/charts/*.tgz
+HyperPodHelmChart/charts/*.tgz
@@ -0,0 +1,26 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
+# HyperPod: keep *.nonrig.yaml / *.nonrig.yml files out of the packaged chart
+*.nonrig.yaml
+*.nonrig.yml
@@ -0,0 +1,33 @@
+apiVersion: v2
+name: hyperpod-helm-chart-for-rig
+# NOTE(review): this is still the `helm create` scaffold description - consider
+# replacing it with a summary of what this chart actually installs.
+description: A Helm chart for Kubernetes
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "1.16.0"
+
+# Subcharts this chart depends on:
+#   - nvidia-device-plugin is pulled from the upstream NVIDIA chart repository and
+#     is gated by the values path given in `condition`.
+#   - coredns is resolved from the local charts/coredns directory.
+dependencies:
+  - name: nvidia-device-plugin
+    version: "0.16.1"
+    repository: https://nvidia.github.io/k8s-device-plugin
+    condition: nvidia-device-plugin.devicePlugin.enabled
+  - name: coredns
+    version: "0.1.0"
+    repository: "file://charts/coredns"
@@ -0,0 +1,205 @@
+# OVERRIDE values for HyperPodHelmChart for RIG.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# value_safe_name of dependencies - see install_rig_dependencies.sh
+#
+# Note:
+#
+# The format is `add-on: {keys: ...}`.
+# Helm will AUTOMATICALLY SCOPE the add-on values when resolving each dependency.
+# In other words, what is exposed to each dependency template is
+# {{ .Values.tolerations }}, NOT {{ .Values.add-on.tolerations }}.
+
+
+# This is a special case. We will not use values.yaml to override the values for this add-on.
+# It will still be rendered by Helm using special logic.
+# See install_rig_dependencies.yaml
+#aws-cni:
+
+
+# coredns overrides: schedule only onto nodes labelled with the Restricted
+# instance-group type, and tolerate the control-plane, CriticalAddonsOnly and
+# RestrictedNode=Worker taints.
+coredns:
+  nodeSelector:
+    sagemaker.amazonaws.com/instance-group-type: Restricted
+  tolerations:
+    - key: node-role.kubernetes.io/control-plane
+      effect: NoSchedule
+    - key: CriticalAddonsOnly
+      operator: Exists
+    - key: sagemaker.amazonaws.com/RestrictedNode
+      operator: Equal
+      value: Worker
+      effect: NoSchedule
+
+# mpi-operator overrides: schedule only onto nodes labelled with the Restricted
+# instance-group type, and tolerate the node-health-status=Unschedulable and
+# RestrictedNode=Worker taints.
+mpi-operator:
+  nodeSelector:
+    sagemaker.amazonaws.com/instance-group-type: Restricted
+  tolerations:
+    - key: sagemaker.amazonaws.com/node-health-status
+      operator: Equal
+      value: Unschedulable
+      effect: NoSchedule
+    - key: sagemaker.amazonaws.com/RestrictedNode
+      operator: Equal
+      value: Worker
+      effect: NoSchedule
+
+# neuron-device-plugin overrides: schedule only onto nodes labelled with the
+# Restricted instance-group type, and tolerate the CriticalAddonsOnly,
+# aws.amazon.com/neuron, node-health-status=Unschedulable and
+# RestrictedNode=Worker taints.
+neuron-device-plugin:
+  nodeSelector:
+    sagemaker.amazonaws.com/instance-group-type: Restricted
+  tolerations:
+    - key: CriticalAddonsOnly
+      operator: Exists
+    - key: aws.amazon.com/neuron
+      operator: Exists
+      effect: NoSchedule
+    - key: sagemaker.amazonaws.com/node-health-status
+      operator: Equal
+      value: Unschedulable
+      effect: NoSchedule
+    - key: sagemaker.amazonaws.com/RestrictedNode
+      operator: Equal
+      value: Worker
+      effect: NoSchedule
+
+# training-operators overrides: schedule only onto nodes labelled with the
+# Restricted instance-group type, and tolerate the RestrictedNode=Worker taint.
+training-operators:
+  nodeSelector:
+    sagemaker.amazonaws.com/instance-group-type: Restricted
+  tolerations:
+    - key: sagemaker.amazonaws.com/RestrictedNode
+      operator: Equal
+      value: Worker
+      effect: NoSchedule
+
+# Overrides for externally-maintained Helm charts
+# aws-efa-k8s-device-plugin overrides: enable the device plugin, schedule only
+# onto nodes labelled with the Restricted instance-group type, tolerate the
+# CriticalAddonsOnly, aws.amazon.com/efa, node-health-status=Unschedulable and
+# RestrictedNode=Worker taints, and list the ml.* instance types passed to the
+# chart as supportedInstanceLabels.
+aws-efa-k8s-device-plugin:
+  devicePlugin:
+    enabled: true
+  nodeSelector:
+    sagemaker.amazonaws.com/instance-group-type: Restricted
+  tolerations:
+    - key: CriticalAddonsOnly
+      operator: Exists
+    - key: aws.amazon.com/efa
+      operator: Exists
+      effect: NoSchedule
+    - key: sagemaker.amazonaws.com/node-health-status
+      operator: Equal
+      value: Unschedulable
+      effect: NoSchedule
+    - key: sagemaker.amazonaws.com/RestrictedNode
+      operator: Equal
+      value: Worker
+      effect: NoSchedule
+  supportedInstanceLabels:
+    values:
+      # c5n
+      - ml.c5n.9xlarge
+      - ml.c5n.18xlarge
+      # g5
+      - ml.g5.8xlarge
+      - ml.g5.12xlarge
+      - ml.g5.16xlarge
+      - ml.g5.24xlarge
+      - ml.g5.48xlarge
+      # g6
+      - ml.g6.8xlarge
+      - ml.g6.12xlarge
+      - ml.g6.16xlarge
+      - ml.g6.24xlarge
+      - ml.g6.48xlarge
+      # g6e
+      - ml.g6e.8xlarge
+      - ml.g6e.12xlarge
+      - ml.g6e.16xlarge
+      - ml.g6e.24xlarge
+      - ml.g6e.48xlarge
+      # gr6
+      - ml.gr6.8xlarge
+      # i3en
+      - ml.i3en.large
+      - ml.i3en.xlarge
+      - ml.i3en.2xlarge
+      - ml.i3en.3xlarge
+      - ml.i3en.6xlarge
+      - ml.i3en.12xlarge
+      - ml.i3en.24xlarge
+      # m7i
+      - ml.m7i.large
+      - ml.m7i.xlarge
+      - ml.m7i.2xlarge
+      - ml.m7i.4xlarge
+      - ml.m7i.8xlarge
+      - ml.m7i.12xlarge
+      - ml.m7i.16xlarge
+      - ml.m7i.24xlarge
+      - ml.m7i.48xlarge
+      # p4d / p4de / p5 / p5e / p5en
+      - ml.p4d.24xlarge
+      - ml.p4de.24xlarge
+      - ml.p5.48xlarge
+      - ml.p5e.48xlarge
+      - ml.p5en.48xlarge
+      # r7i
+      - ml.r7i.large
+      - ml.r7i.xlarge
+      - ml.r7i.2xlarge
+      - ml.r7i.4xlarge
+      - ml.r7i.8xlarge
+      - ml.r7i.12xlarge
+      - ml.r7i.16xlarge
+      - ml.r7i.24xlarge
+      - ml.r7i.48xlarge
+      # trn1 / trn1n / trn2
+      - ml.trn1.32xlarge
+      - ml.trn1n.32xlarge
+      - ml.trn2.48xlarge
+
+# nvidia-device-plugin overrides (upstream NVIDIA chart).
+#   - devicePlugin.enabled is the values path referenced by the `condition`
+#     of the nvidia-device-plugin dependency in Chart.yaml.
+#   - The plugin is placed in kube-system via namespaceOverride.
+#   - Pods must land on a node whose instance type is in the list below AND
+#     that carries the Restricted instance-group-type label.
+#   - Tolerates the nvidia.com/gpu, node-health-status=Unschedulable and
+#     RestrictedNode=Worker taints.
+nvidia-device-plugin:
+  devicePlugin:
+    enabled: true
+  allowDefaultNamespace: true
+  namespaceOverride: "kube-system"
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: node.kubernetes.io/instance-type
+                operator: In
+                # Membership list; entry order has no effect on matching.
+                values:
+                  - ml.g5.xlarge
+                  - ml.g5.2xlarge
+                  - ml.g5.4xlarge
+                  - ml.g5.8xlarge
+                  - ml.g5.12xlarge
+                  - ml.g5.16xlarge
+                  - ml.g5.24xlarge
+                  - ml.g5.48xlarge
+                  - ml.g6.xlarge
+                  - ml.g6.2xlarge
+                  - ml.g6.4xlarge
+                  - ml.g6.8xlarge
+                  - ml.g6.12xlarge
+                  - ml.g6.16xlarge
+                  - ml.g6.24xlarge
+                  - ml.g6.48xlarge
+                  - ml.g6e.xlarge
+                  - ml.g6e.2xlarge
+                  - ml.g6e.4xlarge
+                  - ml.g6e.8xlarge
+                  - ml.g6e.12xlarge
+                  - ml.g6e.16xlarge
+                  - ml.g6e.24xlarge
+                  - ml.g6e.48xlarge
+                  - ml.gr6.4xlarge
+                  - ml.gr6.8xlarge
+                  - ml.p4d.24xlarge
+                  - ml.p4de.24xlarge
+                  - ml.p5.48xlarge
+                  - ml.p5e.48xlarge
+                  - ml.p5en.48xlarge
+  tolerations:
+    - key: nvidia.com/gpu
+      operator: Exists
+      effect: NoSchedule
+    - key: sagemaker.amazonaws.com/node-health-status
+      operator: Equal
+      value: Unschedulable
+      effect: NoSchedule
+    - key: "sagemaker.amazonaws.com/RestrictedNode"
+      operator: Equal
+      value: "Worker"
+      effect: NoSchedule
+  nodeSelector:
+    "sagemaker.amazonaws.com/instance-group-type": "Restricted"