Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
9ef081e
Add docker files to PT2.9 training
Oct 22, 2025
a43b05f
removed the pins and updated versions
Oct 22, 2025
ead5f52
fixed the pins in cpu file as well
Oct 22, 2025
7eff279
Modified the Buildspec files and toml file
Oct 22, 2025
c61c52f
Removed fastai temporarily
Oct 22, 2025
f7c6c44
rebuilding after pinning opencv-python
Oct 22, 2025
6e6238c
rebuild with updated base image
Oct 23, 2025
48f5832
corrected base image and few typos
Oct 27, 2025
49dd5c1
adding additional dependency for TE 2.8
Oct 27, 2025
793b873
Merge branch 'aws:master' into pt2.9-currency
DevakiBolleneni Nov 5, 2025
d2e1f02
Enable efa log and modify the license file
Nov 7, 2025
29f9e55
Modify the license file
Nov 7, 2025
ccfeb0a
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 7, 2025
cfbc16f
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 7, 2025
e43aafc
Add pt2.9 ec2 test file
Nov 7, 2025
f2a7df8
fix typo and enable host networking
Nov 7, 2025
4d4fc07
fix formatting and skip test_fused_attn.py
Nov 7, 2025
bd23f19
Fix formatting in common_cases.py
Nov 8, 2025
da89fb4
Fix EFA NCCL failure
Nov 10, 2025
26484e2
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 10, 2025
14f3035
Fix EFA NCCL failure
Nov 10, 2025
b8ad28b
Fix the script to detect actual network interface
Nov 10, 2025
ea0920f
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 12, 2025
399a043
update prbase image and revert back the NCCL changes
Nov 12, 2025
76c2cd4
modify the ofi-nccl path
Nov 13, 2025
eb67a5b
build sm image
Nov 13, 2025
fc61fdb
add fastai and update TE version
Nov 13, 2025
595f5b4
rebuild ec2 image with fastai
Nov 13, 2025
edf9871
rebuild sm image and test
Nov 14, 2025
64492d3
update base image and flashattention wheel
Nov 14, 2025
a5bb85d
rebuild sm image with enabled security tests
Nov 14, 2025
d52590c
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 14, 2025
606d4fd
rebuild ec2 image
Nov 14, 2025
d22ad48
rerun jobs after deleting AML2_CPU_ARM64_US_EAST_1
Nov 15, 2025
85e9903
rerun jobs after disabling safety check test and ecr scan allowlist
Nov 15, 2025
a9bb637
update MAX_JOBS and try rebuild
Nov 17, 2025
155cae7
rebuild ec2 image with safety check test and ecr scan allowlist
Nov 17, 2025
719a55f
rebuild ec2 image and run tests
Nov 17, 2025
07b1b26
rebuild sm image and run tests
Nov 17, 2025
6568833
skip smppy tests and rerun
Nov 17, 2025
1b6ed8d
rerun after enabling safety check test and ecr scan allowlist
Nov 18, 2025
ef9f107
rebuild ec2 image
Nov 18, 2025
55c5593
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 18, 2025
af6e0d6
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 18, 2025
a450ada
fix formatting
Nov 18, 2025
b8d42f9
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 19, 2025
36f8594
Rerun SM tests
Nov 19, 2025
bc54719
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 19, 2025
15efc9b
Revert testEFA changes and run
Nov 19, 2025
6d14366
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 19, 2025
a18c234
Revert toml file
Nov 19, 2025
ebd0f7b
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 19, 2025
c9ad9e7
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 19, 2025
06a8275
Merge branch 'master' into pt2.9-currency
DevakiBolleneni Nov 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions pytorch/training/buildspec-2-9-ec2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Top-level settings for this buildspec. The anchors (&NAME) declared here are
# reused through aliases (*NAME) in the sections below.
# NOTE(review): the <set-$...-in-environment> values look like placeholders that
# get filled in from the environment — confirm against the buildspec loader.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.9.0
# Quoted so the value stays the string "2.9" instead of being read as a float.
short_version: &SHORT_VERSION "2.9"
arch_type: x86
# autopatch_build: "True"

# Repository settings shared by the image entries in this file. The nested
# mapping is anchored as TRAINING_REPOSITORY so that each image can pull it in
# with a `<<:` merge key; the anchor therefore has to sit on a mapping, which
# is why the keys below are nested under training_repository.
# NOTE(review): `!join` is an application-specific tag; presumably the loader
# concatenates the listed items into one string — confirm against the loader.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    # Repository built from account_id; its name carries a "pr-" prefix.
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    # Repository built from prod_account_id; its name has no "pr-" prefix.
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Named build artifacts, anchored as TRAINING_CONTEXT and merged into each
# image's own `context:` mapping. Every entry pairs a `source` path with a
# `target` file name; each pair must be nested under its own key, otherwise
# `source`/`target` repeat as duplicate keys in a single mapping.
# NOTE(review): presumably `source` is resolved relative to this buildspec's
# directory and copied into the Docker build context as `target` — confirm.
context:
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py
    setup_oss_compliance:
      source: ../../scripts/setup_oss_compliance.sh
      target: setup_oss_compliance.sh

# Image definitions for the EC2 variants (tags end in "-ec2", target is ec2).
# Each entry merges the shared TRAINING_REPOSITORY mapping and then adds its
# own settings. Anchors such as DEVICE_TYPE are re-declared per image; an
# alias always resolves to the nearest preceding declaration, so the tag and
# docker_file of each image use that image's own values.
# NOTE(review): the meaning of `build: false` and the unit of
# image_size_baseline are defined by the consuming tool — confirm there.
images:
  BuildEC2CPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    image_size_baseline: 7200
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
    # skip_build: "False"
    # CPU Dockerfile path has no CUDA-version directory component.
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
  BuildEC2GPUPTTrainPy3cu130DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: 28000
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    cuda_version: &CUDA_VERSION cu130
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    # skip_build: "False"
    # GPU Dockerfile path additionally includes the CUDA version directory.
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                         *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
75 changes: 75 additions & 0 deletions pytorch/training/buildspec-2-9-sm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Top-level settings for this buildspec. The anchors (&NAME) declared here are
# reused through aliases (*NAME) in the sections below.
# NOTE(review): the <set-$...-in-environment> values look like placeholders that
# get filled in from the environment — confirm against the buildspec loader.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK pytorch
version: &VERSION 2.9.0
# Quoted so the value stays the string "2.9" instead of being read as a float.
short_version: &SHORT_VERSION "2.9"
arch_type: x86
# autopatch_build: "True"

# Repository settings shared by the image entries in this file. The nested
# mapping is anchored as TRAINING_REPOSITORY so that each image can pull it in
# with a `<<:` merge key; the anchor therefore has to sit on a mapping, which
# is why the keys below are nested under training_repository.
# NOTE(review): `!join` is an application-specific tag; presumably the loader
# concatenates the listed items into one string — confirm against the loader.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    # Repository built from account_id; its name carries a "pr-" prefix.
    repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    # Repository built from prod_account_id; its name has no "pr-" prefix.
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ]
    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Named build artifacts, anchored as TRAINING_CONTEXT and merged into each
# image's own `context:` mapping. Every entry pairs a `source` path with a
# `target` file name; each pair must be nested under its own key, otherwise
# `source`/`target` repeat as duplicate keys in a single mapping.
# NOTE(review): presumably `source` is resolved relative to this buildspec's
# directory and copied into the Docker build context as `target` — confirm.
context:
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py
    setup_oss_compliance:
      source: ../../scripts/setup_oss_compliance.sh
      target: setup_oss_compliance.sh

# Image definitions for the SageMaker variants (tags end in "-sagemaker",
# target is sagemaker). Each entry merges the shared TRAINING_REPOSITORY
# mapping and then adds its own settings. Anchors such as DEVICE_TYPE are
# re-declared per image; an alias always resolves to the nearest preceding
# declaration, so the tag and docker_file of each image use its own values.
# NOTE(review): the meaning of `build: false` and the unit of
# image_size_baseline are defined by the consuming tool — confirm there.
images:
  BuildSageMakerCPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_CPU_TRAINING_PY3 false
    image_size_baseline: 7200
    device_type: &DEVICE_TYPE cpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # skip_build: "False"
    # CPU Dockerfile path has no CUDA-version directory component.
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
  BuildSageMakerGPUPTTrainPy3DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: 28000
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    cuda_version: &CUDA_VERSION cu130
    os_version: &OS_VERSION ubuntu22.04
    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
    # skip_build: "False"
    # GPU Dockerfile path additionally includes the CUDA version directory.
    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
                         *DEVICE_TYPE ]
    target: sagemaker
    context:
      <<: *TRAINING_CONTEXT
2 changes: 1 addition & 1 deletion pytorch/training/buildspec.yml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# The two lines below are the two sides of a one-line diff hunk (-1 +1), not
# two keys of one document.
# Old side (the 1 deletion):
buildspec_pointer: buildspec-2-8-sm.yml
# New side (the 1 addition):
buildspec_pointer: buildspec-2-9-ec2.yml
Loading