From 9ef081e3dd670ab858a959cf422b3fac7459922a Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 22 Oct 2025 00:24:20 -0700 Subject: [PATCH 01/40] Add docker files to PT2.9 training --- .../training/docker/2.9/py3/Dockerfile.cpu | 313 ++++++++++++++++++ .../docker/2.9/py3/cu130/Dockerfile.gpu | 285 ++++++++++++++++ 2 files changed, 598 insertions(+) create mode 100644 pytorch/training/docker/2.9/py3/Dockerfile.cpu create mode 100644 pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu diff --git a/pytorch/training/docker/2.9/py3/Dockerfile.cpu b/pytorch/training/docker/2.9/py3/Dockerfile.cpu new file mode 100644 index 000000000000..2b151e7b3cd7 --- /dev/null +++ b/pytorch/training/docker/2.9/py3/Dockerfile.cpu @@ -0,0 +1,313 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.10 +ARG PYTHON_SHORT_VERSION=3.12 +ARG PYTORCH_VERSION=2.9.0 + +ARG OPEN_MPI_VERSION=4.1.7 + +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHDATA_VERSION=0.11.0 +ARG TORCHAUDIO_VERSION=2.9.0 +ARG TORCHVISION_VERSION=0.24.0 + +FROM ubuntu:22.04 AS base_image + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM base_image AS common + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION +ARG PYTORCH_VERSION +ARG TORCHTNT_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION + +ARG OPEN_MPI_VERSION + +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + jq \ + libcurl4-openssl-dev \ + libglib2.0-0 \ + libgl1-mesa-glx \ + libsm6 \ + libssl-dev \ + libxext6 \ + libxrender-dev \ + zlib1g-dev \ + unzip \ + vim \ + wget \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + libffi-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Open MPI +RUN wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && gunzip -c openmpi-${OPEN_MPI_VERSION}.tar.gz | tar xf - \ + && cd openmpi-${OPEN_MPI_VERSION} \ + && ./configure --prefix=/home/.openmpi \ + && make all install \ + && cd .. \ + && rm openmpi-${OPEN_MPI_VERSION}.tar.gz \ + && rm -rf openmpi-${OPEN_MPI_VERSION} + +# The ENV variables declared below are changed in the previous section +# Grouping these ENV variables in the first section causes +# ompi_info to fail. This is only observed in CPU containers +ENV PATH="/home/.openmpi/bin:${PATH}" +ENV LD_LIBRARY_PATH="/home/.openmpi/lib:${LD_LIBRARY_PATH}" +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. 
\ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generate a .python_history file in the root directory leads sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add pip systemlink to pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install common packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + boto3 \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3>=2.5.0" \ + "awscli" \ + opencv-python==4.11.0.86 \ + mpi4py \ + jinja2>=3.1.6 \ + tornado>=6.5.1 + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cpu \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + s3torchconnector \ + fastai \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thic, blis + spacy \ + # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) + thinc==8.3.4 \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +WORKDIR / + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] + +# Starts framework +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# 
| || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +WORKDIR / + +# Install SM packages +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=2" \ + sagemaker-experiments \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + scikit-learn \ + seaborn \ + shap \ + cloudpickle + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh + +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] + diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu new file mode 100644 index 000000000000..f3435553821b --- /dev/null +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -0,0 +1,285 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.10 +ARG PYTHON_SHORT_VERSION=3.12 +ARG PYTORCH_VERSION=2.9.0 +ARG TORCHTNT_VERSION=0.2.4 +ARG TORCHAUDIO_VERSION=2.9.0 +ARG TORCHVISION_VERSION=0.24.0 +ARG TORCHDATA_VERSION=0.11.0 + +ARG GDRCOPY_VERSION=2.5.1 +ARG TE_VERSION=2.5 +ARG FLASH_ATTN_VERSION=2.8.3 + +################################################################# +# ____ +# / ___| ___ _ __ ___ _ __ ___ ___ _ __ +# | | / _ \| '_ ` _ \| '_ ` _ \ / _ \| '_ \ +# | |___ (_) | | | | | | | | | | | (_) | | | | +# \____|\___/|_| |_| |_|_| |_| |_|\___/|_| |_| +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM 669063966089.dkr.ecr.us-west-2.amazonaws.com/pr-base:13.0.0-gpu-py312-cu130-ubuntu22.04-ec2-pr-5394-2025-10-17-00-03-02 AS common +# base has EFA, PYTHON and CUDA 13.0 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTORCH_VERSION +ARG TORCHDATA_VERSION +ARG TORCHAUDIO_VERSION +ARG TORCHVISION_VERSION +ARG TORCHTNT_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION +ARG GDRCOPY_VERSION + +ENV CUDA_HOME="/usr/local/cuda" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" + +# Python won’t try to write .pyc or .pyo files on the import of source modules +# Force stdin, stdout and stderr to be totally unbuffered. 
Good for logging +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --allow-change-held-packages --no-install-recommends \ + libgl1-mesa-glx \ + build-essential \ + ca-certificates \ + zlib1g-dev \ + openssl \ + python3-dev \ + pkg-config \ + check \ + llvm \ + xz-utils \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# Install common conda packages +RUN pip install --no-cache-dir \ + cython \ + cryptography \ + pyOpenSSL \ + pybind11 \ + mkl \ + mkl-include \ + parso \ + typing \ + charset-normalizer \ + packaging \ + PyYAML \ + numpy \ + scipy \ + click \ + psutil \ + ipython \ + ipykernel \ + pillow \ + h5py \ + fsspec \ + "idna>=3.7" \ + "tqdm>=4.66.3" \ + "requests>=2.32.0" \ + "setuptools>=70.0.0" \ + "urllib3>=2.5.0" \ + ninja \ + opencv-python==4.11.0.86 \ + mpi4py \ + jinja2>=3.1.6 \ + tornado>=6.5.1 + +# Install PyTorch +RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cu129 \ + && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ + torchdata==${TORCHDATA_VERSION} \ + triton \ + s3torchconnector \ + fastai \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thic, blis + spacy \ + # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) + thinc==8.3.4 \ + blis \ + numpy \ + && pip uninstall -y dataclasses + +# Install flash attn and NVIDIA transformer engine. 
+# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch + +RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl \ + && pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \ + && rm flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl + +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh + +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +RUN chmod +x /usr/local/bin/bash_telemetry.sh +RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc + +# Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# The test binaries requires cuda driver library which could be found in conda +# So update the linker path to point to it to avoid -Lcuda not found +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ + && cd gdrcopy \ + && sed -ie '13s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ + && CUDA=${CUDA_HOME} make install \ + && rm -rf /tmp/gdrcopy + +# Install common packages used by both EC2 and SageMaker +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl \ + wget \ + git \ + jq \ + emacs \ + vim \ + unzip \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## + +FROM common AS ec2 + +ARG PYTHON + +WORKDIR / + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` 
|/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +################################################################# + +FROM common AS sagemaker + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +ARG PYTHON + +WORKDIR / + +# Install SM packages +RUN pip install --no-cache-dir -U \ + smclarify \ + "sagemaker>=2" \ + sagemaker-experiments \ + sagemaker-pytorch-training \ + sagemaker-training + +# Install extra packages +RUN pip install --no-cache-dir -U \ + bokeh \ + imageio \ + numba \ + pandas \ + plotly \ + shap \ + scikit-learn \ + seaborn \ + cloudpickle + +COPY setup_oss_compliance.sh setup_oss_compliance.sh +RUN bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh + +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh +RUN chmod +x /usr/local/bin/start_with_right_hostname.sh + +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] +CMD ["/bin/bash"] \ No newline at end of file From a43b05fd73cb6bde174de57d20c24b68f3a6d664 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 22 Oct 2025 11:00:36 -0700 Subject: [PATCH 02/40] removed the pins and updated versions --- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index f3435553821b..f413ef09c75b 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -8,7 +8,7 @@ ARG TORCHVISION_VERSION=0.24.0 ARG TORCHDATA_VERSION=0.11.0 ARG GDRCOPY_VERSION=2.5.1 -ARG TE_VERSION=2.5 +ARG TE_VERSION=2.8 ARG FLASH_ATTN_VERSION=2.8.3 ################################################################# @@ -108,7 +108,7 @@ RUN pip install --no-cache-dir \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ ninja \ - opencv-python==4.11.0.86 \ + opencv-python \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 @@ -128,7 +128,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ # requires explicit declaration of spacy, thic, blis spacy \ # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) - thinc==8.3.4 \ + thinc \ blis \ numpy \ && pip uninstall -y dataclasses From ead5f528d31af885bdae55680fd57340ed2df326 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 22 Oct 2025 11:06:23 -0700 Subject: [PATCH 03/40] fixed the pins in cpu file as well --- pytorch/training/docker/2.9/py3/Dockerfile.cpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.9/py3/Dockerfile.cpu b/pytorch/training/docker/2.9/py3/Dockerfile.cpu index 2b151e7b3cd7..6926e029d812 100644 --- a/pytorch/training/docker/2.9/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.9/py3/Dockerfile.cpu @@ -189,7 +189,7 @@ RUN pip install --no-cache-dir \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ "awscli" \ - opencv-python==4.11.0.86 \ + opencv-python \ 
mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 @@ -208,7 +208,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ # requires explicit declaration of spacy, thic, blis spacy \ # pin thinc due to incompatibility with numpy 1.26.4 (sagemaker doesn't support latest numpy) - thinc==8.3.4 \ + thinc \ blis \ numpy \ && pip uninstall -y dataclasses From 7eff279e4a9c6db8e4642049af84462a9218669c Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 22 Oct 2025 11:47:33 -0700 Subject: [PATCH 04/40] Modified the Buildspec files and toml file --- dlc_developer_config.toml | 10 ++-- pytorch/training/buildspec-2-9-ec2.yml | 75 ++++++++++++++++++++++++++ pytorch/training/buildspec-2-9-sm.yml | 75 ++++++++++++++++++++++++++ pytorch/training/buildspec.yml | 2 +- 4 files changed, 156 insertions(+), 6 deletions(-) create mode 100644 pytorch/training/buildspec-2-9-ec2.yml create mode 100644 pytorch/training/buildspec-2-9-sm.yml diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 90c179d30484..2c5eb0f972dc 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -69,13 +69,13 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true @@ -126,7 +126,7 @@ use_scheduler = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec-2-9-ec2.yml b/pytorch/training/buildspec-2-9-ec2.yml new file mode 100644 index 000000000000..befac69e7920 --- /dev/null +++ b/pytorch/training/buildspec-2-9-ec2.yml @@ -0,0 +1,75 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 2.9.0 +short_version: &SHORT_VERSION "2.9" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + setup_oss_compliance: + source: ../../scripts/setup_oss_compliance.sh + target: setup_oss_compliance.sh + +images: + BuildEC2CPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 7200 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT + BuildEC2GPUPTTrainPy3cu129DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 28000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu130 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # skip_build: 
"False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec-2-9-sm.yml b/pytorch/training/buildspec-2-9-sm.yml new file mode 100644 index 000000000000..c4b2f8dcdff2 --- /dev/null +++ b/pytorch/training/buildspec-2-9-sm.yml @@ -0,0 +1,75 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 2.9.0 +short_version: &SHORT_VERSION "2.9" +arch_type: x86 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + setup_oss_compliance: + source: ../../scripts/setup_oss_compliance.sh + target: setup_oss_compliance.sh + +images: + BuildSageMakerCPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_CPU_TRAINING_PY3 false + image_size_baseline: 7200 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT + BuildSageMakerGPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 28000 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu130 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # skip_build: "False" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT diff --git 
a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index e9f328177b4b..e7a0d5614f66 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-8-sm.yml +buildspec_pointer: buildspec-2-9-ec2.yml From c61c52fb7ea69ab2fd05b25410686357ef04b3e6 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 22 Oct 2025 15:14:27 -0700 Subject: [PATCH 05/40] Removed fastai temporarily --- pytorch/training/docker/2.9/py3/Dockerfile.cpu | 1 - pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 1 - 2 files changed, 2 deletions(-) diff --git a/pytorch/training/docker/2.9/py3/Dockerfile.cpu b/pytorch/training/docker/2.9/py3/Dockerfile.cpu index 6926e029d812..801049f415ec 100644 --- a/pytorch/training/docker/2.9/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.9/py3/Dockerfile.cpu @@ -202,7 +202,6 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ - fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index f413ef09c75b..f7f53a73be83 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -122,7 +122,6 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ torchdata==${TORCHDATA_VERSION} \ triton \ s3torchconnector \ - fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis From f7c6c44daa1f582df5d52e8e95379f1795df2897 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 22 Oct 2025 16:40:55 -0700 Subject: [PATCH 06/40] rebuilding after pinning opencv-python --- pytorch/training/docker/2.9/py3/Dockerfile.cpu | 2 +- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.9/py3/Dockerfile.cpu b/pytorch/training/docker/2.9/py3/Dockerfile.cpu index 801049f415ec..7315b490b52a 100644 --- a/pytorch/training/docker/2.9/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.9/py3/Dockerfile.cpu @@ -189,7 +189,7 @@ RUN pip install --no-cache-dir \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ "awscli" \ - opencv-python \ + opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index f7f53a73be83..acf29b76708a 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -108,7 +108,7 @@ RUN pip install --no-cache-dir \ "setuptools>=70.0.0" \ "urllib3>=2.5.0" \ ninja \ - opencv-python \ + opencv-python==4.11.0.86 \ mpi4py \ jinja2>=3.1.6 \ tornado>=6.5.1 From 6e6238ce6b6628024d956ea0e436f1a8e455f68f Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Thu, 23 Oct 2025 13:22:16 -0700 Subject: [PATCH 07/40] rebuild with updated base image --- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index acf29b76708a..aa5c03325961 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ 
b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -25,7 +25,7 @@ ARG FLASH_ATTN_VERSION=2.8.3 # |___/ |_| ################################################################# -FROM 669063966089.dkr.ecr.us-west-2.amazonaws.com/pr-base:13.0.0-gpu-py312-cu130-ubuntu22.04-ec2-pr-5394-2025-10-17-00-03-02 AS common +FROM public.ecr.aws/deep-learning-containers/base:13.0.0-gpu-py312-cu130-ubuntu22.04-ec2-v1.0 AS common # base has EFA, PYTHON and CUDA 13.0 LABEL maintainer="Amazon AI" From 48f5832c840ffc6e70f4fe5c697ac93bee21af54 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 27 Oct 2025 11:28:06 -0700 Subject: [PATCH 08/40] corrected base image and few typos --- pytorch/training/buildspec-2-9-ec2.yml | 2 +- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch/training/buildspec-2-9-ec2.yml b/pytorch/training/buildspec-2-9-ec2.yml index befac69e7920..3f2cef8d599f 100644 --- a/pytorch/training/buildspec-2-9-ec2.yml +++ b/pytorch/training/buildspec-2-9-ec2.yml @@ -56,7 +56,7 @@ images: target: ec2 context: <<: *TRAINING_CONTEXT - BuildEC2GPUPTTrainPy3cu129DockerImage: + BuildEC2GPUPTTrainPy3cu130DockerImage: <<: *TRAINING_REPOSITORY build: &PYTORCH_GPU_TRAINING_PY3 false image_size_baseline: 28000 diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index aa5c03325961..abd2f7a70e9d 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -25,7 +25,7 @@ ARG FLASH_ATTN_VERSION=2.8.3 # |___/ |_| ################################################################# -FROM public.ecr.aws/deep-learning-containers/base:13.0.0-gpu-py312-cu130-ubuntu22.04-ec2-v1.0 AS common +FROM public.ecr.aws/deep-learning-containers/base:13.0.0-gpu-py312-ubuntu22.04-ec2 AS common # base has EFA, PYTHON and CUDA 13.0 LABEL maintainer="Amazon AI" @@ -117,7 +117,7 @@ RUN pip install --no-cache-dir \ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ torchvision==${TORCHVISION_VERSION} \ torchaudio==${TORCHAUDIO_VERSION} \ - --index-url https://download.pytorch.org/whl/cu129 \ + --index-url https://download.pytorch.org/whl/cu130 \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ triton \ From 49dd5c183d21beab288403a57a07c0440b970f55 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 27 Oct 2025 13:52:46 -0700 Subject: [PATCH 09/40] adding additional dependency for TE 2.8 --- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index abd2f7a70e9d..c21a5d249f61 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -140,6 +140,8 @@ RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${F && pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \ && rm flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl +RUN pip install --no-cache-dir nvidia-mathdx + # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html RUN pip install --no-cache-dir 
git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation From d2e1f027f96114b747906a416dbdd94ee5de75e5 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Thu, 6 Nov 2025 16:25:24 -0800 Subject: [PATCH 10/40] Enable efa log and modify the license file --- pytorch/training/docker/2.9/py3/Dockerfile.cpu | 2 +- test/dlc_tests/container_tests/bin/efa/testEFA | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch/training/docker/2.9/py3/Dockerfile.cpu b/pytorch/training/docker/2.9/py3/Dockerfile.cpu index 7315b490b52a..34f1f71c786c 100644 --- a/pytorch/training/docker/2.9/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.9/py3/Dockerfile.cpu @@ -212,7 +212,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ numpy \ && pip uninstall -y dataclasses -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.9/license.txt COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 4b676249d816..52f5664625d8 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -89,7 +89,7 @@ check_efa_nccl_all_reduce(){ RETURN_VAL=${PIPESTATUS[0]} # In case, if you would like see logs, uncomment below line - # RESULT=$(cat ${TRAINING_LOG}) + RESULT=$(cat ${TRAINING_LOG}) if [ ${RETURN_VAL} -eq 0 ]; then echo "***************************** check_efa_nccl_all_reduce passed *****************************" From 29f9e55ba7a368fb832714d2935b0d75f61ffccc Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Thu, 6 Nov 2025 16:28:04 -0800 Subject: [PATCH 11/40] Modify the license file --- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index c21a5d249f61..e727351a74d3 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -145,7 +145,7 @@ RUN pip install --no-cache-dir nvidia-mathdx # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation -RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.9/license.txt COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh RUN chmod +x /usr/local/bin/start_cuda_compat.sh From e43aafcf9ba21cccf152d7ba49ae108fa526cba7 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Thu, 6 Nov 2025 22:52:36 -0800 Subject: [PATCH 12/40] Add pt2.9 ec2 test file --- test/dlc_tests/conftest.py | 1 + .../training/test_pytorch_training_2_9.py | 137 ++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index dc6fc2ea624e..7e7522995fca 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -55,6 +55,7 @@ # ECR repo name fixtures # PyTorch "pytorch_training", + "pytorch_training___2__9", 
"pytorch_training___2__8", "pytorch_training___2__7", "pytorch_training___2__6", diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py new file mode 100644 index 000000000000..25ddbcbd9ba4 --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py @@ -0,0 +1,137 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True +) +def test_pytorch_2_8_gpu( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_gpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on multi_gpu + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_heavy_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_HEAVY_GPU_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +@pytest.mark.skipif( + test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), + reason="Skip GPU Heavy tests in PR context unless explicitly enabled", +) +def test_pytorch_2_8_gpu_heavy( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), + (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), + 
] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU Heavy") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") +@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type, region", + common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, + indirect=True, +) +def test_pytorch_2_8_gpu_inductor( + pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training___2__8 + if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): + pytest.skip( + f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" + ) + + test_cases = [ + (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 GPU Inductor") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_cpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) +def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only): + pytorch_training = pytorch_training___2__8 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_bashrc_cpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_telemetry_entrypoint_cpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases += [ + (smclarify_cases.smclarify_metrics_cpu, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.9 CPU") From f2a7df8bb3d38b03dc53d139a676944f0fdf9504 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 7 Nov 2025 01:37:28 -0800 Subject: [PATCH 13/40] fix typo and enable host networking --- .../ec2/pytorch/training/common_cases.py | 9 +++++--- .../training/test_pytorch_training_2_9.py | 22 +++++++++---------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 14a1dc4a0ced..8bf94bfcb2fd 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -259,6 +259,7 @@ def pytorch_gloo(pytorch_training, ec2_connection): container_name="pytorch_gloo", large_shm=True, timeout=1500, + host_network=True, ) @@ -274,6 +275,7 @@ def pytorch_gloo_inductor_gpu(pytorch_training, ec2_connection): container_name="pytorch_gloo_inductor", large_shm=True, timeout=1500, + host_network=True, ) @@ -286,7 +288,7 @@ def pytorch_mpi( """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 0" # input: 
backend, inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo" + ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo", host_network=True ) @@ -296,7 +298,7 @@ def pytorch_mpi_inductor_gpu(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 1" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo_inductor" + ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo_inductor", host_network=True ) @@ -306,7 +308,7 @@ def pytorch_nccl(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_NCCL_CMD} 0" # input: inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, container_name="pytorch_nccl", large_shm=True + ec2_connection, pytorch_training, test_cmd, container_name="pytorch_nccl", large_shm=True, host_network=True ) @@ -321,6 +323,7 @@ def pytorch_nccl_inductor(pytorch_training, ec2_connection): test_cmd, container_name="pytorch_nccl_inductor", large_shm=True, + host_network=True, ) diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py index 25ddbcbd9ba4..7441b4933750 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_2_9.py @@ -15,10 +15,10 @@ @pytest.mark.parametrize( "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True ) -def test_pytorch_2_8_gpu( - pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +def test_pytorch_2_9_gpu( + pytorch_training___2__9, ec2_connection, region, gpu_only, ec2_instance_type ): - pytorch_training = pytorch_training___2__8 + pytorch_training = pytorch_training___2__9 if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): pytest.skip( f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" @@ -64,10 +64,10 @@ def test_pytorch_2_8_gpu( test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), reason="Skip GPU Heavy tests in PR context unless explicitly enabled", ) -def test_pytorch_2_8_gpu_heavy( - pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +def test_pytorch_2_9_gpu_heavy( + pytorch_training___2__9, ec2_connection, region, gpu_only, ec2_instance_type ): - pytorch_training = pytorch_training___2__8 + pytorch_training = pytorch_training___2__9 if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): pytest.skip( f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" @@ -90,10 +90,10 @@ def test_pytorch_2_8_gpu_heavy( common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION, indirect=True, ) -def test_pytorch_2_8_gpu_inductor( - pytorch_training___2__8, ec2_connection, region, gpu_only, ec2_instance_type +def test_pytorch_2_9_gpu_inductor( + pytorch_training___2__9, ec2_connection, region, gpu_only, ec2_instance_type ): - pytorch_training = pytorch_training___2__8 + pytorch_training = pytorch_training___2__9 if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type): pytest.skip( f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}" @@ -114,8 +114,8 @@ def test_pytorch_2_8_gpu_inductor( @pytest.mark.model("N/A") 
@pytest.mark.team("conda") @pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True) -def test_pytorch_2_8_cpu(pytorch_training___2__8, ec2_connection, cpu_only): - pytorch_training = pytorch_training___2__8 +def test_pytorch_2_9_cpu(pytorch_training___2__9, ec2_connection, cpu_only): + pytorch_training = pytorch_training___2__9 test_cases = [ (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), From 4d4fc0794e1f8e35dd2d54b05c493f66a671380e Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 7 Nov 2025 15:58:19 -0800 Subject: [PATCH 14/40] fix formatting and skip test_fused_attn.py --- .../transformerengine/testPTTransformerEngine | 20 +++++++++++++++++++ .../ec2/pytorch/training/common_cases.py | 19 +++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine b/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine index e057b957ab54..490c727c4840 100755 --- a/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine +++ b/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine @@ -36,6 +36,26 @@ elif [ $(version $TE_VERSION) -lt $(version "2.0") ]; then pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py +elif [ $(version $TE_VERSION) -lt $(version "3.0") ]; then + pip install pytest==8.2.1 onnxruntime onnx expecttest + pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py + pytest -v -s $TE_PATH/tests/pytorch/test_recipe.py + pytest -v -s $TE_PATH/tests/pytorch/test_deferred_init.py + # Disabled test due to bug: https://github.com/NVIDIA/TransformerEngine/issues/1165 + # PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py + PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_cuda_graphs.py + pytest -v -s $TE_PATH/tests/pytorch/test_jit.py + # Skip test_fused_attn.py as it doesn't exist in TE 2.8 + # NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py + pytest -v -s $TE_PATH/tests/pytorch/test_fused_rope.py + # Disable onnx test due lack of TE prioritization on onnx: https://github.com/NVIDIA/TransformerEngine/issues/528 + # NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/test_onnx_export.py + pytest -v -s $TE_PATH/tests/pytorch/test_float8tensor.py + pytest -v -s $TE_PATH/tests/pytorch/test_gqa.py + pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py + pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py + pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py + else pip install pytest==8.2.1 onnxruntime onnx expecttest pytest -v -s $TE_PATH/tests/pytorch/test_sanity.py diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 8bf94bfcb2fd..cde73515a4c9 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -288,7 +288,11 @@ def pytorch_mpi( """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 0" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo", host_network=True + ec2_connection, + pytorch_training, + test_cmd, + 
container_name="pytorch_mpi_gloo", + host_network=True ) @@ -298,7 +302,11 @@ def pytorch_mpi_inductor_gpu(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 1" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo_inductor", host_network=True + ec2_connection, + pytorch_training, + test_cmd, + container_name="pytorch_mpi_gloo_inductor", + host_network=True ) @@ -308,7 +316,12 @@ def pytorch_nccl(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_NCCL_CMD} 0" # input: inductor flags execute_ec2_training_test( - ec2_connection, pytorch_training, test_cmd, container_name="pytorch_nccl", large_shm=True, host_network=True + ec2_connection, + pytorch_training, + test_cmd, + container_name="pytorch_nccl", + large_shm=True, + host_network=True ) From bd23f191826717af267b19c9e131801549a7d16f Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 7 Nov 2025 16:08:54 -0800 Subject: [PATCH 15/40] Fix formatting in common_cases.py --- .../ec2/pytorch/training/common_cases.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index cde73515a4c9..e5af63c44e93 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -288,11 +288,11 @@ def pytorch_mpi( """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 0" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, - pytorch_training, - test_cmd, - container_name="pytorch_mpi_gloo", - host_network=True + ec2_connection, + pytorch_training, + test_cmd, + container_name="pytorch_mpi_gloo", + host_network=True, ) @@ -302,11 +302,11 @@ def pytorch_mpi_inductor_gpu(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 1" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, - pytorch_training, - test_cmd, - container_name="pytorch_mpi_gloo_inductor", - host_network=True + ec2_connection, + pytorch_training, + test_cmd, + container_name="pytorch_mpi_gloo_inductor", + host_network=True, ) @@ -316,12 +316,12 @@ def pytorch_nccl(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_NCCL_CMD} 0" # input: inductor flags execute_ec2_training_test( - ec2_connection, - pytorch_training, - test_cmd, - container_name="pytorch_nccl", - large_shm=True, - host_network=True + ec2_connection, + pytorch_training, + test_cmd, + container_name="pytorch_nccl", + large_shm=True, + host_network=True, ) From da89fb486a7935f867f838511e2f7c9c042889aa Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 10 Nov 2025 08:17:42 -0800 Subject: [PATCH 16/40] Fix EFA NCCL failure --- test/dlc_tests/container_tests/bin/efa/testEFA | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 52f5664625d8..5bf10182f7a6 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -67,6 +67,8 @@ check_efa_nccl_all_reduce_performance(){ check_efa_nccl_all_reduce(){ echo "Running all_reduce_perf test" + export NCCL_SOCKET_IFNAME="ens*,eth*,eno*" + if [[ ${IS_IPV6} == "True" ]]; then echo "Running all_reduce_perf test with IPv6: using IPv6 mode with NCCL_SOCKET_FAMILY=AF_INET6" mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile 
$NUM_HOSTS_FILE \ From 14f3035b0f39daf92861725931ac4724039835a8 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 10 Nov 2025 12:36:30 -0800 Subject: [PATCH 17/40] Fix EFA NCCL failure --- test/dlc_tests/container_tests/bin/efa/testEFA | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 5bf10182f7a6..0c6ac8e3ef44 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -74,7 +74,7 @@ check_efa_nccl_all_reduce(){ mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ - -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + -x NCCL_SOCKET_IFNAME="ens*,eth*,eno*" --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ -x NCCL_SOCKET_FAMILY=AF_INET6 \ /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" else @@ -85,7 +85,7 @@ check_efa_nccl_all_reduce(){ mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ - -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + -x NCCL_SOCKET_IFNAME="ens*,eth*,eno*" --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" fi From b8ad28b02bf0c931009b47ebf194cd25f04fd9da Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 10 Nov 2025 14:32:10 -0800 Subject: [PATCH 18/40] Fix the script to detect actual network interface --- test/dlc_tests/container_tests/bin/efa/testEFA | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 0c6ac8e3ef44..4c4627b554ab 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -67,14 +67,15 @@ check_efa_nccl_all_reduce_performance(){ check_efa_nccl_all_reduce(){ echo "Running all_reduce_perf test" - export NCCL_SOCKET_IFNAME="ens*,eth*,eno*" + iface=$(ip -o -4 route show to default | awk '{print $5}') + export NCCL_SOCKET_IFNAME="$iface" if [[ ${IS_IPV6} == "True" ]]; then echo "Running all_reduce_perf test with IPv6: using IPv6 mode with NCCL_SOCKET_FAMILY=AF_INET6" mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ - -x NCCL_SOCKET_IFNAME="ens*,eth*,eno*" --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + -x NCCL_SOCKET_IFNAME="$iface" --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ -x NCCL_SOCKET_FAMILY=AF_INET6 \ /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" else @@ -85,7 +86,7 @@ check_efa_nccl_all_reduce(){ mpirun -x FI_PROVIDER="efa" -n $NODES -N 
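The change above (reverted again by a later commit in this series) pins NCCL_SOCKET_IFNAME to whichever interface carries the instance's IPv4 default route instead of a wildcard list, using `ip -o -4 route show to default`. For reference, a minimal Python sketch of the same lookup, assuming the standard iproute2 "default via <gw> dev <iface> ..." output format:

import shutil
import subprocess

def default_route_interface() -> str:
    """Return the interface carrying the IPv4 default route (e.g. ens5 on EC2)."""
    if shutil.which("ip") is None:
        raise RuntimeError("iproute2 `ip` binary not found")
    route = subprocess.run(
        ["ip", "-o", "-4", "route", "show", "to", "default"],
        check=True, capture_output=True, text=True,
    ).stdout.split()
    # Typical output: "default via 172.31.0.1 dev ens5 proto dhcp ..."
    return route[route.index("dev") + 1]

# Exporting NCCL_SOCKET_IFNAME from this value mirrors the shell change above.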
$GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ - -x NCCL_SOCKET_IFNAME="ens*,eth*,eno*" --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + -x NCCL_SOCKET_IFNAME="$iface" --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" fi From 399a043db57291a4147cf72d53eb36d1da13718b Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 12 Nov 2025 13:04:13 -0800 Subject: [PATCH 19/40] update prbase image and revert back the NCCL changes --- .../docker/2.9/py3/cu130/Dockerfile.gpu | 2 +- .../dlc_tests/container_tests/bin/efa/testEFA | 7 ++---- .../ec2/pytorch/training/common_cases.py | 22 +++---------------- 3 files changed, 6 insertions(+), 25 deletions(-) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index e727351a74d3..36fc45b14885 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -25,7 +25,7 @@ ARG FLASH_ATTN_VERSION=2.8.3 # |___/ |_| ################################################################# -FROM public.ecr.aws/deep-learning-containers/base:13.0.0-gpu-py312-ubuntu22.04-ec2 AS common +FROM 669063966089.dkr.ecr.us-west-2.amazonaws.com/pr-base:13.0.0-gpu-py312-cu130-ubuntu22.04-ec2-pr-5468-2025-11-12-00-24-28 AS common # base has EFA, PYTHON and CUDA 13.0 LABEL maintainer="Amazon AI" diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 4c4627b554ab..52f5664625d8 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -67,15 +67,12 @@ check_efa_nccl_all_reduce_performance(){ check_efa_nccl_all_reduce(){ echo "Running all_reduce_perf test" - iface=$(ip -o -4 route show to default | awk '{print $5}') - export NCCL_SOCKET_IFNAME="$iface" - if [[ ${IS_IPV6} == "True" ]]; then echo "Running all_reduce_perf test with IPv6: using IPv6 mode with NCCL_SOCKET_FAMILY=AF_INET6" mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ - -x NCCL_SOCKET_IFNAME="$iface" --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ -x NCCL_SOCKET_FAMILY=AF_INET6 \ /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" else @@ -86,7 +83,7 @@ check_efa_nccl_all_reduce(){ mpirun -x FI_PROVIDER="efa" -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ - -x NCCL_SOCKET_IFNAME="$iface" --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ + -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" fi diff --git 
a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index e5af63c44e93..14a1dc4a0ced 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -259,7 +259,6 @@ def pytorch_gloo(pytorch_training, ec2_connection): container_name="pytorch_gloo", large_shm=True, timeout=1500, - host_network=True, ) @@ -275,7 +274,6 @@ def pytorch_gloo_inductor_gpu(pytorch_training, ec2_connection): container_name="pytorch_gloo_inductor", large_shm=True, timeout=1500, - host_network=True, ) @@ -288,11 +286,7 @@ def pytorch_mpi( """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 0" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, - pytorch_training, - test_cmd, - container_name="pytorch_mpi_gloo", - host_network=True, + ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo" ) @@ -302,11 +296,7 @@ def pytorch_mpi_inductor_gpu(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_GLOO_MPI_CMD} mpi 1" # input: backend, inductor flags execute_ec2_training_test( - ec2_connection, - pytorch_training, - test_cmd, - container_name="pytorch_mpi_gloo_inductor", - host_network=True, + ec2_connection, pytorch_training, test_cmd, container_name="pytorch_mpi_gloo_inductor" ) @@ -316,12 +306,7 @@ def pytorch_nccl(pytorch_training, ec2_connection): """ test_cmd = f"{PT_COMMON_NCCL_CMD} 0" # input: inductor flags execute_ec2_training_test( - ec2_connection, - pytorch_training, - test_cmd, - container_name="pytorch_nccl", - large_shm=True, - host_network=True, + ec2_connection, pytorch_training, test_cmd, container_name="pytorch_nccl", large_shm=True ) @@ -336,7 +321,6 @@ def pytorch_nccl_inductor(pytorch_training, ec2_connection): test_cmd, container_name="pytorch_nccl_inductor", large_shm=True, - host_network=True, ) From 76c2cd463f79602980ec6885d812e98e3a6a59b9 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 12 Nov 2025 16:00:37 -0800 Subject: [PATCH 20/40] modify the ofi-nccl path --- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index 36fc45b14885..42ffac19b9f8 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -76,7 +76,7 @@ RUN apt-get update \ && apt-get clean ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" # Python Path ENV PATH="/usr/local/bin:${PATH}" From eb67a5bd168e2ea11d20564c3fff149907164235 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 12 Nov 2025 23:45:51 -0800 Subject: [PATCH 21/40] build sm image --- dlc_developer_config.toml | 10 +++++----- pytorch/training/buildspec.yml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index c43f29993e27..83d73d1b42af 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -69,13 +69,13 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests 
= true +ec2_benchmark_tests = false ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = false ### SM specific tests ### On by default sagemaker_local_tests = true @@ -100,9 +100,9 @@ sagemaker_remote_tests = true # run efa sagemaker tests sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index e7a0d5614f66..674430aca0ab 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-ec2.yml +buildspec_pointer: buildspec-2-9-sm.yml From fc61fdb293a33f34be8def54996e2432e0112437 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Thu, 13 Nov 2025 14:24:28 -0800 Subject: [PATCH 22/40] add fastai and update TE version --- pytorch/training/docker/2.9/py3/Dockerfile.cpu | 1 + pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.9/py3/Dockerfile.cpu b/pytorch/training/docker/2.9/py3/Dockerfile.cpu index 34f1f71c786c..422912115a07 100644 --- a/pytorch/training/docker/2.9/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.9/py3/Dockerfile.cpu @@ -202,6 +202,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ && pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \ torchdata==${TORCHDATA_VERSION} \ s3torchconnector \ + fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index 42ffac19b9f8..5f6689ddb64b 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -8,7 +8,7 @@ ARG TORCHVISION_VERSION=0.24.0 ARG TORCHDATA_VERSION=0.11.0 ARG GDRCOPY_VERSION=2.5.1 -ARG TE_VERSION=2.8 +ARG TE_VERSION=2.9 ARG FLASH_ATTN_VERSION=2.8.3 ################################################################# @@ -122,6 +122,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ torchdata==${TORCHDATA_VERSION} \ triton \ s3torchconnector \ + fastai \ accelerate \ # pin numpy requirement for fastai dependency # requires explicit declaration of spacy, thic, blis From 595f5b4a3a091f4d72a2483f599ec234f7b39cbc Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Thu, 13 Nov 2025 15:22:15 -0800 Subject: [PATCH 23/40] rebuild ec2 image with fastai --- dlc_developer_config.toml | 6 +++--- pytorch/training/buildspec.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/dlc_developer_config.toml b/dlc_developer_config.toml index 83d73d1b42af..b783e5d6459e 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -69,13 +69,13 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-sm.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index 674430aca0ab..e7a0d5614f66 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-sm.yml +buildspec_pointer: buildspec-2-9-ec2.yml From edf9871457534bafe1166416ab6c5d79cd4f7256 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 14 Nov 2025 09:34:16 -0800 Subject: [PATCH 24/40] rebuild sm image and test --- dlc_developer_config.toml | 6 +++--- pytorch/training/buildspec.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index b783e5d6459e..83d73d1b42af 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -69,13 +69,13 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = true +ec2_benchmark_tests = false ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
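The remaining commits in this series mostly toggle dlc_developer_config.toml and pytorch/training/buildspec.yml back and forth between the EC2 and SageMaker PyTorch 2.9 configurations to drive different CI runs. A hypothetical convenience script (not part of this repository) could make that flip in one step; the file paths and buildspec names below mirror the diffs, everything else is an assumption:

import re
from pathlib import Path

BUILDSPECS = {
    "ec2": "pytorch/training/buildspec-2-9-ec2.yml",
    "sm": "pytorch/training/buildspec-2-9-sm.yml",
}

def switch_platform(platform: str, repo_root: str = ".") -> None:
    """Point the developer config and buildspec pointer at the chosen buildspec."""
    buildspec = BUILDSPECS[platform]
    config = Path(repo_root, "dlc_developer_config.toml")
    # Repoint the PR's training buildspec.
    config.write_text(
        re.sub(
            r'dlc-pr-pytorch-training = "[^"]*"',
            f'dlc-pr-pytorch-training = "{buildspec}"',
            config.read_text(),
        )
    )
    # Keep the pointer file in sync with the developer config.
    Path(repo_root, "pytorch/training/buildspec.yml").write_text(
        f"buildspec_pointer: {Path(buildspec).name}\n"
    )

# switch_platform("sm")  # example: flip the PR to the SageMaker buildspec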
### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = false ### SM specific tests ### On by default sagemaker_local_tests = true @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index e7a0d5614f66..674430aca0ab 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-ec2.yml +buildspec_pointer: buildspec-2-9-sm.yml From 64492d382f3b95527a577518311681af093e362d Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 14 Nov 2025 12:19:08 -0800 Subject: [PATCH 25/40] update base image and flashattention wheel --- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index 5f6689ddb64b..feccb5b21194 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -25,7 +25,7 @@ ARG FLASH_ATTN_VERSION=2.8.3 # |___/ |_| ################################################################# -FROM 669063966089.dkr.ecr.us-west-2.amazonaws.com/pr-base:13.0.0-gpu-py312-cu130-ubuntu22.04-ec2-pr-5468-2025-11-12-00-24-28 AS common +FROM public.ecr.aws/deep-learning-containers/base:13.0.0-gpu-py312-ubuntu22.04-ec2 AS common # base has EFA, PYTHON and CUDA 13.0 LABEL maintainer="Amazon AI" @@ -137,9 +137,12 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install ENV NVTE_FRAMEWORK=pytorch -RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl \ - && pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \ - && rm flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +RUN MAX_JOBS=18 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation + +#RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl \ +# && pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \ +# && rm flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl RUN pip install --no-cache-dir nvidia-mathdx From a5bb85d1ed85f76f6ec40ce4d7cef939773b414d Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 14 Nov 2025 13:17:12 -0800 Subject: [PATCH 26/40] rebuild sm image with enabled security tests --- dlc_developer_config.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 83d73d1b42af..04b2304fac50 100644 --- a/dlc_developer_config.toml +++ 
b/dlc_developer_config.toml @@ -63,19 +63,19 @@ use_new_test_structure = false ### On by default sanity_tests = true security_tests = true - safety_check_test = false - ecr_scan_allowlist_feature = false + safety_check_test = true + ecr_scan_allowlist_feature = true ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = false +ec2_tests_on_heavy_instances = true ### SM specific tests ### On by default sagemaker_local_tests = true From 606d4fd23bebbf05069822d744a0d7f5730e42bf Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 14 Nov 2025 14:26:51 -0800 Subject: [PATCH 27/40] rebuild ec2 image --- dlc_developer_config.toml | 4 ++-- pytorch/training/buildspec.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 0ce09f9a1cad..24678d1dd2b6 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,7 +37,7 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-sm.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index 674430aca0ab..e7a0d5614f66 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-sm.yml +buildspec_pointer: buildspec-2-9-ec2.yml From d22ad4897eeae2a848782fa23fcdceeea18f0384 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 14 Nov 2025 16:10:42 -0800 Subject: [PATCH 28/40] rerun jobs after deleting AML2_CPU_ARM64_US_EAST_1 --- test/test_utils/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 53421e25ffe4..6205820eab42 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -140,10 +140,7 @@ def get_ami_id_ssm(region_name, parameter_path): ami_name_pattern="Deep Learning ARM64 AMI OSS Nvidia Driver GPU PyTorch 2.2.? 
(Ubuntu 20.04) ????????", IncludeDeprecated=True, ) -AML2_CPU_ARM64_US_EAST_1 = get_ami_id_boto3( - region_name="us-east-1", - ami_name_pattern="Deep Learning Base AMI (Amazon Linux 2) Version ??.?", -) + PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_EAST_1 = "ami-0673bb31cc62485dd" PT_GPU_PY3_BENCHMARK_IMAGENET_AMI_US_WEST_2 = "ami-02d9a47bc61a31d43" From 85e990358a5263e68da45e14a18f738177182e0c Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Fri, 14 Nov 2025 17:41:06 -0800 Subject: [PATCH 29/40] rerun jobs after disabling safety check test and ecr scan allowlist --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 24678d1dd2b6..adc4471f27e8 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -63,8 +63,8 @@ use_new_test_structure = false ### On by default sanity_tests = true security_tests = true - safety_check_test = true - ecr_scan_allowlist_feature = true + safety_check_test = false + ecr_scan_allowlist_feature = false ecs_tests = true eks_tests = true ec2_tests = true From a9bb637cbd0359ba12bfe06095945a740cea2312 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Sun, 16 Nov 2025 21:35:05 -0800 Subject: [PATCH 30/40] update MAX_JOBS and try rebuild --- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index feccb5b21194..811ba246d9f8 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -138,7 +138,7 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ ENV NVTE_FRAMEWORK=pytorch # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features -RUN MAX_JOBS=18 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation #RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl \ # && pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \ From 155cae7e8fbe12673475ab9e5919507098bbaef4 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 17 Nov 2025 00:08:49 -0800 Subject: [PATCH 31/40] rebuild ec2 image with safety check test and ecr scan allowlist --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index adc4471f27e8..24678d1dd2b6 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -63,8 +63,8 @@ use_new_test_structure = false ### On by default sanity_tests = true security_tests = true - safety_check_test = false - ecr_scan_allowlist_feature = false + safety_check_test = true + ecr_scan_allowlist_feature = true ecs_tests = true eks_tests = true ec2_tests = true From 719a55fa8c1d895a086283213f7be67ce64df524 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 17 Nov 2025 02:48:41 -0800 Subject: [PATCH 32/40] rebuild ec2 image and run tests --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 
24678d1dd2b6..adc4471f27e8 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -63,8 +63,8 @@ use_new_test_structure = false ### On by default sanity_tests = true security_tests = true - safety_check_test = true - ecr_scan_allowlist_feature = true + safety_check_test = false + ecr_scan_allowlist_feature = false ecs_tests = true eks_tests = true ec2_tests = true From 07b1b2601962189c667bcdf533ea2abf8d7b96f0 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 17 Nov 2025 08:11:14 -0800 Subject: [PATCH 33/40] rebuild sm image and run tests --- dlc_developer_config.toml | 4 ++-- pytorch/training/buildspec.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index adc4471f27e8..67957cea8c68 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -98,7 +98,7 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests sagemaker_rc_tests = true # run sagemaker benchmark tests @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index e7a0d5614f66..674430aca0ab 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-ec2.yml +buildspec_pointer: buildspec-2-9-sm.yml From 65688331dab9c48199952f0b66bccc8b26e58cf8 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 17 Nov 2025 15:55:11 -0800 Subject: [PATCH 34/40] skip smppy tests and rerun --- test/sagemaker_tests/pytorch/training/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sagemaker_tests/pytorch/training/conftest.py b/test/sagemaker_tests/pytorch/training/conftest.py index 20e381a9ffbc..74ab75a252d2 100644 --- a/test/sagemaker_tests/pytorch/training/conftest.py +++ b/test/sagemaker_tests/pytorch/training/conftest.py @@ -432,7 +432,7 @@ def skip_smppy_test( """For each currency release, we can skip smppy tests if the Profiler binary does not exist. However, when the Profiler binaries are added, be sure to fix the test logic such that the tests are not skipped. 
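The conftest.py change introduced here extends skip_smppy_test so that PyTorch 2.9 images (cpu and cu130) are skipped as well, keyed by a version-range string. The real matching happens inside _validate_pytorch_framework_version, which is not shown in this diff; purely as an illustration, a range/processor lookup of that shape can be evaluated with packaging.specifiers:

from packaging.specifiers import SpecifierSet
from packaging.version import Version

SKIP_DICT = {
    ">=2.7.1,<2.8": ["cpu", "cu128"],
    ">=2.8,<2.9": ["cpu", "cu129"],
    ">=2.9,<3.0": ["cpu", "cu130"],
}

def should_skip(framework_version: str, device_suffix: str) -> bool:
    """True when the framework version falls in a skipped range for that device tag."""
    version = Version(framework_version)
    return any(
        version in SpecifierSet(spec) and device_suffix in suffixes
        for spec, suffixes in SKIP_DICT.items()
    )

# e.g. should_skip("2.9.0", "cu130") -> True; should_skip("2.8.0", "cu130") -> False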
""" - skip_dict = {">=2.7.1,<2.8": ["cpu", "cu128"], ">=2.8,<2.9": ["cpu", "cu129"]} + skip_dict = {">=2.7.1,<2.8": ["cpu", "cu128"], ">=2.8,<2.9": ["cpu", "cu129"], ">=2.9,<3.0": ["cpu", "cu130"]} if _validate_pytorch_framework_version( request, processor, ecr_image, "skip_smppy_test", skip_dict ): From 1b6ed8d134cfcbd2ec6044dbca415fb260e14e4b Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Mon, 17 Nov 2025 21:34:52 -0800 Subject: [PATCH 35/40] rerun after enabling safety check test and ecr scan allowlist --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 67957cea8c68..7a64db081518 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -63,8 +63,8 @@ use_new_test_structure = false ### On by default sanity_tests = true security_tests = true - safety_check_test = false - ecr_scan_allowlist_feature = false + safety_check_test = true + ecr_scan_allowlist_feature = true ecs_tests = true eks_tests = true ec2_tests = true From ef9f107f6c5570c722e1c2e46a2b8d09872e5159 Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 18 Nov 2025 00:46:14 -0800 Subject: [PATCH 36/40] rebuild ec2 image --- dlc_developer_config.toml | 8 ++++---- pytorch/training/buildspec.yml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 7a64db081518..8c648fdc2549 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-sm.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index 674430aca0ab..e7a0d5614f66 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-sm.yml +buildspec_pointer: buildspec-2-9-ec2.yml From a450adac64714eef0646c54dfcf932eb6f8f429a Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 18 Nov 2025 14:38:29 -0800 Subject: [PATCH 37/40] fix formatting --- pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu | 5 +---- test/sagemaker_tests/pytorch/training/conftest.py | 6 +++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu index 811ba246d9f8..5cd5bf502de0 100644 --- a/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu +++ b/pytorch/training/docker/2.9/py3/cu130/Dockerfile.gpu @@ -138,12 +138,9 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \ ENV NVTE_FRAMEWORK=pytorch # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process RUN MAX_JOBS=4 pip 
install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation -#RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl \ -# && pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \ -# && rm flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl - RUN pip install --no-cache-dir nvidia-mathdx # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html diff --git a/test/sagemaker_tests/pytorch/training/conftest.py b/test/sagemaker_tests/pytorch/training/conftest.py index 74ab75a252d2..196096c79056 100644 --- a/test/sagemaker_tests/pytorch/training/conftest.py +++ b/test/sagemaker_tests/pytorch/training/conftest.py @@ -432,7 +432,11 @@ def skip_smppy_test( """For each currency release, we can skip smppy tests if the Profiler binary does not exist. However, when the Profiler binaries are added, be sure to fix the test logic such that the tests are not skipped. """ - skip_dict = {">=2.7.1,<2.8": ["cpu", "cu128"], ">=2.8,<2.9": ["cpu", "cu129"], ">=2.9,<3.0": ["cpu", "cu130"]} + skip_dict = { + ">=2.7.1,<2.8": ["cpu", "cu128"], + ">=2.8,<2.9": ["cpu", "cu129"], + ">=2.9,<3.0": ["cpu", "cu130"], + } if _validate_pytorch_framework_version( request, processor, ecr_image, "skip_smppy_test", skip_dict ): From 36f8594bbd363e2b09516c64431f171ec064ecbc Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Tue, 18 Nov 2025 19:52:24 -0800 Subject: [PATCH 38/40] Rerun SM tests --- dlc_developer_config.toml | 8 ++++---- pytorch/training/buildspec.yml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 5fa853c1dff0..a0483b69b655 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = false +sagemaker_efa_tests = true # run release_candidate_integration tests -sagemaker_rc_tests = false +sagemaker_rc_tests = true # run sagemaker benchmark tests -sagemaker_benchmark_tests = false +sagemaker_benchmark_tests = true # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index e7a0d5614f66..674430aca0ab 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-ec2.yml +buildspec_pointer: buildspec-2-9-sm.yml From 15efc9b008bb61c5eee070b89e89b6a97aeb48fd Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 19 Nov 2025 11:13:54 -0800 Subject: [PATCH 39/40] Revert testEFA changes and run --- dlc_developer_config.toml | 8 ++++---- pytorch/training/buildspec.yml | 2 +- test/dlc_tests/container_tests/bin/efa/testEFA | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index a0483b69b655..5fa853c1dff0 100644 --- 
a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -98,11 +98,11 @@ ipv6_vpc_name = "" # run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true # run efa sagemaker tests -sagemaker_efa_tests = true +sagemaker_efa_tests = false # run release_candidate_integration tests -sagemaker_rc_tests = true +sagemaker_rc_tests = false # run sagemaker benchmark tests -sagemaker_benchmark_tests = true +sagemaker_benchmark_tests = false # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-sm.yml" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml index 674430aca0ab..e7a0d5614f66 100644 --- a/pytorch/training/buildspec.yml +++ b/pytorch/training/buildspec.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-2-9-sm.yml +buildspec_pointer: buildspec-2-9-ec2.yml diff --git a/test/dlc_tests/container_tests/bin/efa/testEFA b/test/dlc_tests/container_tests/bin/efa/testEFA index 52f5664625d8..fd66b0ab2ccc 100755 --- a/test/dlc_tests/container_tests/bin/efa/testEFA +++ b/test/dlc_tests/container_tests/bin/efa/testEFA @@ -89,7 +89,7 @@ check_efa_nccl_all_reduce(){ RETURN_VAL=${PIPESTATUS[0]} # In case, if you would like see logs, uncomment below line - RESULT=$(cat ${TRAINING_LOG}) + #RESULT=$(cat ${TRAINING_LOG}) if [ ${RETURN_VAL} -eq 0 ]; then echo "***************************** check_efa_nccl_all_reduce passed *****************************" From a18c234ffdf3af4f8db70106aeea2414261a61ec Mon Sep 17 00:00:00 2001 From: DevakiBolleneni Date: Wed, 19 Nov 2025 13:48:26 -0800 Subject: [PATCH 40/40] Revert toml file --- dlc_developer_config.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 5fa853c1dff0..2ddfe8ccb932 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["pytorch"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = false +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -63,19 +63,19 @@ use_new_test_structure = false ### On by default sanity_tests = true security_tests = true - safety_check_test = true - ecr_scan_allowlist_feature = true + safety_check_test = false + ecr_scan_allowlist_feature = false ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = true +ec2_benchmark_tests = false ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. 
If false, these types of tests will be skipped while other tests will run as usual. ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) -ec2_tests_on_heavy_instances = true +ec2_tests_on_heavy_instances = false ### SM specific tests ### On by default sagemaker_local_tests = true @@ -124,7 +124,7 @@ nightly_pr_test_mode = false dlc-pr-base = "" # Standard Framework Training -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-9-ec2.yml" +dlc-pr-pytorch-training = "" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = ""