diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml
index 00e393291f32..f44b91988043 100644
--- a/pytorch/training/buildspec.yml
+++ b/pytorch/training/buildspec.yml
@@ -41,21 +41,21 @@ images:
   #   target: ec2
   #   context:
   #     <<: *TRAINING_CONTEXT
-  # BuildEC2GPUPTTrainPy3cu121DockerImage:
-  #   <<: *TRAINING_REPOSITORY
-  #   build: &PYTORCH_GPU_TRAINING_PY3 false
-  #   image_size_baseline: 19700
-  #   device_type: &DEVICE_TYPE gpu
-  #   python_version: &DOCKER_PYTHON_VERSION py3
-  #   tag_python_version: &TAG_PYTHON_VERSION py310
-  #   cuda_version: &CUDA_VERSION cu121
-  #   os_version: &OS_VERSION ubuntu20.04
-  #   tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
-  #   docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-  #     *DEVICE_TYPE ]
-  #   target: ec2
-  #   context:
-  #     <<: *TRAINING_CONTEXT
+  BuildEC2GPUPTTrainPy3cu121DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 19700
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py310
+    cuda_version: &CUDA_VERSION cu121
+    os_version: &OS_VERSION ubuntu20.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+      *DEVICE_TYPE ]
+    target: ec2
+    context:
+      <<: *TRAINING_CONTEXT
   # BuildEC2GPUPTTrainPy3cu118DockerImage:
   #   <<: *TRAINING_REPOSITORY
   #   build: &PYTORCH_GPU_TRAINING_PY3 false
@@ -84,21 +84,21 @@ images:
   #   target: sagemaker
   #   context:
   #     <<: *TRAINING_CONTEXT
-  BuildSageMakerGPUPTTrainPy3DockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &PYTORCH_GPU_TRAINING_PY3 false
-    image_size_baseline: 21500
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py310
-    cuda_version: &CUDA_VERSION cu118
-    os_version: &OS_VERSION ubuntu20.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-      *DEVICE_TYPE ]
-    target: sagemaker
-    context:
-      <<: *TRAINING_CONTEXT
+  # BuildSageMakerGPUPTTrainPy3DockerImage:
+  #   <<: *TRAINING_REPOSITORY
+  #   build: &PYTORCH_GPU_TRAINING_PY3 false
+  #   image_size_baseline: 21500
+  #   device_type: &DEVICE_TYPE gpu
+  #   python_version: &DOCKER_PYTHON_VERSION py3
+  #   tag_python_version: &TAG_PYTHON_VERSION py310
+  #   cuda_version: &CUDA_VERSION cu118
+  #   os_version: &OS_VERSION ubuntu20.04
+  #   tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+  #   docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+  #     *DEVICE_TYPE ]
+  #   target: sagemaker
+  #   context:
+  #     <<: *TRAINING_CONTEXT
   # BuildPyTorchExampleGPUTrainPy3cu121DockerImage:
   #   <<: *TRAINING_REPOSITORY
   #   build: &PYTORCH_GPU_TRAINING_PY3 false
diff --git a/pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu b/pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu
index b70e7baf404c..2e709c886d85 100644
--- a/pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu
+++ b/pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu
@@ -46,6 +46,7 @@ ENV PATH /opt/conda/bin:$PATH
 # 5.2 is G3 EC2 instance, 7.5 is G4*, 7.0 is p3*, 8.0 is P4*, 8.6 is G5* and 9.0 is P5*
 ENV TORCH_CUDA_ARCH_LIST="5.2;7.0+PTX;7.5;8.0;8.6;9.0"
 ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
+ENV CUDNN_VERSION=8.9.3.28
 ENV NCCL_VERSION=2.18.3
 ENV EFA_VERSION=1.24.1
 ENV GDRCOPY_VERSION=2.3.1
@@ -68,6 +69,8 @@ RUN apt-get update \
     build-essential \
     ca-certificates \
     cmake \
+    libcudnn8=$CUDNN_VERSION-1+cuda12.1 \
+    libcudnn8-dev=$CUDNN_VERSION-1+cuda12.1 \
     curl \
     emacs \
     git \
@@ -133,7 +136,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     # Adding package for studio kernels
     ipykernel \
     # patch CVE
-    "cryptography>=41.0.2" \
+    "cryptography>=41.0.4" \
     # patch CVE
     "pillow>=9.4" \
     "mpi4py>=3.1.4,<3.2" \
@@ -268,7 +271,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     && /opt/conda/bin/mamba clean -afy
 
 # Patches
-RUN pip install "pillow>=9.5" opencv-python
+RUN pip install "pillow>=9.5" opencv-python huggingface_hub
 RUN /opt/conda/bin/mamba install -y -c conda-forge \
     "requests>=2.31.0" \
     && /opt/conda/bin/mamba clean -afy
@@ -292,6 +295,14 @@ RUN pip install packaging \
     && cd .. \
     && rm -rf apex
 
+# Install flash-attn and NVIDIA Transformer Engine
+ENV NVTE_FRAMEWORK=pytorch
+# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
+# Set MAX_JOBS=4 to avoid OOM issues during installation
+RUN MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation
+# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
+RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@release_v0.12
+
 RUN HOME_DIR=/root \
     && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
     && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py
index d4a545fbb756..976843a168de 100644
--- a/test/dlc_tests/conftest.py
+++ b/test/dlc_tests/conftest.py
@@ -1025,6 +1025,11 @@ def skip_pt110():
     pass
 
 
+@pytest.fixture(scope="session")
+def pt21_and_above_only():
+    pass
+
+
 @pytest.fixture(scope="session")
 def pt18_and_above_only():
     pass
@@ -1154,6 +1159,10 @@ def framework_version_within_limit(metafunc_obj, image):
         "skip_pt110" in metafunc_obj.fixturenames
         and is_equal_to_framework_version("1.10.*", image, image_framework_name)
     )
+    pt21_requirement_failed = (
+        "pt21_and_above_only" in metafunc_obj.fixturenames
+        and is_below_framework_version("2.1", image, image_framework_name)
+    )
     pt18_requirement_failed = (
         "pt18_and_above_only" in metafunc_obj.fixturenames
         and is_below_framework_version("1.8", image, image_framework_name)
@@ -1181,6 +1190,7 @@ def framework_version_within_limit(metafunc_obj, image):
         or below_pt113_requirement_failed
         or pt111_requirement_failed
         or not_pt110_requirement_failed
+        or pt21_requirement_failed
         or pt18_requirement_failed
         or pt17_requirement_failed
         or pt16_requirement_failed
diff --git a/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine b/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine
new file mode 100755
index 000000000000..22af8ce92255
--- /dev/null
+++ b/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -ex
+
+git clone --branch release_v0.12 https://github.com/NVIDIA/TransformerEngine.git
+cd TransformerEngine/tests/pytorch
+
+pip install pytest==6.2.5 onnxruntime==1.13.1 onnx
+pytest -v -s test_sanity.py
+PYTORCH_JIT=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s test_numerics.py
+NVTE_TORCH_COMPILE=0 pytest -v -s test_onnx_export.py
+pytest -v -s test_jit.py
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
index 59dc3552b6f0..644952e9f208 100644
--- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
+++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
@@ -620,3 +620,47 @@ def test_pytorch_standalone_hpu(
         container_name="ec2_training_habana_pytorch_container",
         enable_habana_async_execution=True,
     )
+
+
+@pytest.mark.usefixtures("feature_aws_framework_present")
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("cudnn")
+@pytest.mark.model("N/A")
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
+def test_pytorch_cudnn_match_gpu(
+    pytorch_training, ec2_connection, region, gpu_only, ec2_instance_type, pt21_and_above_only
+):
+    """
+    PT 2.1 reintroduces a dependency on CUDNN to support NVIDIA TransformerEngine. This test ensures that the torch CUDNN version matches the system CUDNN version in the container.
+    """
+    container_name = "pt_cudnn_test"
+    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
+    ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True)
+    ec2_connection.run(
+        f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True
+    )
+    major_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_MAJOR'"
+    minor_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_MINOR'"
+    patch_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_PATCHLEVEL'"
+    major = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{major_cmd}'", hide=True
+    ).stdout.split()[-1]
+    minor = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{minor_cmd}'", hide=True
+    ).stdout.split()[-1]
+    patch = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{patch_cmd}'", hide=True
+    ).stdout.split()[-1]
+
+    cudnn_from_torch = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} python -c 'from torch.backends import cudnn; print(cudnn.version())'",
+        hide=True,
+    ).stdout.strip()
+
+    if len(patch) == 1:
+        patch = f"0{patch}"
+
+    system_cudnn = f"{major}{minor}{patch}"
+    assert (
+        system_cudnn == cudnn_from_torch
+    ), f"System CUDNN {system_cudnn} and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with the correct CUDNN version."
diff --git a/test/dlc_tests/ec2/test_transformerengine.py b/test/dlc_tests/ec2/test_transformerengine.py
new file mode 100644
index 000000000000..d9bad19a9a92
--- /dev/null
+++ b/test/dlc_tests/ec2/test_transformerengine.py
@@ -0,0 +1,33 @@
+import os
+
+import pytest
+
+import test.test_utils.ec2 as ec2_utils
+from test.test_utils import CONTAINER_TESTS_PREFIX, is_pr_context, is_efa_dedicated
+from test.test_utils.ec2 import get_efa_ec2_instance_type, filter_efa_instance_type
+
+PT_TE_TESTS_CMD = os.path.join(
+    CONTAINER_TESTS_PREFIX, "transformerengine", "testPTTransformerEngine"
+)
+
+
+EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION = get_efa_ec2_instance_type(
+    default="p4d.24xlarge",
+    filter_function=filter_efa_instance_type,
+)
+
+
+@pytest.mark.processor("gpu")
+@pytest.mark.model("N/A")
+@pytest.mark.integration("transformerengine")
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.allow_p4de_use
+@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION)
+@pytest.mark.skipif(
+    is_pr_context() and not is_efa_dedicated(),
+    reason="Skip heavy instance test in PR context unless explicitly enabled",
+)
+def test_pytorch_transformerengine(
+    pytorch_training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only
+):
+    ec2_utils.execute_ec2_training_test(ec2_connection, pytorch_training, PT_TE_TESTS_CMD)