diff --git a/pytorch/training/buildspec.yml b/pytorch/training/buildspec.yml
index 00e393291f32..f44b91988043 100644
--- a/pytorch/training/buildspec.yml
+++ b/pytorch/training/buildspec.yml
@@ -41,21 +41,21 @@ images:
   #   target: ec2
   #   context:
   #     <<: *TRAINING_CONTEXT
-  # BuildEC2GPUPTTrainPy3cu121DockerImage:
-  #   <<: *TRAINING_REPOSITORY
-  #   build: &PYTORCH_GPU_TRAINING_PY3 false
-  #   image_size_baseline: 19700
-  #   device_type: &DEVICE_TYPE gpu
-  #   python_version: &DOCKER_PYTHON_VERSION py3
-  #   tag_python_version: &TAG_PYTHON_VERSION py310
-  #   cuda_version: &CUDA_VERSION cu121
-  #   os_version: &OS_VERSION ubuntu20.04
-  #   tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
-  #   docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-  #     *DEVICE_TYPE ]
-  #   target: ec2
-  #   context:
-  #     <<: *TRAINING_CONTEXT
+  BuildEC2GPUPTTrainPy3cu121DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 19700
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py310
+    cuda_version: &CUDA_VERSION cu121
+    os_version: &OS_VERSION ubuntu20.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+      *DEVICE_TYPE ]
+    target: ec2
+    context:
+      <<: *TRAINING_CONTEXT
   # BuildEC2GPUPTTrainPy3cu118DockerImage:
   #   <<: *TRAINING_REPOSITORY
   #   build: &PYTORCH_GPU_TRAINING_PY3 false
@@ -84,21 +84,21 @@ images:
   #   target: sagemaker
   #   context:
   #     <<: *TRAINING_CONTEXT
-  BuildSageMakerGPUPTTrainPy3DockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &PYTORCH_GPU_TRAINING_PY3 false
-    image_size_baseline: 21500
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py310
-    cuda_version: &CUDA_VERSION cu118
-    os_version: &OS_VERSION ubuntu20.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-      *DEVICE_TYPE ]
-    target: sagemaker
-    context:
-      <<: *TRAINING_CONTEXT
+  # BuildSageMakerGPUPTTrainPy3DockerImage:
+  #   <<: *TRAINING_REPOSITORY
+  #   build: &PYTORCH_GPU_TRAINING_PY3 false
+  #   image_size_baseline: 21500
+  #   device_type: &DEVICE_TYPE gpu
+  #   python_version: &DOCKER_PYTHON_VERSION py3
+  #   tag_python_version: &TAG_PYTHON_VERSION py310
+  #   cuda_version: &CUDA_VERSION cu118
+  #   os_version: &OS_VERSION ubuntu20.04
+  #   tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+  #   docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+  #     *DEVICE_TYPE ]
+  #   target: sagemaker
+  #   context:
+  #     <<: *TRAINING_CONTEXT
   # BuildPyTorchExampleGPUTrainPy3cu121DockerImage:
   #   <<: *TRAINING_REPOSITORY
   #   build: &PYTORCH_GPU_TRAINING_PY3 false
diff --git a/pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu b/pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu
index b70e7baf404c..2e709c886d85 100644
--- a/pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu
+++ b/pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu
@@ -46,6 +46,7 @@ ENV PATH /opt/conda/bin:$PATH
 # 5.2 is G3 EC2 instance, 7.5 is G4*, 7.0 is p3*, 8.0 is P4*, 8.6 is G5* and 9.0 is P5*
 ENV TORCH_CUDA_ARCH_LIST="5.2;7.0+PTX;7.5;8.0;8.6;9.0"
 ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
+ENV CUDNN_VERSION=8.9.3.28
 ENV NCCL_VERSION=2.18.3
 ENV EFA_VERSION=1.24.1
 ENV GDRCOPY_VERSION=2.3.1
@@ -68,6 +69,8 @@ RUN apt-get update \
     build-essential \
     ca-certificates \
     cmake \
+    libcudnn8=$CUDNN_VERSION-1+cuda12.1 \
+    libcudnn8-dev=$CUDNN_VERSION-1+cuda12.1 \
     curl \
     emacs \
     git \
@@ -133,7 +136,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     # Adding package for studio kernels
     ipykernel \
     # patch CVE
-    "cryptography>=41.0.2" \
+    "cryptography>=41.0.4" \
     # patch CVE
     "pillow>=9.4" \
     "mpi4py>=3.1.4,<3.2" \
@@ -268,7 +271,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     && /opt/conda/bin/mamba clean -afy
 
 # Patches
-RUN pip install "pillow>=9.5" opencv-python
+RUN pip install "pillow>=9.5" opencv-python huggingface_hub
 RUN /opt/conda/bin/mamba install -y -c conda-forge \
     "requests>=2.31.0" \
     && /opt/conda/bin/mamba clean -afy
@@ -292,6 +295,14 @@ RUN pip install packaging \
     && cd .. \
     && rm -rf apex
 
+# Install flash-attn and NVIDIA Transformer Engine
+ENV NVTE_FRAMEWORK=pytorch
+# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
+# Set MAX_JOBS=4 to avoid OOM issues during installation
+RUN MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation
+# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
+RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@release_v0.12
+
 RUN HOME_DIR=/root \
     && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
     && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py
index d4a545fbb756..976843a168de 100644
--- a/test/dlc_tests/conftest.py
+++ b/test/dlc_tests/conftest.py
@@ -1025,6 +1025,11 @@ def skip_pt110():
     pass
 
 
+@pytest.fixture(scope="session")
+def pt21_and_above_only():
+    pass
+
+
 @pytest.fixture(scope="session")
 def pt18_and_above_only():
     pass
@@ -1154,6 +1159,10 @@ def framework_version_within_limit(metafunc_obj, image):
         "skip_pt110" in metafunc_obj.fixturenames
         and is_equal_to_framework_version("1.10.*", image, image_framework_name)
     )
+    pt21_requirement_failed = (
+        "pt21_and_above_only" in metafunc_obj.fixturenames
+        and is_below_framework_version("2.1", image, image_framework_name)
+    )
     pt18_requirement_failed = (
         "pt18_and_above_only" in metafunc_obj.fixturenames
         and is_below_framework_version("1.8", image, image_framework_name)
@@ -1181,6 +1190,7 @@ def framework_version_within_limit(metafunc_obj, image):
         or below_pt113_requirement_failed
         or pt111_requirement_failed
         or not_pt110_requirement_failed
+        or pt21_requirement_failed
         or pt18_requirement_failed
         or pt17_requirement_failed
         or pt16_requirement_failed
diff --git a/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine b/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine
new file mode 100755
index 000000000000..22af8ce92255
--- /dev/null
+++ b/test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -ex
+
+git clone --branch release_v0.12 https://github.com/NVIDIA/TransformerEngine.git
+cd TransformerEngine/tests/pytorch
+
+pip install pytest==6.2.5 onnxruntime==1.13.1 onnx
+pytest -v -s test_sanity.py
+PYTORCH_JIT=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s test_numerics.py
+NVTE_TORCH_COMPILE=0 pytest -v -s test_onnx_export.py
+pytest -v -s test_jit.py
diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
index 59dc3552b6f0..644952e9f208 100644
--- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
+++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
@@ -620,3 +620,47 @@ def test_pytorch_standalone_hpu(
         container_name="ec2_training_habana_pytorch_container",
         enable_habana_async_execution=True,
     )
+
+
+@pytest.mark.usefixtures("feature_aws_framework_present")
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("cudnn")
+@pytest.mark.model("N/A")
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
+def test_pytorch_cudnn_match_gpu(
+    pytorch_training, ec2_connection, region, gpu_only, ec2_instance_type, pt21_and_above_only
+):
+    """
+    PT 2.1 reintroduces a dependency on CUDNN to support NVIDIA TransformerEngine. This test ensures that the torch CUDNN version matches the system CUDNN version in the container.
+    """
+    container_name = "pt_cudnn_test"
+    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
+    ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True)
+    ec2_connection.run(
+        f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True
+    )
+    major_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_MAJOR'"
+    minor_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_MINOR'"
+    patch_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_PATCHLEVEL'"
+    major = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{major_cmd}'", hide=True
+    ).stdout.split()[-1]
+    minor = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{minor_cmd}'", hide=True
+    ).stdout.split()[-1]
+    patch = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{patch_cmd}'", hide=True
+    ).stdout.split()[-1]
+
+    cudnn_from_torch = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} python -c 'from torch.backends import cudnn; print(cudnn.version())'",
+        hide=True,
+    ).stdout.strip()
+
+    if len(patch) == 1:
+        patch = f"0{patch}"
+
+    system_cudnn = f"{major}{minor}{patch}"
+    assert (
+        system_cudnn == cudnn_from_torch
+    ), f"System CUDNN {system_cudnn} and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with the correct CUDNN version."
diff --git a/test/dlc_tests/ec2/test_transformerengine.py b/test/dlc_tests/ec2/test_transformerengine.py
new file mode 100644
index 000000000000..d9bad19a9a92
--- /dev/null
+++ b/test/dlc_tests/ec2/test_transformerengine.py
@@ -0,0 +1,33 @@
+import os
+
+import pytest
+
+import test.test_utils.ec2 as ec2_utils
+from test.test_utils import CONTAINER_TESTS_PREFIX, is_pr_context, is_efa_dedicated
+from test.test_utils.ec2 import get_efa_ec2_instance_type, filter_efa_instance_type
+
+PT_TE_TESTS_CMD = os.path.join(
+    CONTAINER_TESTS_PREFIX, "transformerengine", "testPTTransformerEngine"
+)
+
+
+EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION = get_efa_ec2_instance_type(
+    default="p4d.24xlarge",
+    filter_function=filter_efa_instance_type,
+)
+
+
+@pytest.mark.processor("gpu")
+@pytest.mark.model("N/A")
+@pytest.mark.integration("transformerengine")
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.allow_p4de_use
+@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION)
+@pytest.mark.skipif(
+    is_pr_context() and not is_efa_dedicated(),
+    reason="Skip heavy instance test in PR context unless explicitly enabled",
+)
+def test_pytorch_transformerengine(
+    pytorch_training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only
+):
+    ec2_utils.execute_ec2_training_test(ec2_connection, pytorch_training, PT_TE_TESTS_CMD)