-
Notifications
You must be signed in to change notification settings - Fork 516
Add TransformerEngine to PT 2.0 training images #3315
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
813bfe9
9a50f6b
4653068
227daaa
efe2170
97d3440
d5d0314
e1d10c8
d9d742d
ce8d087
ee98782
22d7d60
af662fd
c97541b
6cb71c8
d5626a4
3d83645
15df3aa
e91071d
02c9187
6f1caca
95a9003
7530535
51594ab
91285fe
f6976e5
c410e1d
edd5550
6c91e67
799e144
7a8f34b
5e14ead
6368fc0
8ae87c2
4295532
52c8b86
b2787cd
2c43c26
da102bb
7fb2477
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash
# Run the NVIDIA TransformerEngine PyTorch test suite inside the training image.
# -e: abort on the first failing step; -x: echo each command so CI logs show progress.
set -ex

TE_REPO="https://github.com/NVIDIA/TransformerEngine.git"
TE_BRANCH="release_v0.12"

# Fetch the TE sources pinned to the release branch and enter its PyTorch tests.
git clone --branch "${TE_BRANCH}" "${TE_REPO}"
cd TransformerEngine/tests/pytorch

# Test-only dependencies, pinned for reproducibility.
pip install pytest==6.2.5 onnxruntime==1.13.1 onnx

pytest -v -s test_sanity.py
# Numerics run with TorchScript JIT off and nondeterministic algorithms disallowed.
PYTORCH_JIT=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s test_numerics.py
# ONNX export tests run with TE's torch.compile path turned off.
NVTE_TORCH_COMPILE=0 pytest -v -s test_onnx_export.py
pytest -v -s test_jit.py
arjkesh marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -620,3 +620,47 @@ def test_pytorch_standalone_hpu( | |
container_name="ec2_training_habana_pytorch_container", | ||
enable_habana_async_execution=True, | ||
) | ||
|
||
|
||
@pytest.mark.usefixtures("feature_aws_framework_present")
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("cudnn")
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
def test_pytorch_cudnn_match_gpu(
    pytorch_training, ec2_connection, region, gpu_only, ec2_instance_type, pt21_and_above_only
):
    """
    PT 2.1 reintroduces a dependency on CUDNN to support NVDA TransformerEngine.
    This test ensures that the cuDNN version torch was compiled against matches the
    system cuDNN installed in the container (read from /usr/include/cudnn_version.h).
    """
    container_name = "pt_cudnn_test"
    # Log in to ECR, pull the image under test, and start it detached.
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True)
    ec2_connection.run(
        f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True
    )

    def _cudnn_header_value(macro):
        """Return the value of one #define (e.g. CUDNN_MAJOR) from the container's cudnn header."""
        # Invoke grep directly (no `cat |` pipeline, no `bash -c`): the original wrapped a
        # single-quoted grep pattern inside another single-quoted `bash -c '...'`, which the
        # remote shell mis-parses (nested single quotes cannot be escaped in POSIX shells).
        return ec2_connection.run(
            f'nvidia-docker exec --user root {container_name} '
            f'grep "#define {macro}" /usr/include/cudnn_version.h',
            hide=True,
        ).stdout.split()[-1]

    major = _cudnn_header_value("CUDNN_MAJOR")
    minor = _cudnn_header_value("CUDNN_MINOR")
    patch = _cudnn_header_value("CUDNN_PATCHLEVEL")

    # Version cuDNN was statically compiled into torch with; torch reports it as a single
    # integer (e.g. 8500 for 8.5.0), so the components below are concatenated to match.
    cudnn_from_torch = ec2_connection.run(
        f"nvidia-docker exec --user root {container_name} python -c 'from torch.backends import cudnn; print(cudnn.version())'",
        hide=True,
    ).stdout.strip()

    # torch's encoding gives the patchlevel two digits, so zero-pad a one-digit patch.
    if len(patch) == 1:
        patch = f"0{patch}"

    system_cudnn = f"{major}{minor}{patch}"
    assert (
        system_cudnn == cudnn_from_torch
    ), f"System CUDNN {system_cudnn} and torch cudnn {cudnn_from_torch} do not match. Please downgrade system CUDNN or recompile torch with correct CUDNN version."
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import os | ||
|
||
import pytest | ||
|
||
import test.test_utils.ec2 as ec2_utils | ||
from test.test_utils import CONTAINER_TESTS_PREFIX, is_pr_context, is_efa_dedicated | ||
from test.test_utils.ec2 import get_efa_ec2_instance_type, filter_efa_instance_type | ||
|
||
# Container-side path of the TransformerEngine test-runner script executed on the instance.
PT_TE_TESTS_CMD = os.path.join(
    CONTAINER_TESTS_PREFIX, "transformerengine", "testPTTransformerEngine"
)


# EFA-capable GPU instance selection, defaulting to p4d.24xlarge and narrowed by
# filter_efa_instance_type; unpacked as (ec2_instance_type, region) pairs by the
# parametrize decorator on the test below.
EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION = get_efa_ec2_instance_type(
    default="p4d.24xlarge",
    filter_function=filter_efa_instance_type,
)
|
||
|
||
@pytest.mark.processor("gpu")
@pytest.mark.model("N/A")
@pytest.mark.integration("transformerengine")
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.allow_p4de_use
@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION)
@pytest.mark.skipif(
    is_pr_context() and not is_efa_dedicated(),
    reason="Skip heavy instance test in PR context unless explicitly enabled",
)
def test_pytorch_transformerengine(
    pytorch_training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only
):
    """Run the TransformerEngine test script inside the PT training image on an EFA GPU instance."""
    # Delegate to the shared EC2 test harness with the container-side TE test command.
    te_test_command = PT_TE_TESTS_CMD
    ec2_utils.execute_ec2_training_test(ec2_connection, pytorch_training, te_test_command)
Uh oh!
There was an error while loading. Please reload this page.