From 760dcb9af076e8930baf3d449687d80727274156 Mon Sep 17 00:00:00 2001
From: ndodda-amazon
Date: Wed, 17 Feb 2021 11:06:14 -0800
Subject: [PATCH 1/6] Enable SDP ZCC tests

---
 .../pytorch/test_smdataparallel.py          |  9 ---------
 .../tensorflow2/test_tf2_smdataparallel.py  | 12 ------------
 2 files changed, 21 deletions(-)

diff --git a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
index f950d0236..a25d25dbc 100644
--- a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
+++ b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
@@ -44,9 +44,6 @@ def mode_allworkers(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
-)
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
 
@@ -60,9 +57,6 @@ def smdataparallel_profiler_config_path(config_folder, monkeypatch):
     os.remove(config_path)
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
-)
 @pytest.mark.parametrize("mode", ["gpu"])
 @pytest.mark.parametrize("worker_function", [mode_allworkers])
 def test_mode_workers_dynamic_smdataparallel_profiler(
@@ -141,8 +135,5 @@ def mode_allworkers_saveall(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
-)
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
diff --git a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
index f98d06108..884e7f6c2 100644
--- a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
+++ b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
@@ -38,9 +38,6 @@ def basic_test(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
-)
 def test_gpu(out_dir):
     basic_test(out_dir, "gpu")
 
@@ -69,9 +66,6 @@ def mode_allworkers(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
-)
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
 
@@ -98,9 +92,6 @@ def mode_allworkers_saveall(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
-)
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
 
@@ -128,8 +119,5 @@ def mode_allworkers_default_collections(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
-)
 def test_gpu_allworkers_default_collections(out_dir):
     mode_allworkers_default_collections(out_dir, "gpu")

From fac30efb29cd347c58cb480cc9f95ccdb496ce38 Mon Sep 17 00:00:00 2001
From: ndodda-amazon
Date: Thu, 18 Feb 2021 14:16:51 -0800
Subject: [PATCH 2/6] debug sdp tests

---
 config/tests.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/config/tests.sh b/config/tests.sh
index 310c04f4a..f25b9858e 100644
--- a/config/tests.sh
+++ b/config/tests.sh
@@ -26,11 +26,13 @@ run_for_framework() {
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_pytorch_integration.py
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_pytorch_multiprocessing.py
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_training_with_no_grad_updates.py
+            python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
         elif [ "$1" = "tensorflow" ] ; then
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_tensorflow_integration.py
         elif [ "$1" = "tensorflow2" ] ; then
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_tensorflow2_gradtape_integration.py
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_tensorflow2_integration.py
+            python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
         fi
 
     else

From 81de6b47019cec29afe4930bbb373f3f4811f2f2 Mon Sep 17 00:00:00 2001
From: ndodda-amazon
Date: Thu, 18 Feb 2021 15:35:38 -0800
Subject: [PATCH 3/6] skip if not on gpu image

---
 .../pytorch/test_smdataparallel.py          |  8 +++++++-
 .../tensorflow2/test_tf2_smdataparallel.py  | 17 ++++++++++++++++-
 .../smdataparallel_tests/utils.py           | 13 +++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
index a25d25dbc..af65c96cf 100644
--- a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
+++ b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
@@ -10,7 +10,10 @@
     SMDATAPARALLEL_PYTORCH_TEST_MNIST_ARGS,
     SMDATAPARALLEL_PYTORCH_TEST_MNIST_SCRIPT,
 )
-from tests.zero_code_change.smdataparallel_tests.utils import launch_smdataparallel_job
+from tests.zero_code_change.smdataparallel_tests.utils import (
+    is_gpu_available,
+    launch_smdataparallel_job,
+)
 from tests.zero_code_change.utils import build_json
 from torch.cuda import device_count
 
@@ -44,6 +47,7 @@ def mode_allworkers(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
 
 
+@pytest.mark.skipif(is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly.")
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
 
 
@@ -57,6 +61,7 @@ def smdataparallel_profiler_config_path(config_folder, monkeypatch):
     os.remove(config_path)
 
 
+@pytest.mark.skipif(is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly.")
 @pytest.mark.parametrize("mode", ["gpu"])
 @pytest.mark.parametrize("worker_function", [mode_allworkers])
 def test_mode_workers_dynamic_smdataparallel_profiler(
@@ -135,5 +140,6 @@ def mode_allworkers_saveall(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
 
 
+@pytest.mark.skipif(is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly.")
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
diff --git a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
index 884e7f6c2..7b22220f2 100644
--- a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
+++ b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
@@ -4,7 +4,10 @@
 from tests.zero_code_change.smdataparallel_tests.constants import (
     SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT,
 )
-from tests.zero_code_change.smdataparallel_tests.utils import launch_smdataparallel_job
+from tests.zero_code_change.smdataparallel_tests.utils import (
+    is_gpu_available,
+    launch_smdataparallel_job,
+)
 from tests.zero_code_change.tf_utils import get_available_gpus
 from tests.zero_code_change.utils import build_json
 
@@ -38,6 +41,9 @@ def basic_test(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
+@pytest.mark.skipif(
+    is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
+)
 def test_gpu(out_dir):
     basic_test(out_dir, "gpu")
 
@@ -66,6 +72,9 @@ def mode_allworkers(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
+@pytest.mark.skipif(
+    is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
+)
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
 
@@ -92,6 +101,9 @@ def mode_allworkers_saveall(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
+@pytest.mark.skipif(
+    is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
+)
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
 
@@ -119,5 +131,8 @@ def mode_allworkers_default_collections(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
+@pytest.mark.skipif(
+    is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
+)
 def test_gpu_allworkers_default_collections(out_dir):
     mode_allworkers_default_collections(out_dir, "gpu")
diff --git a/tests/zero_code_change/smdataparallel_tests/utils.py b/tests/zero_code_change/smdataparallel_tests/utils.py
index e436c6ed2..8d0ab3323 100644
--- a/tests/zero_code_change/smdataparallel_tests/utils.py
+++ b/tests/zero_code_change/smdataparallel_tests/utils.py
@@ -10,3 +10,16 @@ def launch_smdataparallel_job(script_file_path, script_args, num_workers, config
     env_dict["SMDEBUG_CONFIG_FILE_PATH"] = f"{config_file_path}"
     env_dict["PYTHONPATH"] = "/home/ubuntu/sagemaker-debugger/"
     subprocess.check_call(command, env=env_dict)
+
+
+def is_gpu_available(framework):
+    if framework == "tensorflow2":
+        import tensorflow as tf
+
+        return tf.config.list_physical_devices("GPU") > 0
+    elif framework == "pytorch":
+        import torch
+
+        return torch.cuda.is_available()
+    else:
+        raise Exception("Invalid framework passed in.")

From b5c2de39cbe81fd21771a4723bd1f5a001cf0657 Mon Sep 17 00:00:00 2001
From: ndodda-amazon
Date: Thu, 18 Feb 2021 16:00:45 -0800
Subject: [PATCH 4/6] fix tests

---
 .../pytorch/test_smdataparallel.py          | 12 +++++++++---
 .../tensorflow2/test_tf2_smdataparallel.py  |  8 ++++----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
index af65c96cf..5f0b9a378 100644
--- a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
+++ b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
@@ -47,7 +47,9 @@ def mode_allworkers(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
 
 
-@pytest.mark.skipif(is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly.")
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
+)
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
 
@@ -61,7 +63,9 @@ def smdataparallel_profiler_config_path(config_folder, monkeypatch):
     os.remove(config_path)
 
 
-@pytest.mark.skipif(is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly.")
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
+)
 @pytest.mark.parametrize("mode", ["gpu"])
 @pytest.mark.parametrize("worker_function", [mode_allworkers])
 def test_mode_workers_dynamic_smdataparallel_profiler(
@@ -140,6 +144,8 @@ def mode_allworkers_saveall(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
 
 
-@pytest.mark.skipif(is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly.")
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
+)
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
diff --git a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
index 7b22220f2..a8fbf0497 100644
--- a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
+++ b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
@@ -42,7 +42,7 @@ def basic_test(out_dir, mode):
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
 @pytest.mark.skipif(
-    is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu(out_dir):
     basic_test(out_dir, "gpu")
@@ -73,7 +73,7 @@ def mode_allworkers(out_dir, mode):
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
 @pytest.mark.skipif(
-    is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
@@ -102,7 +102,7 @@ def mode_allworkers_saveall(out_dir, mode):
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
 @pytest.mark.skipif(
-    is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
@@ -132,7 +132,7 @@ def mode_allworkers_default_collections(out_dir, mode):
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
 @pytest.mark.skipif(
-    is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers_default_collections(out_dir):
     mode_allworkers_default_collections(out_dir, "gpu")

From 5a472c54817cf88120b7e8941e3f32aff5123179 Mon Sep 17 00:00:00 2001
From: ndodda-amazon
Date: Thu, 18 Feb 2021 16:01:37 -0800
Subject: [PATCH 5/6] fix tests

---
 tests/zero_code_change/smdataparallel_tests/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/zero_code_change/smdataparallel_tests/utils.py b/tests/zero_code_change/smdataparallel_tests/utils.py
index 8d0ab3323..53a0f08a6 100644
--- a/tests/zero_code_change/smdataparallel_tests/utils.py
+++ b/tests/zero_code_change/smdataparallel_tests/utils.py
@@ -16,7 +16,7 @@ def is_gpu_available(framework):
     if framework == "tensorflow2":
         import tensorflow as tf
 
-        return tf.config.list_physical_devices("GPU") > 0
+        return len(tf.config.list_physical_devices("GPU")) > 0
     elif framework == "pytorch":
         import torch
 

From 8d72560c05f7a3440d770145b4703acc8b0bee28 Mon Sep 17 00:00:00 2001
From: ndodda-amazon
Date: Thu, 18 Feb 2021 21:39:57 -0800
Subject: [PATCH 6/6] retrigger CI after increasing build timeout
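
Context for the two fixes in this series: PATCH 4/6 adds the missing `not` so the GPU tests are skipped when GPUs are absent rather than present, and PATCH 5/6 wraps the TensorFlow device list in `len()`, because `tf.config.list_physical_devices("GPU")` returns a list and comparing a list to an int raises a TypeError on Python 3. Below is a minimal, framework-free sketch of both behaviors; it is an illustration, not code from the repository, and the `devices` lists stand in for the result of `tf.config.list_physical_devices("GPU")`.

def is_gpu_available_buggy(devices):
    # PATCH 3/6 version: comparing a list to an int raises
    # "TypeError: '>' not supported between instances of 'list' and 'int'".
    return devices > 0


def is_gpu_available_fixed(devices):
    # PATCH 5/6 version: count the visible devices instead.
    return len(devices) > 0


no_gpus = []          # stand-in for list_physical_devices on a CPU-only host
one_gpu = ["GPU:0"]   # stand-in for a host with one visible GPU

try:
    is_gpu_available_buggy(no_gpus)
except TypeError as err:
    print(f"buggy check fails: {err}")

# PATCH 4/6 flips the skip condition: skipif(not is_gpu_available(...))
# skips a test only when no GPU is visible.
print(is_gpu_available_fixed(no_gpus))  # False -> test is skipped
print(is_gpu_available_fixed(one_gpu))  # True  -> test runs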