diff --git a/config/tests.sh b/config/tests.sh
index 310c04f4a..f25b9858e 100644
--- a/config/tests.sh
+++ b/config/tests.sh
@@ -26,11 +26,13 @@ run_for_framework() {
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_pytorch_integration.py
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_pytorch_multiprocessing.py
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_training_with_no_grad_updates.py
+            python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
         elif [ "$1" = "tensorflow" ] ; then
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_tensorflow_integration.py
         elif [ "$1" = "tensorflow2" ] ; then
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_tensorflow2_gradtape_integration.py
             python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/test_tensorflow2_integration.py
+            python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append} tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
         fi
     else
diff --git a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
index f950d0236..5f0b9a378 100644
--- a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
+++ b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
@@ -10,7 +10,10 @@
     SMDATAPARALLEL_PYTORCH_TEST_MNIST_ARGS,
     SMDATAPARALLEL_PYTORCH_TEST_MNIST_SCRIPT,
 )
-from tests.zero_code_change.smdataparallel_tests.utils import launch_smdataparallel_job
+from tests.zero_code_change.smdataparallel_tests.utils import (
+    is_gpu_available,
+    launch_smdataparallel_job,
+)
 from tests.zero_code_change.utils import build_json
 from torch.cuda import device_count
 
@@ -44,8 +47,8 @@ def mode_allworkers(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
@@ -60,8 +63,8 @@ def smdataparallel_profiler_config_path(config_folder, monkeypatch):
     os.remove(config_path)
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
 )
 @pytest.mark.parametrize("mode", ["gpu"])
 @pytest.mark.parametrize("worker_function", [mode_allworkers])
@@ -141,8 +144,8 @@ def mode_allworkers_saveall(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
diff --git a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
index f98d06108..a8fbf0497 100644
--- a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
+++ b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
@@ -4,7 +4,10 @@
 from tests.zero_code_change.smdataparallel_tests.constants import (
     SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT,
 )
-from tests.zero_code_change.smdataparallel_tests.utils import launch_smdataparallel_job
+from tests.zero_code_change.smdataparallel_tests.utils import (
+    is_gpu_available,
+    launch_smdataparallel_job,
+)
 from tests.zero_code_change.tf_utils import get_available_gpus
 from tests.zero_code_change.utils import build_json
 
@@ -38,8 +41,8 @@ def basic_test(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu(out_dir):
     basic_test(out_dir, "gpu")
@@ -69,8 +72,8 @@ def mode_allworkers(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
@@ -98,8 +101,8 @@ def mode_allworkers_saveall(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
@@ -128,8 +131,8 @@ def mode_allworkers_default_collections(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers_default_collections(out_dir):
     mode_allworkers_default_collections(out_dir, "gpu")
diff --git a/tests/zero_code_change/smdataparallel_tests/utils.py b/tests/zero_code_change/smdataparallel_tests/utils.py
index e436c6ed2..53a0f08a6 100644
--- a/tests/zero_code_change/smdataparallel_tests/utils.py
+++ b/tests/zero_code_change/smdataparallel_tests/utils.py
@@ -10,3 +10,16 @@ def launch_smdataparallel_job(script_file_path, script_args, num_workers, config
     env_dict["SMDEBUG_CONFIG_FILE_PATH"] = f"{config_file_path}"
     env_dict["PYTHONPATH"] = "/home/ubuntu/sagemaker-debugger/"
     subprocess.check_call(command, env=env_dict)
+
+
+def is_gpu_available(framework):
+    if framework == "tensorflow2":
+        import tensorflow as tf
+
+        return len(tf.config.list_physical_devices("GPU")) > 0
+    elif framework == "pytorch":
+        import torch
+
+        return torch.cuda.is_available()
+    else:
+        raise Exception("Invalid framework passed in.")