awslabs · ndodda-amazon · Feb 17, 2021 · Feb 18, 2021 · Feb 18, 2021 · Feb 19, 2021
diff --git a/config/tests.sh b/config/tests.sh
@@ -26,11 +26,13 @@ run_for_framework() {
         python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append}  tests/zero_code_change/test_pytorch_integration.py
         python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append}  tests/zero_code_change/test_pytorch_multiprocessing.py
         python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append}  tests/zero_code_change/test_training_with_no_grad_updates.py
+        python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append}  tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
       elif [ "$1" = "tensorflow" ] ; then
         python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append}  tests/zero_code_change/test_tensorflow_integration.py
       elif [ "$1" = "tensorflow2" ] ; then
         python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append}  tests/zero_code_change/test_tensorflow2_gradtape_integration.py
         python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append}  tests/zero_code_change/test_tensorflow2_integration.py
+        python -m pytest ${code_coverage_smdebug:+--cov=./ --cov-append}  tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
       fi
 
     else

diff --git a/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/pytorch/test_smdataparallel.py
@@ -10,7 +10,10 @@
     SMDATAPARALLEL_PYTORCH_TEST_MNIST_ARGS,
     SMDATAPARALLEL_PYTORCH_TEST_MNIST_SCRIPT,
 )
-from tests.zero_code_change.smdataparallel_tests.utils import launch_smdataparallel_job
+from tests.zero_code_change.smdataparallel_tests.utils import (
+    is_gpu_available,
+    launch_smdataparallel_job,
+)
 from tests.zero_code_change.utils import build_json
 from torch.cuda import device_count
 
@@ -44,8 +47,8 @@ def mode_allworkers(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
@@ -60,8 +63,8 @@ def smdataparallel_profiler_config_path(config_folder, monkeypatch):
         os.remove(config_path)
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
 )
 @pytest.mark.parametrize("mode", ["gpu"])
 @pytest.mark.parametrize("worker_function", [mode_allworkers])
@@ -141,8 +144,8 @@ def mode_allworkers_saveall(out_dir, mode):
     assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
 
 
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("pytorch"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
diff --git a/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py b/tests/zero_code_change/smdataparallel_tests/tensorflow2/test_tf2_smdataparallel.py
@@ -4,7 +4,10 @@
 from tests.zero_code_change.smdataparallel_tests.constants import (
     SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT,
 )
-from tests.zero_code_change.smdataparallel_tests.utils import launch_smdataparallel_job
+from tests.zero_code_change.smdataparallel_tests.utils import (
+    is_gpu_available,
+    launch_smdataparallel_job,
+)
 from tests.zero_code_change.tf_utils import get_available_gpus
 from tests.zero_code_change.utils import build_json
 
@@ -38,8 +41,8 @@ def basic_test(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu(out_dir):
     basic_test(out_dir, "gpu")
@@ -69,8 +72,8 @@ def mode_allworkers(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers(out_dir):
     mode_allworkers(out_dir, "gpu")
@@ -98,8 +101,8 @@ def mode_allworkers_saveall(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers_saveall(out_dir):
     mode_allworkers_saveall(out_dir, "gpu")
@@ -128,8 +131,8 @@ def mode_allworkers_default_collections(out_dir, mode):
     tf.__version__ < "2.3.0",
     reason="smdistributed.dataparallel supports TF version 2.3.0 and above",
 )
-@pytest.mark.skip(
-    reason="Requires SMDataParallel docker image which is private as of now. It would be available in general DLC sometime in mid of November 2020"
+@pytest.mark.skipif(
+    not is_gpu_available("tensorflow2"), reason="This test needs GPUs to run correctly."
 )
 def test_gpu_allworkers_default_collections(out_dir):
     mode_allworkers_default_collections(out_dir, "gpu")
diff --git a/tests/zero_code_change/smdataparallel_tests/utils.py b/tests/zero_code_change/smdataparallel_tests/utils.py
@@ -10,3 +10,16 @@ def launch_smdataparallel_job(script_file_path, script_args, num_workers, config
     env_dict["SMDEBUG_CONFIG_FILE_PATH"] = f"{config_file_path}"
     env_dict["PYTHONPATH"] = "/home/ubuntu/sagemaker-debugger/"
     subprocess.check_call(command, env=env_dict)
+
+
+def is_gpu_available(framework):
+    """Return True if `framework` ("tensorflow2" or "pytorch") reports a usable GPU."""
+    # Imports are local so that only the requested framework has to be importable.
+    if framework == "tensorflow2":
+        import tensorflow as tf
+        return bool(tf.config.list_physical_devices("GPU"))
+    if framework == "pytorch":
+        import torch
+        return torch.cuda.is_available()
+    # ValueError subclasses Exception, so callers catching Exception are unaffected.
+    raise ValueError(f"Invalid framework passed in: {framework!r}")