Fix execution of EFA tests on mainline (#1083)

jeet4320 · web-flow · commit 80c6494bd78f · 2021-04-30T13:28:53.000-07:00
diff --git a/test/dlc_tests/eks/eks_manifest_templates/tensorflow/training/multi_node_gpu_training.yaml b/test/dlc_tests/eks/eks_manifest_templates/tensorflow/training/multi_node_gpu_training.yaml
@@ -31,6 +31,8 @@ spec:
               - -x
               - LD_LIBRARY_PATH
               - -x
+              - RDMAV_FORK_SAFE=1
+              - -x
               - PATH
               - -x
               - NCCL_SOCKET_IFNAME=eth0
diff --git a/test/sagemaker_tests/mxnet/training/conftest.py b/test/sagemaker_tests/mxnet/training/conftest.py
@@ -46,18 +46,7 @@ def pytest_addoption(parser):
     parser.addoption('--tag', default=None)
     parser.addoption('--generate-coverage-doc', default=False, action='store_true',
                      help='use this option to generate test coverage doc')
-    parser.addoption(
-        "--efa", action="store_true", default=False, help="Run only efa tests",
-    )
-
-def pytest_configure(config):
-    config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests")
-
-def pytest_runtest_setup(item):
-    if item.config.getoption("--efa"):
-        efa_tests = [mark for mark in item.iter_markers(name="efa")]
-        if not efa_tests:
-            pytest.skip("Skipping non-efa tests")
+
 
 def pytest_collection_modifyitems(session, config, items):
     if config.getoption("--generate-coverage-doc"):
diff --git a/test/sagemaker_tests/pytorch/training/conftest.py b/test/sagemaker_tests/pytorch/training/conftest.py
@@ -105,20 +105,6 @@ def pytest_addoption(parser):
     parser.addoption('--tag', default=None)
     parser.addoption('--generate-coverage-doc', default=False, action='store_true',
                      help='use this option to generate test coverage doc')
-    parser.addoption(
-        "--efa", action="store_true", default=False, help="Run only efa tests",
-    )
-
-
-def pytest_configure(config):
-    config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests")
-
-
-def pytest_runtest_setup(item):
-    if item.config.getoption("--efa"):
-        efa_tests = [mark for mark in item.iter_markers(name="efa")]
-        if not efa_tests:
-            pytest.skip("Skipping non-efa tests")
 
 
 def pytest_collection_modifyitems(session, config, items):
diff --git a/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow1_training/integration/conftest.py
@@ -42,18 +42,7 @@ def pytest_addoption(parser):
     parser.addoption('--instance-type', default=None)
     parser.addoption('--generate-coverage-doc', default=False, action='store_true',
                      help='use this option to generate test coverage doc')
-    parser.addoption(
-        "--efa", action="store_true", default=False, help="Run only efa tests",
-    )
 
-def pytest_configure(config):
-    config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests")
-
-def pytest_runtest_setup(item):
-    if item.config.getoption("--efa"):
-        efa_tests = [mark for mark in item.iter_markers(name="efa")]
-        if not efa_tests:
-            pytest.skip("Skipping non-efa tests")
 
 def pytest_collection_modifyitems(session, config, items):
     if config.getoption("--generate-coverage-doc"):
diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/conftest.py
@@ -42,18 +42,7 @@ def pytest_addoption(parser):
     parser.addoption('--instance-type', default=None)
     parser.addoption('--generate-coverage-doc', default=False, action='store_true',
                      help='use this option to generate test coverage doc')
-    parser.addoption(
-        "--efa", action="store_true", default=False, help="Run only efa tests",
-    )
 
-def pytest_configure(config):
-    config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests")
-
-def pytest_runtest_setup(item):
-    if item.config.getoption("--efa"):
-        efa_tests = [mark for mark in item.iter_markers(name="efa")]
-        if not efa_tests:
-            pytest.skip("Skipping non-efa tests")
 
 def pytest_collection_modifyitems(session, config, items):
     if config.getoption("--generate-coverage-doc"):
diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py
@@ -139,10 +139,15 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type):
     local_test_report = os.path.join(UBUNTU_HOME_DIR, "test", f"{job_type}_{tag}_sm_local.xml")
     is_py3 = " python3 -m "
 
+    efa_flag = ""
+    if job_type == "training" and (framework_major_version == "tensorflow" or framework == "pytorch"):
+        efa_dedicated = os.getenv("EFA_DEDICATED", "False").lower() == "true"
+        efa_flag = '--efa' if efa_dedicated else '-m not efa'
+
     remote_pytest_cmd = (
         f"pytest -rA {integration_path} --region {region} --processor {processor} {docker_base_arg} "
         f"{sm_remote_docker_base_name} --tag {tag} {framework_version_arg} {framework_version} "
-        f"{aws_id_arg} {account_id} {instance_type_arg} {instance_type} --junitxml {test_report}"
+        f"{aws_id_arg} {account_id} {instance_type_arg} {instance_type} {efa_flag} --junitxml {test_report}"
     )
 
     if processor == "eia" :