
Commit faa3383

karan6181 and jeet4320 authored
[tensorflow, pytorch][build][sagemaker] Updated smdataparallel binary to support EFA (#1075)
* [tensorflow, pytorch][build][sagemaker] Updated smdataparallel binary to support EFA

Co-authored-by: Jeetendra Patil <[email protected]>
1 parent ec861ef commit faa3383

6 files changed: +15 -15 lines changed
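For context: the smdistributed_dataparallel wheels updated below back SageMaker distributed data-parallel training, which uses EFA for inter-node communication on supported instance types. A minimal, illustrative sketch of a training job that would exercise the updated binary follows; the entry point, role, and image URI are placeholders, not values from this commit, and only the distribution schema is the SageMaker Python SDK's documented one:

from sagemaker.pytorch import PyTorch

# Placeholder values throughout; adjust to your account/region/image.
estimator = PyTorch(
    entry_point="train.py",
    role="SageMakerRole",
    image_uri="<account>.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111",
    instance_count=2,
    instance_type="ml.p4d.24xlarge",  # EFA-capable instance type
    # Enable SageMaker distributed data parallel (smdistributed dataparallel).
    distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
)
estimator.fit()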

pytorch/training/docker/1.8/py3/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pytho
     sagemaker-experiments==0.* \
     "sagemaker-pytorch-training<3" \
     psutil==5.6.7 \
-    Pillow==7.1.0 \
+    Pillow==8.2.0 \
     && pip uninstall -y torch \
     && pip install --no-cache-dir -U ${PT_TRAINING_URL} \
     && pip uninstall -y torchvision \

pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ ENV MANUAL_BUILD=0
 ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
 ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
 ARG SMD_MODEL_PARALLEL_URL=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/pytorch-1.8/build-artifacts/2021-03-26-22-01/smdistributed_modelparallel-1.3.1-cp36-cp36m-linux_x86_64.whl
-ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-16/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
+ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-28/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
 
 RUN apt-get update \
     && apt-get install -y --allow-change-held-packages --no-install-recommends \

tensorflow/training/docker/2.4/py3/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     # python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4 to avoid conflict with latest awscli

tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ ARG ESTIMATOR_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/est
 # the nightly builds. Therefore, while updating the smdebug version, please ensure that the format is not disturbed.
 ARG SMDEBUG_VERSION=1.0.8
 
-ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-12/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl
+ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-28/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl
 
 ARG SMMODELPARALLEL_BINARY=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/tensorflow-2.4/build-artifacts/2021-03-26-21-57/smdistributed_modelparallel-1.3.1-cp37-cp37m-linux_x86_64.whl
 

@@ -193,7 +193,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     # python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4 to avoid conflict with latest awscli

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py

Lines changed: 4 additions & 4 deletions
@@ -146,21 +146,21 @@ def test_mnist_gpu(sagemaker_session, framework_version, ecr_image, dist_gpu_bac
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 @pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_pt_mnist.py", 8)])
-def test_smmodelparallel_mnist_multigpu_multinode(ecr_image, instance_type, py_version, sagemaker_session, tmpdir, test_script, num_processes):
+def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance_type, py_version, n_virginia_sagemaker_session, tmpdir, test_script, num_processes):
     """
     Tests pt mnist command via script mode
     """
     instance_type = "ml.p3.16xlarge"
-    validate_or_skip_smmodelparallel(ecr_image)
+    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(
             entry_point=test_script,
             role='SageMakerRole',
-            image_uri=ecr_image,
+            image_uri=n_virginia_ecr_image,
             source_dir=mnist_path,
             instance_count=2,
             instance_type=instance_type,
-            sagemaker_session=sagemaker_session,
+            sagemaker_session=n_virginia_sagemaker_session,
             hyperparameters = {"assert-losses": 1, "amp": 1, "ddp": 1, "data-dir": "data/training", "epochs": 5},
             distribution={
                 "smdistributed": {

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smdataparallel.py

Lines changed: 6 additions & 6 deletions
@@ -140,14 +140,14 @@ def test_smdataparallel_mnist(n_virginia_sagemaker_session, framework_version, n
 @pytest.mark.integration("smdataparallel_smmodelparallel")
 @pytest.mark.model("mnist")
 @pytest.mark.parametrize('instance_types', ["ml.p3.16xlarge"])
-def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_version, sagemaker_session, tmpdir):
+def test_smmodelparallel_smdataparallel_mnist(instance_types, n_virginia_ecr_image, py_version, n_virginia_sagemaker_session, tmpdir):
     """
     Tests SM Distributed DataParallel and ModelParallel single-node via script mode
     This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
     TODO: Consider reworking these tests after re:Invent releases are done
     """
-    can_run_modelparallel = can_run_smmodelparallel(ecr_image)
-    can_run_dataparallel = can_run_smdataparallel(ecr_image)
+    can_run_modelparallel = can_run_smmodelparallel(n_virginia_ecr_image)
+    can_run_dataparallel = can_run_smdataparallel(n_virginia_ecr_image)
     if can_run_dataparallel and can_run_modelparallel:
         entry_point = 'smdataparallel_smmodelparallel_mnist_script_mode.sh'
     elif can_run_dataparallel:

@@ -160,12 +160,12 @@ def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_vers
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=entry_point,
                           role='SageMakerRole',
-                          image_uri=ecr_image,
+                          image_uri=n_virginia_ecr_image,
                           source_dir=mnist_path,
                           instance_count=1,
                           instance_type=instance_types,
-                          sagemaker_session=sagemaker_session)
+                          sagemaker_session=n_virginia_sagemaker_session)
 
-        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name, pytorch)
+        pytorch = _disable_sm_profiler(n_virginia_sagemaker_session.boto_region_name, pytorch)
 
         pytorch.fit()
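For reference, _disable_sm_profiler is a helper defined elsewhere in this test module; a plausible sketch of its behavior is below, with the region list an outright assumption:

# Hypothetical reconstruction -- the actual helper in the repo may differ.
PROFILER_UNSUPPORTED_REGIONS = {"cn-north-1", "cn-northwest-1"}  # assumed

def _disable_sm_profiler(region, estimator):
    # Turn off SageMaker Profiler when the target region does not
    # support it, then hand the estimator back to the caller.
    if region in PROFILER_UNSUPPORTED_REGIONS:
        estimator.disable_profiler = True
    return estimator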
