
Commit faa3383

karan6181 and jeet4320 authored
[tensorflow, pytorch][build][sagemaker] Updated smdataparallel binary to support EFA (#1075)
* [tensorflow, pytorch][build][sagemaker] Updated smdataparallel binary to support EFA

Co-authored-by: Jeetendra Patil <[email protected]>
1 parent ec861ef commit faa3383

6 files changed: +15 -15 lines changed
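For context: the smdistributed_dataparallel wheels updated below back SageMaker distributed data-parallel training, which uses EFA for inter-node communication on supported instance types. A minimal, illustrative sketch of a training job that would exercise the updated binary follows; the entry point, role, and image URI are placeholders, not values from this commit, and only the distribution schema is the SageMaker Python SDK's documented one:

from sagemaker.pytorch import PyTorch

# Placeholder values throughout; adjust to your account/region/image.
estimator = PyTorch(
    entry_point="train.py",
    role="SageMakerRole",
    image_uri="<account>.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111",
    instance_count=2,
    instance_type="ml.p4d.24xlarge",  # EFA-capable instance type
    # Enable SageMaker distributed data parallel (smdistributed dataparallel).
    distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
)
estimator.fit()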

pytorch/training/docker/1.8/py3/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pytho
     sagemaker-experiments==0.* \
     "sagemaker-pytorch-training<3" \
     psutil==5.6.7 \
-    Pillow==7.1.0 \
+    Pillow==8.2.0 \
     && pip uninstall -y torch \
     && pip install --no-cache-dir -U ${PT_TRAINING_URL} \
     && pip uninstall -y torchvision \

pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ ENV MANUAL_BUILD=0
 ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
 ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
 ARG SMD_MODEL_PARALLEL_URL=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/pytorch-1.8/build-artifacts/2021-03-26-22-01/smdistributed_modelparallel-1.3.1-cp36-cp36m-linux_x86_64.whl
-ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-16/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
+ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-28/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
 
 RUN apt-get update \
     && apt-get install -y --allow-change-held-packages --no-install-recommends \

tensorflow/training/docker/2.4/py3/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     # python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4 to avoid conflict with latest awscli

tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ ARG ESTIMATOR_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/est
 # the nightly builds. Therefore, while updating the smdebug version, please ensure that the format is not disturbed.
 ARG SMDEBUG_VERSION=1.0.8
 
-ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-12/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl
+ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-28/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl
 
 ARG SMMODELPARALLEL_BINARY=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/tensorflow-2.4/build-artifacts/2021-03-26-21-57/smdistributed_modelparallel-1.3.1-cp37-cp37m-linux_x86_64.whl
 

@@ -193,7 +193,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     # python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4 to avoid conflict with latest awscli

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py

Lines changed: 4 additions & 4 deletions
@@ -146,21 +146,21 @@ def test_mnist_gpu(sagemaker_session, framework_version, ecr_image, dist_gpu_bac
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 @pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_pt_mnist.py", 8)])
-def test_smmodelparallel_mnist_multigpu_multinode(ecr_image, instance_type, py_version, sagemaker_session, tmpdir, test_script, num_processes):
+def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance_type, py_version, n_virginia_sagemaker_session, tmpdir, test_script, num_processes):
     """
     Tests pt mnist command via script mode
     """
     instance_type = "ml.p3.16xlarge"
-    validate_or_skip_smmodelparallel(ecr_image)
+    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(
             entry_point=test_script,
             role='SageMakerRole',
-            image_uri=ecr_image,
+            image_uri=n_virginia_ecr_image,
             source_dir=mnist_path,
             instance_count=2,
             instance_type=instance_type,
-            sagemaker_session=sagemaker_session,
+            sagemaker_session=n_virginia_sagemaker_session,
             hyperparameters = {"assert-losses": 1, "amp": 1, "ddp": 1, "data-dir": "data/training", "epochs": 5},
             distribution={
                 "smdistributed": {

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smdataparallel.py

Lines changed: 6 additions & 6 deletions
@@ -140,14 +140,14 @@ def test_smdataparallel_mnist(n_virginia_sagemaker_session, framework_version, n
 @pytest.mark.integration("smdataparallel_smmodelparallel")
 @pytest.mark.model("mnist")
 @pytest.mark.parametrize('instance_types', ["ml.p3.16xlarge"])
-def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_version, sagemaker_session, tmpdir):
+def test_smmodelparallel_smdataparallel_mnist(instance_types, n_virginia_ecr_image, py_version, n_virginia_sagemaker_session, tmpdir):
     """
     Tests SM Distributed DataParallel and ModelParallel single-node via script mode
     This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
     TODO: Consider reworking these tests after re:Invent releases are done
     """
-    can_run_modelparallel = can_run_smmodelparallel(ecr_image)
-    can_run_dataparallel = can_run_smdataparallel(ecr_image)
+    can_run_modelparallel = can_run_smmodelparallel(n_virginia_ecr_image)
+    can_run_dataparallel = can_run_smdataparallel(n_virginia_ecr_image)
     if can_run_dataparallel and can_run_modelparallel:
         entry_point = 'smdataparallel_smmodelparallel_mnist_script_mode.sh'
     elif can_run_dataparallel:

@@ -160,12 +160,12 @@ def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_vers
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=entry_point,
                           role='SageMakerRole',
-                          image_uri=ecr_image,
+                          image_uri=n_virginia_ecr_image,
                           source_dir=mnist_path,
                           instance_count=1,
                           instance_type=instance_types,
-                          sagemaker_session=sagemaker_session)
+                          sagemaker_session=n_virginia_sagemaker_session)
 
-        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name, pytorch)
+        pytorch = _disable_sm_profiler(n_virginia_sagemaker_session.boto_region_name, pytorch)
 
         pytorch.fit()
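For reference, _disable_sm_profiler is a helper defined elsewhere in this test module; a plausible sketch of its behavior is below, with the region list an outright assumption:

# Hypothetical reconstruction -- the actual helper in the repo may differ.
PROFILER_UNSUPPORTED_REGIONS = {"cn-north-1", "cn-northwest-1"}  # assumed

def _disable_sm_profiler(region, estimator):
    # Turn off SageMaker Profiler when the target region does not
    # support it, then hand the estimator back to the caller.
    if region in PROFILER_UNSUPPORTED_REGIONS:
        estimator.disable_profiler = True
    return estimator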
