diff --git a/pytorch/training/docker/1.8/py3/Dockerfile.cpu b/pytorch/training/docker/1.8/py3/Dockerfile.cpu
index 3a0edaaab823..632353e1a5b6 100644
--- a/pytorch/training/docker/1.8/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/1.8/py3/Dockerfile.cpu
@@ -116,7 +116,7 @@ RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pytho
     sagemaker-experiments==0.* \
     "sagemaker-pytorch-training<3" \
     psutil==5.6.7 \
-    Pillow==7.1.0 \
+    Pillow==8.2.0 \
  && pip uninstall -y torch \
  && pip install --no-cache-dir -U ${PT_TRAINING_URL} \
  && pip uninstall -y torchvision \
diff --git a/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu b/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu
index 0bee946e341e..0df73b793e62 100644
--- a/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu
+++ b/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu
@@ -41,7 +41,7 @@ ENV MANUAL_BUILD=0
 ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
 ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
 ARG SMD_MODEL_PARALLEL_URL=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/pytorch-1.8/build-artifacts/2021-03-26-22-01/smdistributed_modelparallel-1.3.1-cp36-cp36m-linux_x86_64.whl
-ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-16/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
+ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-28/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
 
 RUN apt-get update \
  && apt-get install -y --allow-change-held-packages --no-install-recommends \
diff --git a/tensorflow/training/docker/2.4/py3/Dockerfile.cpu b/tensorflow/training/docker/2.4/py3/Dockerfile.cpu
index 31a398375fcd..ea2c473cd2dc 100644
--- a/tensorflow/training/docker/2.4/py3/Dockerfile.cpu
+++ b/tensorflow/training/docker/2.4/py3/Dockerfile.cpu
@@ -127,7 +127,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     # python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4 to avoid conflict with latest awscli
diff --git a/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu b/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu
index f9f021146cd9..2689d30dae04 100644
--- a/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu
+++ b/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu
@@ -39,7 +39,7 @@ ARG ESTIMATOR_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/est
 # the nightly builds. Therefore, while updating the smdebug version, please ensure that the format is not disturbed.
 ARG SMDEBUG_VERSION=1.0.8
-ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-12/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl
+ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-28/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl
 ARG SMMODELPARALLEL_BINARY=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/tensorflow-2.4/build-artifacts/2021-03-26-21-57/smdistributed_modelparallel-1.3.1-cp37-cp37m-linux_x86_64.whl
@@ -193,7 +193,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     # python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4 to avoid conflict with latest awscli
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
index 57896af272ee..16ea3fa41379 100644
--- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
+++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
@@ -146,21 +146,21 @@ def test_mnist_gpu(sagemaker_session, framework_version, ecr_image, dist_gpu_bac
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 @pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_pt_mnist.py", 8)])
-def test_smmodelparallel_mnist_multigpu_multinode(ecr_image, instance_type, py_version, sagemaker_session, tmpdir, test_script, num_processes):
+def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance_type, py_version, n_virginia_sagemaker_session, tmpdir, test_script, num_processes):
     """
     Tests pt mnist command via script mode
     """
     instance_type = "ml.p3.16xlarge"
-    validate_or_skip_smmodelparallel(ecr_image)
+    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(
             entry_point=test_script,
             role='SageMakerRole',
-            image_uri=ecr_image,
+            image_uri=n_virginia_ecr_image,
             source_dir=mnist_path,
             instance_count=2,
             instance_type=instance_type,
-            sagemaker_session=sagemaker_session,
+            sagemaker_session=n_virginia_sagemaker_session,
             hyperparameters = {"assert-losses": 1, "amp": 1, "ddp": 1, "data-dir": "data/training", "epochs": 5},
             distribution={
                 "smdistributed": {
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smdataparallel.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smdataparallel.py
index 42e9f6e7dc6a..4182b4cca8d1 100644
--- a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smdataparallel.py
+++ b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smdataparallel.py
@@ -140,14 +140,14 @@ def test_smdataparallel_mnist(n_virginia_sagemaker_session, framework_version, n
 @pytest.mark.integration("smdataparallel_smmodelparallel")
 @pytest.mark.model("mnist")
 @pytest.mark.parametrize('instance_types', ["ml.p3.16xlarge"])
-def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_version, sagemaker_session, tmpdir):
+def test_smmodelparallel_smdataparallel_mnist(instance_types, n_virginia_ecr_image, py_version, n_virginia_sagemaker_session, tmpdir):
     """
     Tests SM Distributed DataParallel and ModelParallel single-node via script mode
     This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
     TODO: Consider reworking these tests after re:Invent releases are done
     """
-    can_run_modelparallel = can_run_smmodelparallel(ecr_image)
-    can_run_dataparallel = can_run_smdataparallel(ecr_image)
+    can_run_modelparallel = can_run_smmodelparallel(n_virginia_ecr_image)
+    can_run_dataparallel = can_run_smdataparallel(n_virginia_ecr_image)
     if can_run_dataparallel and can_run_modelparallel:
         entry_point = 'smdataparallel_smmodelparallel_mnist_script_mode.sh'
     elif can_run_dataparallel:
@@ -160,12 +160,12 @@ def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_vers
     with timeout(minutes=DEFAULT_TIMEOUT):
         pytorch = PyTorch(entry_point=entry_point,
                           role='SageMakerRole',
-                          image_uri=ecr_image,
+                          image_uri=n_virginia_ecr_image,
                           source_dir=mnist_path,
                           instance_count=1,
                           instance_type=instance_types,
-                          sagemaker_session=sagemaker_session)
+                          sagemaker_session=n_virginia_sagemaker_session)
 
-        pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name, pytorch)
+        pytorch = _disable_sm_profiler(n_virginia_sagemaker_session.boto_region_name, pytorch)
 
         pytorch.fit()
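
Note: the test changes above swap the generic `ecr_image` / `sagemaker_session` fixtures for `n_virginia_ecr_image` / `n_virginia_sagemaker_session`, pinning these multi-GPU jobs to us-east-1. As a rough sketch only, the snippet below shows one way such region-pinned fixtures could be defined in `conftest.py`. The `--ecr-image` option name and the URI-rewriting logic are assumptions for illustration, not the repository's actual implementation.

```python
import boto3
import pytest
from sagemaker import Session


@pytest.fixture(scope="session")
def n_virginia_region():
    # Pin to us-east-1 (N. Virginia), where the smdistributed binaries and
    # test capacity are expected to be available.
    return "us-east-1"


@pytest.fixture(scope="session")
def n_virginia_sagemaker_session(n_virginia_region):
    # Build a SageMaker session bound to us-east-1, independent of whatever
    # region the test runner's default AWS profile points at.
    boto_session = boto3.Session(region_name=n_virginia_region)
    return Session(boto_session=boto_session)


@pytest.fixture
def n_virginia_ecr_image(request, n_virginia_region):
    # Hypothetical --ecr-image CLI option (registered via pytest_addoption)
    # holding a URI like <account>.dkr.ecr.<region>.amazonaws.com/<repo>:<tag>.
    # Rewrite the region segment so the job pulls the us-east-1 copy.
    image = request.config.getoption("--ecr-image")
    account, _, rest = image.partition(".dkr.ecr.")
    _, _, repo_and_tag = rest.partition(".amazonaws.com/")
    return f"{account}.dkr.ecr.{n_virginia_region}.amazonaws.com/{repo_and_tag}"
```

Defining the pinning once at the fixture layer keeps the test bodies unchanged apart from the parameter rename, which is exactly the shape of the diff above.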