
Commit c4c2a8a

ChaiBapchya authored and ChoiByungWook committed
feature: add data parallelism support (#11) (#12) (#13)
1 parent 925fcd1 commit c4c2a8a

File tree

9 files changed, +365 -1 lines changed


buildspec.yml

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '1.4.0'
+    FRAMEWORK_VERSION: '1.6.0'
     CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
     ECR_REPO: 'sagemaker-test'

src/sagemaker_pytorch_container/training.py

Lines changed: 8 additions & 0 deletions

@@ -20,6 +20,7 @@
 from sagemaker_training import entry_point, environment, errors, runner
 
 MASTER_PORT = '7777'
+LAUNCH_SMDATAPARALLEL_ENV_NAME = 'sagemaker_distributed_dataparallel_enabled'
 
 logger = logging.getLogger(__name__)
 
@@ -50,8 +51,15 @@ def train(training_environment):
 
     mpi_enabled = training_environment.additional_framework_parameters.get('sagemaker_mpi_enabled')
 
+    smdataparallel_enabled = training_environment.additional_framework_parameters.get(
+        LAUNCH_SMDATAPARALLEL_ENV_NAME, False
+    )
+
     if mpi_enabled:
         runner_type = runner.MPIRunnerType
+    elif smdataparallel_enabled:
+        runner_type = runner.SMDataParallelRunnerType
+        logger.info('Invoking SMDataParallel')
     else:
         runner_type = runner.ProcessRunnerType
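
A minimal sketch (not part of the commit) of the runner-selection precedence the hunk above introduces: sagemaker_mpi_enabled still takes priority, the new SMDataParallel flag comes next, and the plain process runner remains the default. Plain strings stand in for the sagemaker_training runner types.

def select_runner(additional_framework_parameters):
    # Mirrors the precedence in train(): MPI > SMDataParallel > process runner.
    if additional_framework_parameters.get('sagemaker_mpi_enabled'):
        return 'MPIRunnerType'
    if additional_framework_parameters.get('sagemaker_distributed_dataparallel_enabled', False):
        return 'SMDataParallelRunnerType'
    return 'ProcessRunnerType'

assert select_runner({}) == 'ProcessRunnerType'
assert select_runner({'sagemaker_distributed_dataparallel_enabled': True}) == 'SMDataParallelRunnerType'
assert select_runner({'sagemaker_mpi_enabled': True,
                      'sagemaker_distributed_dataparallel_enabled': True}) == 'MPIRunnerType'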

Lines changed: 10 additions & 0 deletions

ARG region
from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.6.0-cpu-py36-ubuntu16.04

COPY lib/changehostname.c /
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh

COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_training.tar.gz && \
    rm /sagemaker_pytorch_training.tar.gz
Lines changed: 28 additions & 0 deletions

ARG region
from 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04

# TODO: Remove once the 1.6.0-gpu-py3 DLC image installs mpi4py
RUN pip3 install mpi4py==3.0.3

# TODO: Remove once the 1.6.0-gpu-py3 DLC image fixes OpenSSH config
# Configure OpenSSH so that nodes can communicate with each other
RUN mkdir -p /var/run/sshd && \
    sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd

RUN rm -rf /root/.ssh/ && \
    mkdir -p /root/.ssh/ && \
    ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
    && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

# TODO: Remove once the 1.6.0-gpu-py3 DLC image fixes MPI config
# Comment line in MPI config to prevent mutually exclusive MCA settings
RUN sed -i '62,62 s/^/#/' /home/.openmpi/etc/openmpi-mca-params.conf

COPY lib/changehostname.c /
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh

COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_training.tar.gz && \
    rm /sagemaker_pytorch_training.tar.gz
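
Both DLC-based Dockerfiles above take the AWS region as a build argument so that the base-image reference resolves to the regional deep-learning-container registry. A hedged example of how such an image might be built locally; the Dockerfile path, tag, and region are assumptions, not part of the commit, and pulling the base image first requires an ECR login for account 763104351884 in the chosen region.

# Assumed invocation; adjust the Dockerfile path and tag to your checkout.
docker build \
    --build-arg region=us-west-2 \
    -t sagemaker-pytorch-training:1.6.0-gpu-py36 \
    -f Dockerfile.dlc.gpu .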
Lines changed: 23 additions & 0 deletions

from pytorch/pytorch:1.6.0-cuda10.1-cudnn7-runtime

RUN apt-get update && apt-get install -y --no-install-recommends \
    jq \
    build-essential \
    cmake \
    gcc
RUN rm -rf /var/lib/apt/lists/*

COPY lib/changehostname.c /
COPY lib/start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
RUN chmod +x /usr/local/bin/start_with_right_hostname.sh

COPY dist/sagemaker_pytorch_training-*.tar.gz /sagemaker_pytorch_training.tar.gz
RUN pip install --no-cache-dir /sagemaker_pytorch_training.tar.gz && \
    rm /sagemaker_pytorch_training.tar.gz

ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main

WORKDIR /

# Starts framework
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
Lines changed: 51 additions & 0 deletions

# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import json
import os
import tarfile

import pytest
from sagemaker.pytorch import PyTorch

from integration import resources_path
from utils.local_mode_utils import assert_files_exist


# TODO: Enable the test once the SMDataParallel DLC is publicly accessible
@pytest.mark.skip(reason="SMDataParallel DLC is not publicly accessible")
@pytest.mark.skip_cpu
@pytest.mark.skip_generic
def test_smdataparallel_training(sagemaker_local_session, image_uri, framework_version, tmpdir):
    output_path = 'file://' + str(tmpdir)

    estimator = PyTorch(
        entry_point=os.path.join(resources_path, 'mnist', 'smdataparallel_mnist.py'),
        role='SageMakerRole',
        train_instance_type="local_gpu",
        sagemaker_session=sagemaker_local_session,
        train_instance_count=1,
        image_name=image_uri,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={'sagemaker_distributed_dataparallel_enabled': True, "save-model": ""})

    success_files = {
        'model': ['mnist_cnn.pt'],
        'output': ['success'],
    }
    _train_and_assert_success(estimator, str(tmpdir), success_files)


def _train_and_assert_success(estimator, output_path, output_files):
    estimator.fit()
    assert_files_exist(output_path, output_files)
Lines changed: 52 additions & 0 deletions

# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import json
import os
import tarfile

import pytest
from sagemaker.pytorch import PyTorch

from integration import resources_path, DEFAULT_TIMEOUT
from integration.sagemaker.timeout import timeout


@pytest.mark.skip(reason="SMDataParallel DLC is not publicly accessible")
@pytest.mark.skip_cpu
@pytest.mark.skip_generic
@pytest.mark.parametrize(
    "instances, train_instance_type",
    [(1, "ml.p3.16xlarge"), (2, "ml.p3.16xlarge"), (1, "ml.p3dn.24xlarge"), (2, "ml.p3dn.24xlarge")],
)
def test_smdataparallel_training(instances, train_instance_type, sagemaker_session, image_uri, framework_version, tmpdir):
    default_bucket = sagemaker_session.default_bucket()
    output_path = "s3://" + os.path.join(default_bucket, "pytorch/smdataparallel")

    estimator = PyTorch(
        entry_point=os.path.join(resources_path, "mnist", "smdataparallel_mnist.py"),
        role="SageMakerRole",
        train_instance_type=train_instance_type,
        sagemaker_session=sagemaker_session,
        train_instance_count=instances,
        image_name=image_uri,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_distributed_dataparallel_enabled": True
        }
    )

    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator.fit()
Lines changed: 177 additions & 0 deletions

# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the License. A copy of the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and limitations under the License.

from __future__ import print_function
import argparse
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
import smdistributed.dataparallel.torch.distributed as dist
dist.init_process_group()


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0 and args.rank == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data) * args.world_size, len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
        if args.verbose:
            print('Batch', batch_idx, "from rank", args.rank)


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--verbose', action='store_true', default=False,
                        help='For displaying SMDataParallel-specific logs')
    parser.add_argument('--data-path', type=str, default='/tmp/data', help='Path for downloading '
                        'the MNIST dataset')

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print('Hello from rank', rank, 'of local_rank',
              local_rank, 'in world size of', args.world_size)

    if not torch.cuda.is_available():
        raise Exception("Must run SMDataParallel MNIST example on CUDA-capable devices.")

    torch.manual_seed(args.seed)

    device = torch.device("cuda")

    if local_rank == 0:
        train_dataset = datasets.MNIST(data_path, train=True, download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307,), (0.3081,))
                                       ]))
    else:
        time.sleep(8)
        train_dataset = datasets.MNIST(data_path, train=True, download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307,), (0.3081,))
                                       ]))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        sampler=train_sampler)
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(data_path, train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ])),
            batch_size=args.test_batch_size, shuffle=True)

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()
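
A worked illustration (not part of the commit) of the per-worker batch-size rescaling in main() above, which assumes 8 GPUs per node: the requested --batch-size is divided by world_size // 8, so the per-GPU batch shrinks as more nodes join while the per-node batch stays roughly constant.

def per_worker_batch_size(batch_size, world_size, gpus_per_node=8):
    # Same arithmetic as main(): batch_size //= world_size // gpus_per_node,
    # floored at 1. Assumes world_size >= gpus_per_node, as on p3.16xlarge nodes.
    return max(batch_size // (world_size // gpus_per_node), 1)

assert per_worker_batch_size(64, 8) == 64    # one 8-GPU node: unchanged
assert per_worker_batch_size(64, 16) == 32   # two 8-GPU nodes: halved
assert per_worker_batch_size(64, 32) == 16   # four 8-GPU nodes: quartered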
