Add SageMaker integration tests (#13)

nadiaya · web-flow · commit 109f7e21f42d · 2018-04-20T22:45:59.000-07:00
* Fix single host training test.

* Update test configuration.

* Add integration tests: initial commit.

* Move common logic into one place.

* Rename test estimator file.

* Add GPU model trained in non-distributed mode.

* Add TODO about moving timeout to testing utils in container support or SDK.

* Do not specify entrypoint in local tests.
diff --git a/lib/sagemaker-container-support b/lib/sagemaker-container-support
@@ -1 +1 @@
-Subproject commit 284ddd72da84feff3f39c9a34272dac91a9a1a2a
+Subproject commit afcce5dce498e84da2c534a0f3b517562de4ea5c
diff --git a/test/conftest.py b/test/conftest.py
@@ -10,6 +10,7 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
+import boto3
 import os
 import logging
 import platform
@@ -18,7 +19,7 @@
 import sys
 import tempfile
 
-
+from sagemaker import Session
 from test.utils import local_mode
 
 logger = logging.getLogger(__name__)
@@ -36,6 +37,8 @@
 def pytest_addoption(parser):
     parser.addoption('--build-image', '-D', action="store_true")
     parser.addoption('--build-base-image', '-B', action="store_true")
+    parser.addoption('--aws-id')
+    parser.addoption('--instance-type')
     parser.addoption('--install-container-support', '-C', action="store_true")
     parser.addoption('--docker-base-name', default='pytorch')
     parser.addoption('--region', default='us-west-2')
@@ -46,40 +49,40 @@ def pytest_addoption(parser):
     parser.addoption('--tag', default=None)
 
 
-@pytest.fixture(scope='session')
-def docker_base_name(request):
+@pytest.fixture(scope='session', name='docker_base_name')
+def fixture_docker_base_name(request):
     return request.config.getoption('--docker-base-name')
 
 
-@pytest.fixture(scope='session')
-def region(request):
+@pytest.fixture(scope='session', name='region')
+def fixture_region(request):
     return request.config.getoption('--region')
 
 
-@pytest.fixture(scope='session')
-def framework_version(request):
+@pytest.fixture(scope='session', name='framework_version')
+def fixture_framework_version(request):
     return request.config.getoption('--framework-version')
 
 
-@pytest.fixture(scope='session')
-def py_version(request):
+@pytest.fixture(scope='session', name='py_version')
+def fixture_py_version(request):
     return 'py{}'.format(int(request.config.getoption('--py-version')))
 
 
-@pytest.fixture(scope='session')
-def processor(request):
+@pytest.fixture(scope='session', name='processor')
+def fixture_processor(request):
     return request.config.getoption('--processor')
 
 
-@pytest.fixture(scope='session')
-def tag(request, framework_version, processor, py_version):
+@pytest.fixture(scope='session', name='tag')
+def fixture_tag(request, framework_version, processor, py_version):
     provided_tag = request.config.getoption('--tag')
     default_tag = '{}-{}-{}'.format(framework_version, processor, py_version)
     return provided_tag if provided_tag else default_tag
 
 
-@pytest.fixture(scope='session')
-def docker_image(docker_base_name, tag):
+@pytest.fixture(scope='session', name='docker_image')
+def fixture_docker_image(docker_base_name, tag):
     return '{}:{}'.format(docker_base_name, tag)
 
 
@@ -96,20 +99,20 @@ def opt_ml():
     shutil.rmtree(tmp, True)
 
 
-@pytest.fixture(scope='session')
-def use_gpu(processor):
+@pytest.fixture(scope='session', name='use_gpu')
+def fixture_use_gpu(processor):
     return processor == 'gpu'
 
 
-@pytest.fixture(scope='session', autouse=True)
-def install_container_support(request):
+@pytest.fixture(scope='session', name='install_container_support', autouse=True)
+def fixture_install_container_support(request):
     install = request.config.getoption('--install-container-support')
     if install:
         local_mode.install_container_support()
 
 
-@pytest.fixture(scope='session', autouse=True)
-def build_base_image(request, framework_version, py_version, processor, tag, docker_base_name):
+@pytest.fixture(scope='session', name='build_base_image', autouse=True)
+def fixture_build_base_image(request, framework_version, py_version, processor, tag, docker_base_name):
     build_base_image = request.config.getoption('--build-base-image')
     if build_base_image:
         return local_mode.build_base_image(framework_name=docker_base_name,
@@ -122,8 +125,8 @@ def build_base_image(request, framework_version, py_version, processor, tag, doc
     return tag
 
 
-@pytest.fixture(scope='session', autouse=True)
-def build_image(request, framework_version, py_version, processor, tag, docker_base_name):
+@pytest.fixture(scope='session', name='build_image', autouse=True)
+def fixture_build_image(request, framework_version, py_version, processor, tag, docker_base_name):
     build_image = request.config.getoption('--build-image')
     if build_image:
         return local_mode.build_image(framework_name=docker_base_name,
@@ -134,3 +137,38 @@ def build_image(request, framework_version, py_version, processor, tag, docker_b
                                       cwd=os.path.join(dir_path, '..'))
 
     return tag
+
+
+@pytest.fixture(scope='session', name='sagemaker_session')
+def fixture_sagemaker_session(region):
+    return Session(boto_session=boto3.Session(region_name=region))
+
+
+@pytest.fixture(name='aws_id', scope='session')
+def fixture_aws_id(request):
+    return request.config.getoption('--aws-id')
+
+
+@pytest.fixture(name='instance_type', scope='session')
+def fixture_instance_type(request):
+    return request.config.getoption('--instance-type')
+
+
+@pytest.fixture(name='docker_registry', scope='session')
+def fixture_docker_registry(aws_id, region):
+    return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region)
+
+
+@pytest.fixture(name='ecr_image', scope='session')
+def fixture_ecr_image(docker_registry, docker_base_name, tag):
+    return '{}/{}:{}'.format(docker_registry, docker_base_name, tag)
+
+
+@pytest.fixture(scope='session', name='dist_cpu_backend', params=['tcp', 'gloo'])
+def fixture_dist_cpu_backend(request):
+    return request.param
+
+
+@pytest.fixture(scope='session', name='dist_gpu_backend', params=['gloo'])
+def fixture_dist_gpu_backend(request):
+    return request.param
diff --git a/test/integration/__init__.py b/test/integration/__init__.py
@@ -10,3 +10,17 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
+import os
+
+resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources'))
+mnist_path = os.path.join(resources_path, 'mnist')
+mnist_script = os.path.join(mnist_path, 'mnist.py')
+data_dir = os.path.join(mnist_path, 'data')
+training_dir = os.path.join(data_dir, 'training')
+dist_operations_path = os.path.join(resources_path, 'distributed_operations.py')
+
+mnist_1d_script = os.path.join(mnist_path, 'mnist_1d.py')
+model_cpu_dir = os.path.join(mnist_path, 'model_cpu')
+model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d')
+model_gpu_dir = os.path.join(mnist_path, 'model_gpu')
+model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d')
diff --git a/test/integration/local/test_distributed_training.py b/test/integration/local/test_distributed_training.py
@@ -10,34 +10,14 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
-import os
 import pytest
 import torch
+from test.integration import data_dir, dist_operations_path, mnist_script
 from test.utils import local_mode
 
-resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'resources'))
-mnist_path = os.path.join(resources_path, 'mnist')
-mnist_script = os.path.join(mnist_path, 'mnist.py')
-data_dir = os.path.join(mnist_path, 'data')
-training_dir = os.path.join(data_dir, 'training')
-dist_operations_path = os.path.join(resources_path, 'distributed_operations.py')
-
-ENTRYPOINT = ["python", "-m", "pytorch_container.start"]
-
-
-@pytest.fixture(scope='session', name='dist_cpu_backend', params=['tcp', 'gloo'])
-def fixture_dist_cpu_backend(request):
-    return request.param
-
-
-@pytest.fixture(scope='session', name='dist_gpu_backend', params=['gloo'])
-def fixture_dist_gpu_backend(request):
-    return request.param
-
 
 def test_dist_operations_path_cpu(docker_image, opt_ml, dist_cpu_backend):
-    local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml,
-                     entrypoint=ENTRYPOINT, cluster_size=3,
+    local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml, cluster_size=3,
                      hyperparameters={'backend': dist_cpu_backend})
 
     assert local_mode.file_exists(opt_ml, 'model/success'), 'Script success file was not created'
@@ -47,8 +27,7 @@ def test_dist_operations_path_cpu(docker_image, opt_ml, dist_cpu_backend):
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
 def test_dist_operations_path_gpu(docker_image, opt_ml, dist_gpu_backend):
-    local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml,
-                     entrypoint=ENTRYPOINT, cluster_size=3,
+    local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml, cluster_size=3,
                      use_gpu=True, hyperparameters={'backend': dist_gpu_backend})
 
     assert local_mode.file_exists(opt_ml, 'model/success'), 'Script success file was not created'
@@ -57,8 +36,7 @@ def test_dist_operations_path_gpu(docker_image, opt_ml, dist_gpu_backend):
 
 
 def test_mnist_cpu(docker_image, opt_ml, dist_cpu_backend):
-    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
-                     entrypoint=ENTRYPOINT, cluster_size=2,
+    local_mode.train(mnist_script, data_dir, docker_image, opt_ml, cluster_size=2,
                      hyperparameters={'backend': dist_cpu_backend})
 
     assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'
@@ -68,8 +46,7 @@ def test_mnist_cpu(docker_image, opt_ml, dist_cpu_backend):
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
 def test_mnist_gpu(docker_image, opt_ml, dist_gpu_backend):
-    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
-                     entrypoint=ENTRYPOINT, cluster_size=2,
+    local_mode.train(mnist_script, data_dir, docker_image, opt_ml, cluster_size=2,
                      use_gpu=True, hyperparameters={'backend': dist_gpu_backend})
 
     assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'
diff --git a/test/integration/local/test_serving.py b/test/integration/local/test_serving.py
@@ -10,50 +10,34 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
-import os
 import json
 from six import StringIO, BytesIO
 import pytest
 import requests
 from test.utils import local_mode
 import torch
-import logging
 import torch.utils.data
 import torch.utils.data.distributed
 from torchvision import datasets, transforms
 import numpy as np
 from container_support.serving import JSON_CONTENT_TYPE, CSV_CONTENT_TYPE, NPY_CONTENT_TYPE
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-
-mnist_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'resources', 'mnist')
-mnist_script = os.path.join(mnist_path, 'mnist.py')
-mnist_1d_script = os.path.join(mnist_path, 'mnist_1d.py')
-model_cpu_dir = os.path.join(mnist_path, 'model_cpu')
-model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d')
-model_gpu_dir = os.path.join(mnist_path, 'model_gpu')
-model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d')
-
-data_dir = os.path.join(mnist_path, 'data')
-training_dir = os.path.join(data_dir, 'training')
-
-ENTRYPOINT = ["python", "-m", "pytorch_container.start"]
+from test.integration import training_dir, mnist_script, mnist_1d_script, model_cpu_dir, model_gpu_dir, \
+    model_cpu_1d_dir
 
 
 @pytest.fixture(name='serve_cpu')
 def fixture_serve_cpu(docker_image, opt_ml):
     def serve(model_dir=model_cpu_dir, script=mnist_script):
         return local_mode.serve(customer_script=script, model_dir=model_dir, image_name=docker_image,
-                                opt_ml=opt_ml, entrypoint=ENTRYPOINT)
+                                opt_ml=opt_ml)
     return serve
 
 
 @pytest.fixture(name='serve_gpu')
 def fixture_serve_gpu(docker_image, opt_ml):
-    def serve(model_dir=model_cpu_dir, script=mnist_script):
+    def serve(model_dir=model_gpu_dir, script=mnist_script):
         return local_mode.serve(customer_script=script, model_dir=model_dir, image_name=docker_image,
-                                use_gpu=True, opt_ml=opt_ml, entrypoint=ENTRYPOINT)
+                                use_gpu=True, opt_ml=opt_ml)
     return serve
 
 
@@ -119,7 +103,6 @@ def _assert_prediction_csv(test_loader, accept):
 
 
 def _get_test_data_loader(batch_size):
-    logger.info('training dir: {}'.format(os.listdir(training_dir)))
     return torch.utils.data.DataLoader(
         datasets.MNIST(training_dir, train=False, transform=transforms.Compose([
             transforms.ToTensor(),
diff --git a/test/integration/local/test_single_machine_training.py b/test/integration/local/test_single_machine_training.py
@@ -10,26 +10,12 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
-import os
 from test.utils import local_mode
-import logging
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-
-mnist_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources', 'mnist'))
-data_dir = os.path.join(mnist_path, 'data')
-
-training_dir = os.path.join(data_dir, 'training')
-
-mnist_script = 'mnist.py'
-
-ENTRYPOINT = ["python", "-m", "pytorch_container.start"]
+from test.integration import data_dir, mnist_script
 
 
 def test_mnist_cpu(docker_image, opt_ml, use_gpu):
-    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
-                     source_dir=mnist_path, use_gpu=use_gpu, entrypoint=ENTRYPOINT)
+    local_mode.train(mnist_script, data_dir, docker_image, opt_ml, use_gpu=use_gpu)
 
     assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'
     assert local_mode.file_exists(opt_ml, 'output/success'), 'Success file was not created'
diff --git a/test/integration/sagemaker/__init__.py b/test/integration/sagemaker/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
diff --git a/test/integration/sagemaker/conftest.py b/test/integration/sagemaker/conftest.py
@@ -0,0 +1,19 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def skip_by_device_type(request, tag):
+    if (request.node.get_marker('skip_gpu') and 'gpu' in tag) or (request.node.get_marker('skip_cpu') and 'cpu' in tag):
+        pytest.skip('Skipping because tag is: {}'.format(tag))
diff --git a/test/integration/sagemaker/estimator.py b/test/integration/sagemaker/estimator.py
@@ -0,0 +1,27 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from sagemaker.pytorch.estimator import PyTorch
+
+
+class PytorchTestEstimator(PyTorch):
+    def __init__(self, docker_image_uri, **kwargs):
+        super(PytorchTestEstimator, self).__init__(**kwargs)
+        self.docker_image_uri = docker_image_uri
+
+    def train_image(self):
+        return self.docker_image_uri
+
+    def create_model(self):
+        model = super(PytorchTestEstimator, self).create_model()
+        model.image = self.docker_image_uri
+        return model
diff --git a/test/integration/sagemaker/test_distributed_operations.py b/test/integration/sagemaker/test_distributed_operations.py
diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py
diff --git a/test/integration/sagemaker/timeout.py b/test/integration/sagemaker/timeout.py
diff --git a/test/resources/mnist/model_gpu/model b/test/resources/mnist/model_gpu/model